Add support for Xen blkif indirect segment I/Os. This makes it possible for

the blkfront driver to perform I/Os of up to 2 MB, subject to support from
the blkback to which it is connected and the initiation of such large I/Os
by the rest of the kernel.  In practice, the I/O size is increased from 40 kB
to 128 kB.

The changes to xen/interface/io/blkif.h consist merely of merging updates
from the upstream Xen repository.

In dev/xen/blkfront/block.h we add some convenience macros and structure
fields used for indirect-page I/Os: The device records its negotiated limit
on the number of indirect pages used, while each I/O command structure gains
permanently allocated page(s) for indirect page references and the Xen grant
references for those pages.

In dev/xen/blkfront/blkfront.c we now check in xbd_queue_cb whether a request
is small enough to handle without an indirection page, and either follow the
previous behaviour or use new code for issuing an indirect segment I/O.  In
xbd_connect we read the size of indirect segment I/Os supported by the backend
and select the maximum size we will use; then allocate the pages and Xen grant
references for each I/O command structure.  In xbd_free those grants and pages
are released.

A new loader tunable, hw.xbd.xbd_enable_indirect, can be set to 0 in order to
disable this functionality; it works by pretending that the backend does not
support this feature.  Some backends exhibit a loss of performance with large
I/Os, so users may wish to test with and without this functionality enabled.

Reviewed by:	royger
MFC after:	3 days
Relnotes:	yes
This commit is contained in:
Colin Percival 2015-07-30 03:50:01 +00:00
parent 8b15f615e0
commit aaebf69062
3 changed files with 271 additions and 34 deletions

View File

@ -84,6 +84,11 @@ static void xbd_startio(struct xbd_softc *sc);
/*---------------------------- Global Static Data ----------------------------*/
static MALLOC_DEFINE(M_XENBLOCKFRONT, "xbd", "Xen Block Front driver data");
static int xbd_enable_indirect = 1;
SYSCTL_NODE(_hw, OID_AUTO, xbd, CTLFLAG_RD, 0, "xbd driver parameters");
SYSCTL_INT(_hw_xbd, OID_AUTO, xbd_enable_indirect, CTLFLAG_RDTUN,
&xbd_enable_indirect, 0, "Enable xbd indirect segments");
/*---------------------------- Command Processing ----------------------------*/
static void
xbd_freeze(struct xbd_softc *sc, xbd_flag_t xbd_flag)
@ -205,7 +210,6 @@ xbd_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
struct xbd_softc *sc;
struct xbd_command *cm;
blkif_request_t *ring_req;
int op;
cm = arg;
@ -218,22 +222,47 @@ xbd_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
return;
}
KASSERT(nsegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST,
KASSERT(nsegs <= sc->xbd_max_request_segments,
("Too many segments in a blkfront I/O"));
/* Fill out a communications ring structure. */
ring_req = RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt);
sc->xbd_ring.req_prod_pvt++;
ring_req->id = cm->cm_id;
ring_req->operation = cm->cm_operation;
ring_req->sector_number = cm->cm_sector_number;
ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk;
ring_req->nr_segments = nsegs;
cm->cm_nseg = nsegs;
xbd_mksegarray(segs, nsegs, &cm->cm_gref_head,
xenbus_get_otherend_id(sc->xbd_dev),
cm->cm_operation == BLKIF_OP_WRITE,
cm->cm_sg_refs, ring_req->seg);
if (nsegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST) {
blkif_request_t *ring_req;
/* Fill out a blkif_request_t structure. */
ring_req = (blkif_request_t *)
RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt);
sc->xbd_ring.req_prod_pvt++;
ring_req->id = cm->cm_id;
ring_req->operation = cm->cm_operation;
ring_req->sector_number = cm->cm_sector_number;
ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk;
ring_req->nr_segments = nsegs;
cm->cm_nseg = nsegs;
xbd_mksegarray(segs, nsegs, &cm->cm_gref_head,
xenbus_get_otherend_id(sc->xbd_dev),
cm->cm_operation == BLKIF_OP_WRITE,
cm->cm_sg_refs, ring_req->seg);
} else {
blkif_request_indirect_t *ring_req;
/* Fill out a blkif_request_indirect_t structure. */
ring_req = (blkif_request_indirect_t *)
RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt);
sc->xbd_ring.req_prod_pvt++;
ring_req->id = cm->cm_id;
ring_req->operation = BLKIF_OP_INDIRECT;
ring_req->indirect_op = cm->cm_operation;
ring_req->sector_number = cm->cm_sector_number;
ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk;
ring_req->nr_segments = nsegs;
cm->cm_nseg = nsegs;
xbd_mksegarray(segs, nsegs, &cm->cm_gref_head,
xenbus_get_otherend_id(sc->xbd_dev),
cm->cm_operation == BLKIF_OP_WRITE,
cm->cm_sg_refs, cm->cm_indirectionpages);
memcpy(ring_req->indirect_grefs, &cm->cm_indirectionrefs,
sizeof(grant_ref_t) * sc->xbd_max_request_indirectpages);
}
if (cm->cm_operation == BLKIF_OP_READ)
op = BUS_DMASYNC_PREREAD;
@ -1015,6 +1044,16 @@ xbd_free(struct xbd_softc *sc)
cm->cm_sg_refs = NULL;
}
if (cm->cm_indirectionpages != NULL) {
gnttab_end_foreign_access_references(
sc->xbd_max_request_indirectpages,
&cm->cm_indirectionrefs[0]);
contigfree(cm->cm_indirectionpages, PAGE_SIZE *
sc->xbd_max_request_indirectpages,
M_XENBLOCKFRONT);
cm->cm_indirectionpages = NULL;
}
bus_dmamap_destroy(sc->xbd_io_dmat, cm->cm_map);
}
free(sc->xbd_shadow, M_XENBLOCKFRONT);
@ -1051,9 +1090,6 @@ xbd_initialize(struct xbd_softc *sc)
*/
max_ring_page_order = 0;
sc->xbd_ring_pages = 1;
sc->xbd_max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
sc->xbd_max_request_size =
XBD_SEGS_TO_SIZE(sc->xbd_max_request_segments);
/*
* Protocol negotiation.
@ -1167,7 +1203,7 @@ xbd_connect(struct xbd_softc *sc)
unsigned long sectors, sector_size;
unsigned int binfo;
int err, feature_barrier, feature_flush;
int i;
int i, j;
if (sc->xbd_state == XBD_STATE_CONNECTED ||
sc->xbd_state == XBD_STATE_SUSPENDED)
@ -1198,6 +1234,22 @@ xbd_connect(struct xbd_softc *sc)
if (err == 0 && feature_flush != 0)
sc->xbd_flags |= XBDF_FLUSH;
err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
"feature-max-indirect-segments", "%" PRIu32,
&sc->xbd_max_request_segments, NULL);
if ((err != 0) || (xbd_enable_indirect == 0))
sc->xbd_max_request_segments = 0;
if (sc->xbd_max_request_segments > XBD_MAX_INDIRECT_SEGMENTS)
sc->xbd_max_request_segments = XBD_MAX_INDIRECT_SEGMENTS;
if (sc->xbd_max_request_segments > XBD_SIZE_TO_SEGS(MAXPHYS))
sc->xbd_max_request_segments = XBD_SIZE_TO_SEGS(MAXPHYS);
sc->xbd_max_request_indirectpages =
XBD_INDIRECT_SEGS_TO_PAGES(sc->xbd_max_request_segments);
if (sc->xbd_max_request_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST)
sc->xbd_max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
sc->xbd_max_request_size =
XBD_SEGS_TO_SIZE(sc->xbd_max_request_segments);
/* Allocate datastructures based on negotiated values. */
err = bus_dma_tag_create(
bus_get_dma_tag(sc->xbd_dev), /* parent */
@ -1230,6 +1282,7 @@ xbd_connect(struct xbd_softc *sc)
for (i = 0; i < sc->xbd_max_requests; i++) {
struct xbd_command *cm;
void * indirectpages;
cm = &sc->xbd_shadow[i];
cm->cm_sg_refs = malloc(
@ -1242,6 +1295,24 @@ xbd_connect(struct xbd_softc *sc)
cm->cm_sc = sc;
if (bus_dmamap_create(sc->xbd_io_dmat, 0, &cm->cm_map) != 0)
break;
if (sc->xbd_max_request_indirectpages > 0) {
indirectpages = contigmalloc(
PAGE_SIZE * sc->xbd_max_request_indirectpages,
M_XENBLOCKFRONT, M_ZERO, 0, ~0, PAGE_SIZE, 0);
} else {
indirectpages = NULL;
}
for (j = 0; j < sc->xbd_max_request_indirectpages; j++) {
if (gnttab_grant_foreign_access(
xenbus_get_otherend_id(sc->xbd_dev),
(vtomach(indirectpages) >> PAGE_SHIFT) + j,
1 /* grant read-only access */,
&cm->cm_indirectionrefs[j]))
break;
}
if (j < sc->xbd_max_request_indirectpages)
break;
cm->cm_indirectionpages = indirectpages;
xbd_free_command(cm);
}

View File

@ -75,11 +75,25 @@
__CONST_RING_SIZE(blkif, PAGE_SIZE * XBD_MAX_RING_PAGES)
/**
* The maximum mapped region size per request we will allow in a negotiated
* block-front/back communication channel.
* The maximum number of blkif segments which can be provided per indirect
* page in an indirect request.
*/
#define XBD_MAX_REQUEST_SIZE \
MIN(MAXPHYS, XBD_SEGS_TO_SIZE(BLKIF_MAX_SEGMENTS_PER_REQUEST))
#define XBD_MAX_SEGMENTS_PER_PAGE \
(PAGE_SIZE / sizeof(struct blkif_request_segment))
/**
* The maximum number of blkif segments which can be provided in an indirect
* request.
*/
#define XBD_MAX_INDIRECT_SEGMENTS \
(BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST * XBD_MAX_SEGMENTS_PER_PAGE)
/**
* Compute the number of indirect segment pages required for an I/O with the
* specified number of indirect segments.
*/
#define XBD_INDIRECT_SEGS_TO_PAGES(segs) \
((segs + XBD_MAX_SEGMENTS_PER_PAGE - 1) / XBD_MAX_SEGMENTS_PER_PAGE)
typedef enum {
XBDCF_Q_MASK = 0xFF,
@ -111,6 +125,8 @@ struct xbd_command {
blkif_sector_t cm_sector_number;
int cm_status;
xbd_cbcf_t *cm_complete;
void *cm_indirectionpages;
grant_ref_t cm_indirectionrefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
};
typedef enum {
@ -165,6 +181,7 @@ struct xbd_softc {
uint32_t xbd_max_requests;
uint32_t xbd_max_request_segments;
uint32_t xbd_max_request_size;
uint32_t xbd_max_request_indirectpages;
grant_ref_t xbd_ring_ref[XBD_MAX_RING_PAGES];
blkif_front_ring_t xbd_ring;
xen_intr_handle_t xen_intr_handle;

View File

@ -97,6 +97,28 @@
*
* The type of the backing device/object.
*
*
* direct-io-safe
* Values: 0/1 (boolean)
* Default Value: 0
*
* The underlying storage is not affected by the direct IO memory
* lifetime bug. See:
* http://lists.xen.org/archives/html/xen-devel/2012-12/msg01154.html
*
* Therefore this option gives the backend permission to use
* O_DIRECT, notwithstanding that bug.
*
* That is, if this option is enabled, use of O_DIRECT is safe,
* in circumstances where we would normally have avoided it as a
* workaround for that bug. This option is not relevant for all
* backends, and even not necessarily supported for those for
* which it is relevant. A backend which knows that it is not
* affected by the bug can ignore this option.
*
* This option doesn't require a backend to use O_DIRECT, so it
* should not be used to try to control the caching behaviour.
*
*--------------------------------- Features ---------------------------------
*
* feature-barrier
@ -126,6 +148,34 @@
* of this type may still be returned at any time with the
* BLKIF_RSP_EOPNOTSUPP result code.
*
* feature-persistent
* Values: 0/1 (boolean)
* Default Value: 0
* Notes: 7
*
* A value of "1" indicates that the backend can keep the grants used
* by the frontend driver mapped, so the same set of grants should be
* used in all transactions. The maximum number of grants the backend
* can map persistently depends on the implementation, but ideally it
* should be RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST. Using this
* feature the backend doesn't need to unmap each grant, preventing
* costly TLB flushes. The backend driver should only map grants
* persistently if the frontend supports it. If a backend driver chooses
* to use the persistent protocol when the frontend doesn't support it,
* it will probably hit the maximum number of persistently mapped grants
* (due to the fact that the frontend won't be reusing the same grants),
* and fall back to non-persistent mode. Backend implementations may
* shrink or expand the number of persistently mapped grants without
* notifying the frontend depending on memory constraints (this might
* cause a performance degradation).
*
* If a backend driver wants to limit the maximum number of persistently
* mapped grants to a value less than RING_SIZE *
* BLKIF_MAX_SEGMENTS_PER_REQUEST a LRU strategy should be used to
* discard the grants that are less commonly used. Using a LRU in the
* backend driver paired with a LIFO queue in the frontend will
* allow us to have better performance in this scenario.
*
*----------------------- Request Transport Parameters ------------------------
*
* max-ring-page-order
@ -147,6 +197,16 @@
*
*------------------------- Backend Device Properties -------------------------
*
* discard-enable
* Values: 0/1 (boolean)
* Default Value: 1
*
* This optional property, set by the toolstack, instructs the backend
* to offer discard to the frontend. If the property is missing the
* backend should offer discard if the backing storage actually supports
* it. This optional property, set by the toolstack, requests that the
* backend offer, or not offer, discard to the frontend.
*
* discard-alignment
* Values: <uint32_t>
* Default Value: 0
@ -166,6 +226,7 @@
* discard-secure
* Values: 0/1 (boolean)
* Default Value: 0
* Notes: 10
*
* A value of "1" indicates that the backend can process BLKIF_OP_DISCARD
* requests with the BLKIF_DISCARD_SECURE flag set.
@ -180,13 +241,17 @@
* sector-size
* Values: <uint32_t>
*
* The size, in bytes, of the individually addressible data blocks
* on the backend device.
* The logical sector size, in bytes, of the backend device.
*
* physical-sector-size
* Values: <uint32_t>
*
* The physical sector size, in bytes, of the backend device.
*
* sectors
* Values: <uint64_t>
*
* The size of the backend device, expressed in units of its native
* The size of the backend device, expressed in units of its logical
* sector size ("sector-size").
*
*****************************************************************************
@ -243,6 +308,27 @@
* The size of the frontend allocated request ring buffer in units of
* machine pages. The value must be a power of 2.
*
* feature-persistent
* Values: 0/1 (boolean)
* Default Value: 0
* Notes: 7, 8, 9
*
* A value of "1" indicates that the frontend will reuse the same grants
* for all transactions, allowing the backend to map them with write
* access (even when it should be read-only). If the frontend hits the
* maximum number of allowed persistently mapped grants, it can fallback
* to non persistent mode. This will cause a performance degradation,
* since the the backend driver will still try to map those grants
* persistently. Since the persistent grants protocol is compatible with
* the previous protocol, a frontend driver can choose to work in
* persistent mode even when the backend doesn't support it.
*
* It is recommended that the frontend driver stores the persistently
* mapped grants in a LIFO queue, so a subset of all persistently mapped
* grants gets used commonly. This is done in case the backend driver
* decides to limit the maximum number of persistently mapped grants
* to a value less than RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST.
*
*------------------------- Virtual Device Properties -------------------------
*
* device-type
@ -262,17 +348,23 @@
* -----
* (1) Multi-page ring buffer scheme first developed in the Citrix XenServer
* PV drivers.
* (2) Multi-page ring buffer scheme first used in some Red Hat distributions
* (2) Multi-page ring buffer scheme first used in some RedHat distributions
* including a distribution deployed on certain nodes of the Amazon
* EC2 cluster.
* (3) Support for multi-page ring buffers was implemented independently,
* in slightly different forms, by both Citrix and Red Hat/Amazon.
* in slightly different forms, by both Citrix and RedHat/Amazon.
* For full interoperability, block front and backends should publish
* identical ring parameters, adjusted for unit differences, to the
* XenStore nodes used in both schemes.
* (4) Devices that support discard functionality may internally allocate
* space (discardable extents) in units that are larger than the
* exported logical block size.
* (4) Devices that support discard functionality may internally allocate space
* (discardable extents) in units that are larger than the exported logical
* block size. If the backing device has such discardable extents the
* backend should provide both discard-granularity and discard-alignment.
* Providing just one of the two may be considered an error by the frontend.
* Backends supporting discard should include discard-granularity and
* discard-alignment even if it supports discarding individual sectors.
* Frontends should assume discard-alignment == 0 and discard-granularity
* == sector size if these keys are missing.
* (5) The discard-alignment parameter allows a physical device to be
* partitioned into virtual devices that do not necessarily begin or
* end on a discardable extent boundary.
@ -280,6 +372,19 @@
* 'ring-ref' is used to communicate the grant reference for this
* page to the backend. When using a multi-page ring, the 'ring-ref'
* node is not created. Instead 'ring-ref0' - 'ring-refN' are used.
* (7) When using persistent grants data has to be copied from/to the page
* where the grant is currently mapped. The overhead of doing this copy
* however doesn't suppress the speed improvement of not having to unmap
* the grants.
* (8) The frontend driver has to allow the backend driver to map all grants
* with write access, even when they should be mapped read-only, since
* further requests may reuse these grants and require write permissions.
* (9) Linux implementation doesn't have a limit on the maximum number of
* grants that can be persistently mapped in the frontend driver, but
* due to the frontent driver implementation it should never be bigger
* than RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST.
*(10) The discard-secure property may be present and will be set to 1 if the
* backing device supports secure discard.
*/
/*
@ -403,6 +508,30 @@
*/
#define BLKIF_OP_DISCARD 5
/*
* Recognized if "feature-max-indirect-segments" in present in the backend
* xenbus info. The "feature-max-indirect-segments" node contains the maximum
* number of segments allowed by the backend per request. If the node is
* present, the frontend might use blkif_request_indirect structs in order to
* issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The
* maximum number of indirect segments is fixed by the backend, but the
* frontend can issue requests with any number of indirect segments as long as
* it's less than the number provided by the backend. The indirect_grefs field
* in blkif_request_indirect should be filled by the frontend with the
* grant references of the pages that are holding the indirect segments.
* These pages are filled with an array of blkif_request_segment that hold the
* information about the segments. The number of indirect pages to use is
* determined by the number of segments an indirect request contains. Every
* indirect page can contain a maximum of
* (PAGE_SIZE / sizeof(struct blkif_request_segment)) segments, so to
* calculate the number of indirect pages to use we have to do
* ceil(indirect_segments / (PAGE_SIZE / sizeof(struct blkif_request_segment))).
*
* If a backend does not recognize BLKIF_OP_INDIRECT, it should *not*
* create the "feature-max-indirect-segments" node!
*/
#define BLKIF_OP_INDIRECT 6
/*
* Maximum scatter/gather segments per request.
* This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
@ -410,12 +539,18 @@
*/
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
/*
* Maximum number of indirect pages to use per request.
*/
#define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8
/*
* NB. first_sect and last_sect in blkif_request_segment, as well as
* sector_number in blkif_request, are always expressed in 512-byte units.
* However they must be properly aligned to the real sector size of the
* physical disk, which is reported in the "sector-size" node in the backend
* xenbus info. Also the xenbus "sectors" node is expressed in 512-byte units.
* physical disk, which is reported in the "physical-sector-size" node in
* the backend xenbus info. Also the xenbus "sectors" node is expressed in
* 512-byte units.
*/
struct blkif_request_segment {
grant_ref_t gref; /* reference to I/O buffer frame */
@ -453,6 +588,20 @@ struct blkif_request_discard {
};
typedef struct blkif_request_discard blkif_request_discard_t;
struct blkif_request_indirect {
uint8_t operation; /* BLKIF_OP_INDIRECT */
uint8_t indirect_op; /* BLKIF_OP_{READ/WRITE} */
uint16_t nr_segments; /* number of segments */
uint64_t id; /* private guest value, echoed in resp */
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
blkif_vdev_t handle; /* same as for read/write requests */
grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
#ifdef __i386__
uint64_t pad; /* Make it 64 byte aligned on i386 */
#endif
};
typedef struct blkif_request_indirect blkif_request_indirect_t;
struct blkif_response {
uint64_t id; /* copied from request */
uint8_t operation; /* copied from request */
@ -484,7 +633,7 @@ DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
/*
* Local variables:
* mode: C
* c-set-style: "BSD"
* c-file-style: "BSD"
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil