xen: fix blkback pushing responses before releasing internal resources

Fix a problem where the blkback driver could run out of requests,
despite the fact that we allocate enough request and reqlist
structures to satisfy the maximum possible number of requests.

The problem was that we were sending responses back to the other
end (blkfront) before freeing resources. The Citrix Windows
driver is pretty aggressive about queueing, and would queue more I/O
to us immediately after we sent responses to it. We would run into
a resource shortage and stall out I/O until we freed resources.

It isn't clear whether the request shortage condition was an
indirect cause of the I/O hangs we've been seeing between Windows
with the Citrix PV drivers and FreeBSD's blkback, but the above
problem is certainly a bug.

Sponsored by: Spectra Logic
Submitted by: ken
Reviewed by: royger

dev/xen/blkback/blkback.c:
 - Break xbb_send_response() into two sub-functions,
   xbb_queue_response() and xbb_push_responses().
   Remove xbb_send_response(), because it is no longer
   used.

 - Adjust xbb_complete_reqlist() so that it calls the
   two new functions, and holds the mutex around both
   calls (see the sketch below).  The mutex ensures
   that another context can't come along and push
   responses before we've freed our resources.

 - Change xbb_release_reqlist() so that it requires
   the mutex to be held instead of acquiring the mutex
   itself.  Both callers could easily hold the mutex
   while calling it, and one really needs to hold the
   mutex during the call.

 - Add two new counters, accessible via sysctl
   variables.  The first one counts the number of
   I/Os that are queued and waiting to be pushed
   (reqs_queued_for_completion).  The second one
   (reqs_completed_with_error) counts the number of
   requests we've completed with an error status.
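
In outline, the reworked completion path in xbb_complete_reqlist()
now looks like this (a simplified sketch distilled from the diff
below; statistics and error paths omitted):

	/*
	 * xbb->lock is held across queueing, resource release, and
	 * the ring push, so another context cannot push responses to
	 * the front end before this end's resources are freed.
	 */
	mtx_lock(&xbb->lock);

	/* Place each response on the ring, but don't push yet. */
	STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links)
		xbb_queue_response(xbb, nreq, reqlist->status);

	/* Free our request and reqlist resources first... */
	xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);

	/* ...then push the queued responses and check for notify. */
	xbb_push_responses(xbb, &run_taskqueue, &notify);

	mtx_unlock(&xbb->lock);

	if (run_taskqueue)
		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
	if (notify)
		xen_intr_signal(xbb->xen_intr_handle);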
Author: Roger Pau Monné
Date:   2014-09-30 17:41:16 +00:00
Parent: 22c1633270
Commit: 59adbba20f

--- a/sys/dev/xen/blkback/blkback.c
+++ b/sys/dev/xen/blkback/blkback.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2011 Spectra Logic Corporation
+ * Copyright (c) 2009-2012 Spectra Logic Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -784,6 +784,12 @@ struct xbb_softc {
 	/** Number of requests we have completed*/
 	uint64_t		  reqs_completed;
 
+	/** Number of requests we queued but not pushed*/
+	uint64_t		  reqs_queued_for_completion;
+
+	/** Number of requests we completed with an error status*/
+	uint64_t		  reqs_completed_with_error;
+
 	/** How many forced dispatches (i.e. without coalescing) have happend */
 	uint64_t		  forced_dispatch;
 
@@ -1143,7 +1149,7 @@ xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
 		    int wakeup)
 {
 
-	mtx_lock(&xbb->lock);
+	mtx_assert(&xbb->lock, MA_OWNED);
 
 	if (wakeup) {
 		wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
@@ -1167,8 +1173,6 @@ xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
 		xbb_shutdown(xbb);
 	}
 
-	mtx_unlock(&xbb->lock);
-
 	if (wakeup != 0)
 		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
 }
@@ -1261,16 +1265,16 @@ bailout_error:
 	if (nreq != NULL)
 		xbb_release_req(xbb, nreq);
 
-	mtx_unlock(&xbb->lock);
-
 	if (nreqlist != NULL)
 		xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);
 
+	mtx_unlock(&xbb->lock);
+
 	return (1);
 }
 
 /**
- * Create and transmit a response to a blkif request.
+ * Create and queue a response to a blkif request.
  *
  * \param xbb     Per-instance xbb configuration structure.
  * \param req     The request structure to which to respond.
@@ -1278,20 +1282,28 @@ bailout_error:
  *                in sys/xen/interface/io/blkif.h.
  */
 static void
-xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
+xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
 {
 	blkif_response_t *resp;
-	int		  more_to_do;
-	int		  notify;
 
-	more_to_do = 0;
+	/*
+	 * The mutex is required here, and should be held across this call
+	 * until after the subsequent call to xbb_push_responses().  This
+	 * is to guarantee that another context won't queue responses and
+	 * push them while we're active.
+	 *
+	 * That could lead to the other end being notified of responses
+	 * before the resources have been freed on this end.  The other end
+	 * would then be able to queue additional I/O, and we may run out
+	 * of resources because we haven't freed them all yet.
+	 */
+	mtx_assert(&xbb->lock, MA_OWNED);
 
 	/*
 	 * Place on the response ring for the relevant domain.
 	 * For now, only the spacing between entries is different
 	 * in the different ABIs, not the response entry layout.
 	 */
-	mtx_lock(&xbb->lock);
 	switch (xbb->abi) {
 	case BLKIF_PROTOCOL_NATIVE:
 		resp = RING_GET_RESPONSE(&xbb->rings.native,
@@ -1315,8 +1327,38 @@ xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
 	resp->operation = req->operation;
 	resp->status = status;
 
+	if (status != BLKIF_RSP_OKAY)
+		xbb->reqs_completed_with_error++;
+
 	xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages);
-	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify);
+
+	xbb->reqs_queued_for_completion++;
+
+}
+
+/**
+ * Send queued responses to blkif requests.
+ *
+ * \param xbb            Per-instance xbb configuration structure.
+ * \param run_taskqueue  Flag that is set to 1 if the taskqueue
+ *                       should be run, 0 if it does not need to be run.
+ * \param notify         Flag that is set to 1 if the other end should be
+ *                       notified via irq, 0 if the other end should not be
+ *                       notified.
+ */
+static void
+xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify)
+{
+	int more_to_do;
+
+	/*
+	 * The mutex is required here.
+	 */
+	mtx_assert(&xbb->lock, MA_OWNED);
+
+	more_to_do = 0;
+
+	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify);
 
 	if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
@@ -1331,15 +1373,10 @@ xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
 		more_to_do = 1;
 	}
 
-	xbb->reqs_completed++;
+	xbb->reqs_completed += xbb->reqs_queued_for_completion;
+	xbb->reqs_queued_for_completion = 0;
 
-	mtx_unlock(&xbb->lock);
-
-	if (more_to_do)
-		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
-
-	if (notify)
-		xen_intr_signal(xbb->xen_intr_handle);
+	*run_taskqueue = more_to_do;
 }
 
 /**
@@ -1353,23 +1390,29 @@ xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
 {
 	struct xbb_xen_req *nreq;
 	off_t		    sectors_sent;
+	int		    notify, run_taskqueue;
 
 	sectors_sent = 0;
 
 	if (reqlist->flags & XBB_REQLIST_MAPPED)
 		xbb_unmap_reqlist(reqlist);
 
+	mtx_lock(&xbb->lock);
+
 	/*
-	 * All I/O is done, send the response.  A lock should not be
-	 * necessary here because the request list is complete, and
-	 * therefore this is the only context accessing this request
-	 * right now.  The functions we call do their own locking if
-	 * necessary.
+	 * All I/O is done, send the response. A lock is not necessary
+	 * to protect the request list, because all requests have
+	 * completed.  Therefore this is the only context accessing this
+	 * reqlist right now.  However, in order to make sure that no one
+	 * else queues responses onto the queue or pushes them to the other
+	 * side while we're active, we need to hold the lock across the
+	 * calls to xbb_queue_response() and xbb_push_responses().
 	 */
 	STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
 		off_t cur_sectors_sent;
 
-		xbb_send_response(xbb, nreq, reqlist->status);
+		/* Put this response on the ring, but don't push yet */
+		xbb_queue_response(xbb, nreq, reqlist->status);
 
 		/* We don't report bytes sent if there is an error. */
 		if (reqlist->status == BLKIF_RSP_OKAY)
@@ -1404,6 +1447,16 @@ xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
 			    /*then*/&reqlist->ds_t0);
 
 	xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);
+
+	xbb_push_responses(xbb, &run_taskqueue, &notify);
+
+	mtx_unlock(&xbb->lock);
+
+	if (run_taskqueue)
+		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
+
+	if (notify)
+		xen_intr_signal(xbb->xen_intr_handle);
 }
 
 /**
@@ -3588,6 +3641,16 @@ xbb_setup_sysctl(struct xbb_softc *xbb)
 			"reqs_completed", CTLFLAG_RW, &xbb->reqs_completed,
 			"how many I/O requests have been completed");
 
+	SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
+			"reqs_queued_for_completion", CTLFLAG_RW,
+			&xbb->reqs_queued_for_completion,
+			"how many I/O requests queued but not yet pushed");
+
+	SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
+			"reqs_completed_with_error", CTLFLAG_RW,
+			&xbb->reqs_completed_with_error,
+			"how many I/O requests completed with error status");
+
 	SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
 			"forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch,
 			"how many I/O dispatches were forced");