nvme: don't disconnect qpairs from admin thread.

Disconnecting qpairs from the admin thread during a reset led to an
inevitable race with the data thread. QP-related memory is freed during
the disconnect and cannot be touched from the other threads.

The only way to fix this is to force the qpair disconnect onto the
data thread.

This requires a small change in the way that resets are handled for
pcie. Please see the code in reset.c for that change.

fixes: bb01a089

Signed-off-by: Seth Howell <seth.howell@intel.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/472749 (master)

(cherry picked from commit 13f30a254e)
Change-Id: I8a39e444c7cbbe85fafca42ffd040e929721ce95
Signed-off-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/472960
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Seth Howell <seth.howell@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
This commit is contained in:
Seth Howell 2019-10-29 14:09:36 -07:00 committed by Tomasz Zawadzki
parent 1f737887cb
commit ec6de131f7
4 changed files with 21 additions and 6 deletions

View File

@ -1091,6 +1091,8 @@ struct spdk_nvme_qpair *spdk_nvme_ctrlr_alloc_io_qpair(struct spdk_nvme_ctrlr *c
* This function is intended to be called on qpairs that have already been connected,
* but have since entered a failed state as indicated by a return value of -ENXIO from
* either spdk_nvme_qpair_process_completions or one of the spdk_nvme_ns_cmd_* functions.
* This function must be called from the same thread as spdk_nvme_qpair_process_completions
* and the spdk_nvme_ns_cmd_* functions.
*
* \param qpair The qpair to reconnect.
*

View File

@ -412,13 +412,18 @@ spdk_nvme_ctrlr_reconnect_io_qpair(struct spdk_nvme_qpair *qpair)
goto out;
}
/* We have to confirm that any old memory is cleaned up. */
nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair);
rc = nvme_transport_ctrlr_connect_qpair(ctrlr, qpair);
if (rc) {
nvme_qpair_set_state(qpair, NVME_QPAIR_DISABLED);
qpair->transport_qp_is_failed = true;
rc = -EAGAIN;
goto out;
}
nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED);
qpair->transport_qp_is_failed = false;
out:
nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
@ -1073,7 +1078,7 @@ spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr)
/* Disable all queues before disabling the controller hardware. */
TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) {
nvme_qpair_set_state(qpair, NVME_QPAIR_DISABLED);
nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair);
qpair->transport_qp_is_failed = true;
}
nvme_qpair_set_state(ctrlr->adminq, NVME_QPAIR_DISABLED);
nvme_qpair_complete_error_reqs(ctrlr->adminq);
@ -1102,7 +1107,14 @@ spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr)
}
}
if (rc == 0) {
/*
* For PCIe controllers, the memory locations of the transport qpair
* don't change when the controller is reset. They simply need to be
* re-enabled with admin commands to the controller. For fabric
* controllers we need to disconnect and reconnect the qpair on its
* own thread outside of the context of the reset.
*/
if (rc == 0 && ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
/* Reinitialize qpairs */
TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) {
if (nvme_transport_ctrlr_connect_qpair(ctrlr, qpair) != 0) {
@ -1111,6 +1123,7 @@ spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr)
continue;
}
nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED);
qpair->transport_qp_is_failed = false;
}
}

View File

@ -458,7 +458,7 @@ spdk_nvme_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_
* qpair is not enabled, likely because a controller reset is
* in progress.
*/
return 0;
return -ENXIO;
}
/* error injection for those queued error requests */

View File

@ -243,18 +243,18 @@ static void test_nvme_qpair_process_completions(void)
ctrlr.is_removed = false;
ctrlr.is_resetting = true;
rc = spdk_nvme_qpair_process_completions(&qpair, 0);
CU_ASSERT(rc == 0);
CU_ASSERT(rc == -ENXIO);
CU_ASSERT(g_called_transport_process_completions == false);
/* We also need to make sure we didn't abort the requests. */
CU_ASSERT(!STAILQ_EMPTY(&qpair.queued_req));
CU_ASSERT(g_num_cb_passed == 0);
CU_ASSERT(g_num_cb_failed == 0);
/* The case where we aren't resetting, but are enablign the qpair is the same as above. */
/* The case where we aren't resetting, but are enabling the qpair is the same as above. */
ctrlr.is_resetting = false;
qpair.state = NVME_QPAIR_ENABLING;
rc = spdk_nvme_qpair_process_completions(&qpair, 0);
CU_ASSERT(rc == 0);
CU_ASSERT(rc == -ENXIO);
CU_ASSERT(g_called_transport_process_completions == false);
CU_ASSERT(!STAILQ_EMPTY(&qpair.queued_req));
CU_ASSERT(g_num_cb_passed == 0);