nvme: don't disconnect qpairs from admin thread.

Disconnecting qpairs from the admin thread during a reset led to an
inevitable race with the data thread. QP-related memory is freed during
the disconnect and cannot be touched from the other threads.

The only way to fix this is to force the qpair disconnect onto the
data thread.

This requires a small change in the way that resets are handled for
pcie. Please see the code in reset.c for that change.

fixes: bb01a089

Signed-off-by: Seth Howell <seth.howell@intel.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/472749 (master)

(cherry picked from commit 13f30a254e)
Change-Id: I8a39e444c7cbbe85fafca42ffd040e929721ce95
Signed-off-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/472960
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Seth Howell <seth.howell@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
This commit is contained in:
Seth Howell 2019-10-29 14:09:36 -07:00 committed by Tomasz Zawadzki
parent 1f737887cb
commit ec6de131f7
4 changed files with 21 additions and 6 deletions

View File

@ -1091,6 +1091,8 @@ struct spdk_nvme_qpair *spdk_nvme_ctrlr_alloc_io_qpair(struct spdk_nvme_ctrlr *c
* This function is intended to be called on qpairs that have already been connected,
* but have since entered a failed state as indicated by a return value of -ENXIO from
* either spdk_nvme_qpair_process_completions or one of the spdk_nvme_ns_cmd_* functions.
* This function must be called from the same thread as spdk_nvme_qpair_process_completions
* and the spdk_nvme_ns_cmd_* functions.
*
* \param qpair The qpair to reconnect.
*

View File

@ -412,13 +412,18 @@ spdk_nvme_ctrlr_reconnect_io_qpair(struct spdk_nvme_qpair *qpair)
goto out;
}
/* We have to confirm that any old memory is cleaned up. */
nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair);
rc = nvme_transport_ctrlr_connect_qpair(ctrlr, qpair);
if (rc) {
nvme_qpair_set_state(qpair, NVME_QPAIR_DISABLED);
qpair->transport_qp_is_failed = true;
rc = -EAGAIN;
goto out;
}
nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED);
qpair->transport_qp_is_failed = false;
out:
nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
@ -1073,7 +1078,7 @@ spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr)
/* Disable all queues before disabling the controller hardware. */
TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) {
nvme_qpair_set_state(qpair, NVME_QPAIR_DISABLED);
nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair);
qpair->transport_qp_is_failed = true;
}
nvme_qpair_set_state(ctrlr->adminq, NVME_QPAIR_DISABLED);
nvme_qpair_complete_error_reqs(ctrlr->adminq);
@ -1102,7 +1107,14 @@ spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr)
}
}
if (rc == 0) {
/*
* For PCIe controllers, the memory locations of the transport qpair
* don't change when the controller is reset. They simply need to be
* re-enabled with admin commands to the controller. For fabric
* controllers we need to disconnect and reconnect the qpair on its
* own thread outside of the context of the reset.
*/
if (rc == 0 && ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
/* Reinitialize qpairs */
TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) {
if (nvme_transport_ctrlr_connect_qpair(ctrlr, qpair) != 0) {
@ -1111,6 +1123,7 @@ spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr)
continue;
}
nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED);
qpair->transport_qp_is_failed = false;
}
}

View File

@ -458,7 +458,7 @@ spdk_nvme_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_
* qpair is not enabled, likely because a controller reset is
* in progress.
*/
return 0;
return -ENXIO;
}
/* error injection for those queued error requests */

View File

@ -243,18 +243,18 @@ static void test_nvme_qpair_process_completions(void)
ctrlr.is_removed = false;
ctrlr.is_resetting = true;
rc = spdk_nvme_qpair_process_completions(&qpair, 0);
CU_ASSERT(rc == 0);
CU_ASSERT(rc == -ENXIO);
CU_ASSERT(g_called_transport_process_completions == false);
/* We also need to make sure we didn't abort the requests. */
CU_ASSERT(!STAILQ_EMPTY(&qpair.queued_req));
CU_ASSERT(g_num_cb_passed == 0);
CU_ASSERT(g_num_cb_failed == 0);
/* The case where we aren't resetting, but are enablign the qpair is the same as above. */
/* The case where we aren't resetting, but are enabling the qpair is the same as above. */
ctrlr.is_resetting = false;
qpair.state = NVME_QPAIR_ENABLING;
rc = spdk_nvme_qpair_process_completions(&qpair, 0);
CU_ASSERT(rc == 0);
CU_ASSERT(rc == -ENXIO);
CU_ASSERT(g_called_transport_process_completions == false);
CU_ASSERT(!STAILQ_EMPTY(&qpair.queued_req));
CU_ASSERT(g_num_cb_passed == 0);