nvme/rdma: Correct qpair disconnect process

In the current implementation the RDMA qpair is destroyed right after
disconnect. That is not a graceful qpair shutdown process, since there
can still be requests submitted to HW and we may receive completions
for an already destroyed/freed qpair.

To avoid this, only disconnect the qpair in the ctrlr_disconnect_qpair
transport callback; all other resources are released in the
ctrlr_delete_io_qpair callback.
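
A condensed sketch of the new split (simplified from the diff below; error
handling and the wait for the RDMA_CM_EVENT_DISCONNECTED event are omitted):
the disconnect callback only tears down the RDMA connection, while resource
release moves to the delete callback.

static void
nvme_rdma_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
    struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);

    if (rqpair->rdma_qp) {
        /* Connection-level disconnect only; QP, CM id and CQ stay allocated. */
        spdk_rdma_qp_disconnect(rqpair->rdma_qp);
    }
}

static int
nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
    struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);

    /* The qpair is already disconnected at this stage, so releasing
     * the QP, the CM id and the CQ is safe. */
    spdk_rdma_qp_destroy(rqpair->rdma_qp);
    rdma_destroy_id(rqpair->cm_id);
    ibv_destroy_cq(rqpair->cq);
    nvme_rdma_free(rqpair);
    return 0;
}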

This patch is useful when nvme poll groups are used, since in that
case qpairs share a CQ; if the disconnected qpair still has WRs
submitted to HW, its destruction is deferred to the poll group.
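
A minimal sketch of that deferral, condensed from the poll-group path in the
diff (the tracker struct and list member names are assumptions based on the
existing poll-group code, not part of this change):

/* Sketch only: defer destruction of a disconnected qpair that still has
 * WRs posted to HW until its completions drain from the shared CQ. */
nvme_ctrlr_disconnect_qpair(qpair);

destroyed_qpair = calloc(1, sizeof(*destroyed_qpair));
if (destroyed_qpair != NULL) {
    destroyed_qpair->destroyed_qpair_tracker = rqpair;
    rqpair->defer_deletion_to_pg = true;   /* delete cb only marks NVME_QPAIR_DESTROYING */
    STAILQ_INSERT_TAIL(&group->destroyed_qpairs, destroyed_qpair, link);
}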

When nvme poll groups are not used, this patch doesn't change
anything; in that case the destruction flow is still ungraceful.
However, since the CQ is destroyed immediately after the qpair, we
shouldn't receive any completions which point to released resources.
A correct solution for the non-poll-group case requires an async
disconnect API, which may lead to significant rework.

There is a bug when Soft RoCE is used: we may receive a completion
with a "normal" (success) status when the qpair is already
disconnected and all nvme requests have been aborted. A workaround
for this has been added.
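
A hedged sketch of that workaround, condensed from the completion path in
the diff: a send completion for a request that was already aborted
(rdma_req->req == NULL) is tolerated only on a Soft-RoCE (rxe) device,
identified by its vendor id; on real HW it is still reported as an error.

if (spdk_unlikely(rdma_req->req == NULL)) {
    struct ibv_device_attr dev_attr;

    /* Soft RoCE may deliver a successful completion for an already
     * aborted request; only tolerate it on rxe devices. */
    if (ibv_query_device(cq->context, &dev_attr) != 0 ||
        !nvme_rdma_is_rxe_device(&dev_attr)) {
        SPDK_ERRLOG("Received malformed completion: request 0x%"PRIx64" type %d\n",
                    wc->wr_id, rdma_wr->type);
        assert(0);
    }
    continue;
}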

Signed-off-by: Alexey Marchuk <alexeymar@mellanox.com>
Change-Id: I0680d9ef9aaa8737d7a6d1454cd70a384bb8efac
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/10327
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Reviewed-by: Shuhei Matsumoto <shuheimatsumoto@gmail.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Alexey Marchuk 2021-11-18 21:45:14 +03:00 committed by Tomasz Zawadzki
parent 9cea323284
commit eb09178a59
2 changed files with 83 additions and 52 deletions


@@ -310,6 +310,7 @@ static const char *rdma_cm_event_str[] = {
struct nvme_rdma_qpair *nvme_rdma_poll_group_get_qpair_by_id(struct nvme_rdma_poll_group *group,
uint32_t qp_num);
static void nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);
static TAILQ_HEAD(, nvme_rdma_memory_domain) g_memory_domains = TAILQ_HEAD_INITIALIZER(
g_memory_domains);
@@ -1795,12 +1796,33 @@ nvme_rdma_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme
{
struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
struct nvme_rdma_ctrlr *rctrlr = NULL;
struct nvme_rdma_cm_event_entry *entry, *tmp;
int rc;
spdk_rdma_free_mem_map(&rqpair->mr_map);
nvme_rdma_unregister_reqs(rqpair);
nvme_rdma_unregister_rsps(rqpair);
if (rqpair->rdma_qp) {
rc = spdk_rdma_qp_disconnect(rqpair->rdma_qp);
rctrlr = nvme_rdma_ctrlr(qpair->ctrlr);
if (rctrlr != NULL && rc == 0) {
if (nvme_rdma_process_event(rqpair, rctrlr->cm_channel, RDMA_CM_EVENT_DISCONNECTED)) {
SPDK_DEBUGLOG(nvme, "Target did not respond to qpair disconnect.\n");
}
}
}
}
static int
nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
struct nvme_rdma_qpair *rqpair;
struct nvme_rdma_ctrlr *rctrlr = NULL;
struct nvme_rdma_cm_event_entry *entry, *tmp;
assert(qpair != NULL);
rqpair = nvme_rdma_qpair(qpair);
if (rqpair->defer_deletion_to_pg) {
nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING);
return 0;
}
if (rqpair->evt) {
rdma_ack_cm_event(rqpair->evt);
@@ -1822,42 +1844,9 @@ nvme_rdma_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme
}
}
if (rqpair->cm_id) {
if (rqpair->rdma_qp) {
rc = spdk_rdma_qp_disconnect(rqpair->rdma_qp);
if ((rctrlr != NULL) && (rc == 0)) {
if (nvme_rdma_process_event(rqpair, rctrlr->cm_channel, RDMA_CM_EVENT_DISCONNECTED)) {
SPDK_DEBUGLOG(nvme, "Target did not respond to qpair disconnect.\n");
}
}
spdk_rdma_qp_destroy(rqpair->rdma_qp);
rqpair->rdma_qp = NULL;
}
rdma_destroy_id(rqpair->cm_id);
rqpair->cm_id = NULL;
}
if (rqpair->cq) {
ibv_destroy_cq(rqpair->cq);
rqpair->cq = NULL;
}
}
static void nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);
static int
nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
struct nvme_rdma_qpair *rqpair;
assert(qpair != NULL);
rqpair = nvme_rdma_qpair(qpair);
if (rqpair->defer_deletion_to_pg) {
nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING);
return 0;
}
spdk_rdma_free_mem_map(&rqpair->mr_map);
nvme_rdma_unregister_reqs(rqpair);
nvme_rdma_unregister_rsps(rqpair);
nvme_rdma_qpair_abort_reqs(qpair, 1);
nvme_qpair_deinit(qpair);
@@ -1866,6 +1855,22 @@ nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_
nvme_rdma_free_reqs(rqpair);
nvme_rdma_free_rsps(rqpair);
if (rqpair->cm_id) {
if (rqpair->rdma_qp) {
/* qpair is already disconnected at this stage */
spdk_rdma_qp_destroy(rqpair->rdma_qp);
rqpair->rdma_qp = NULL;
}
rdma_destroy_id(rqpair->cm_id);
rqpair->cm_id = NULL;
}
if (rqpair->cq) {
ibv_destroy_cq(rqpair->cq);
rqpair->cq = NULL;
}
nvme_rdma_free(rqpair);
return 0;
@@ -2177,16 +2182,24 @@ nvme_rdma_log_wc_status(struct nvme_rdma_qpair *rqpair, struct ibv_wc *wc)
if (wc->status == IBV_WC_WR_FLUSH_ERR) {
/* If qpair is in ERR state, we will receive completions for all posted and not completed
* Work Requests with IBV_WC_WR_FLUSH_ERR status. Don't log an error in that case */
SPDK_DEBUGLOG(nvme, "WC error, qid %u, qp state %d, request 0x%lu type %d, status: (%d): %s\n",
SPDK_DEBUGLOG(nvme,
"WC error, qid %u, qp state %d, request 0x%"PRIx64" type %d, status: (%d): %s\n",
rqpair->qpair.id, rqpair->qpair.state, wc->wr_id, rdma_wr->type, wc->status,
ibv_wc_status_str(wc->status));
} else {
SPDK_ERRLOG("WC error, qid %u, qp state %d, request 0x%lu type %d, status: (%d): %s\n",
SPDK_ERRLOG("WC error, qid %u, qp state %d, request 0x%"PRIx64" type %d, status: (%d): %s\n",
rqpair->qpair.id, rqpair->qpair.state, wc->wr_id, rdma_wr->type, wc->status,
ibv_wc_status_str(wc->status));
}
}
static inline bool
nvme_rdma_is_rxe_device(struct ibv_device_attr *dev_attr)
{
return dev_attr->vendor_id == SPDK_RDMA_RXE_VENDOR_ID_OLD ||
dev_attr->vendor_id == SPDK_RDMA_RXE_VENDOR_ID_NEW;
}
static int
nvme_rdma_cq_process_completions(struct ibv_cq *cq, uint32_t batch_size,
struct nvme_rdma_poll_group *group,
@@ -2262,12 +2275,7 @@ nvme_rdma_cq_process_completions(struct ibv_cq *cq, uint32_t batch_size,
wc[i].qp_num);
}
if (!rqpair) {
/* When poll_group is used, several qpairs share the same CQ and it is possible to
* receive a completion with error (e.g. IBV_WC_WR_FLUSH_ERR) for already disconnected qpair
* That happens due to qpair is destroyed while there are submitted but not completed send/receive
* Work Requests
* TODO: ibv qpair must be destroyed only when all submitted Work Requests are completed */
assert(group);
assert(0);
continue;
}
assert(rqpair->current_num_sends > 0);
@@ -2278,6 +2286,26 @@ nvme_rdma_cq_process_completions(struct ibv_cq *cq, uint32_t batch_size,
continue;
}
if (spdk_unlikely(rdma_req->req == NULL)) {
struct ibv_device_attr dev_attr;
int query_status;
/* Bug in Soft Roce - we may receive a completion without error status when qpair is disconnected/destroyed.
* As sanity check - log an error if we use a real HW (it should never happen) */
query_status = ibv_query_device(cq->context, &dev_attr);
if (query_status == 0) {
if (!nvme_rdma_is_rxe_device(&dev_attr)) {
SPDK_ERRLOG("Received malformed completion: request 0x%"PRIx64" type %d\n", wc->wr_id,
rdma_wr->type);
assert(0);
}
} else {
SPDK_ERRLOG("Failed to query ib device\n");
assert(0);
}
continue;
}
rqpair = nvme_rdma_qpair(rdma_req->req->qpair);
rdma_req->completion_flags |= NVME_RDMA_SEND_COMPLETED;
rqpair->current_num_sends--;
@@ -2635,12 +2663,14 @@ nvme_rdma_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair)
nvme_ctrlr_disconnect_qpair(qpair);
}
/*
* If this fails, the system is in serious trouble,
* just let the qpair get cleaned up immediately.
*/
/* qpair has requests submitted to HW, need to wait for their completion.
* Allocate a tracker and attach it to poll group */
destroyed_qpair = calloc(1, sizeof(*destroyed_qpair));
if (destroyed_qpair == NULL) {
/*
* If this fails, the system is in serious trouble,
* just let the qpair get cleaned up immediately.
*/
return 0;
}


@@ -58,6 +58,7 @@ DEFINE_STUB(rdma_ack_cm_event, int, (struct rdma_cm_event *event), 0);
DEFINE_STUB_V(rdma_free_devices, (struct ibv_context **list));
DEFINE_STUB(fcntl, int, (int fd, int cmd, ...), 0);
DEFINE_STUB_V(rdma_destroy_event_channel, (struct rdma_event_channel *channel));
DEFINE_STUB(rdma_destroy_id, int, (struct rdma_cm_id *id), 0);
DEFINE_STUB(ibv_dereg_mr, int, (struct ibv_mr *mr), 0);
DEFINE_STUB(ibv_resize_cq, int, (struct ibv_cq *cq, int cqe), 0);