nvme: split non-compliant SGLs into multiple requests

Since nvme_ns_cmd.c now walks the SGL, some of the test code
also needs to be updated to initialize and return correct values
such as ctrlr->flags and sge_length.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I521213695def35d0897aabf57a0638a6c347632e
Author: Jim Harris
Date:   2016-12-16 09:41:18 -07:00
Commit: 86e8a920bf (parent: 38c09e5eed)
5 changed files with 211 additions and 28 deletions
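
For context: on controllers that do not support NVMe SGLs, the PCIe driver describes each payload with PRP entries, which require every scattered element except the first to start on a page boundary and every element except the last to end on one. The commit below splits SGL payloads that violate that rule into multiple child requests, each of which is individually PRP-compliant. The standalone sketch of the compliance rule below is illustrative only: it is not part of the commit, assumes 4 KiB pages, and uses a hypothetical iov_is_prp_compliant helper.

#include <stdbool.h>
#include <stdint.h>
#include <sys/uio.h>

#define PRP_PAGE_SIZE 4096u

/*
 * Hypothetical helper (not part of this commit): returns true if the iovec
 * array could be described by a single PRP list, i.e. every element except
 * the first starts on a page boundary and every element except the last
 * ends on one. Payloads that fail this check are what the new splitting
 * code breaks into child requests.
 */
static bool
iov_is_prp_compliant(const struct iovec *iov, int iovcnt)
{
	for (int i = 0; i < iovcnt; i++) {
		uintptr_t start = (uintptr_t)iov[i].iov_base;
		uintptr_t end = start + iov[i].iov_len;

		if (i != 0 && (start & (PRP_PAGE_SIZE - 1)) != 0) {
			return false;
		}
		if (i != iovcnt - 1 && (end & (PRP_PAGE_SIZE - 1)) != 0) {
			return false;
		}
	}
	return true;
}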

@ -133,7 +133,7 @@ struct __attribute__((packed)) nvme_payload {
/**
* Functions for retrieving physical addresses for scattered payloads.
*/
struct {
struct nvme_sgl_args {
spdk_nvme_req_reset_sgl_cb reset_sgl_fn;
spdk_nvme_req_next_sge_cb next_sge_fn;
void *cb_arg;
@ -603,4 +603,10 @@ void nvme_ctrlr_proc_get_ref(struct spdk_nvme_ctrlr *ctrlr);
void nvme_ctrlr_proc_put_ref(struct spdk_nvme_ctrlr *ctrlr);
int nvme_ctrlr_get_ref_count(struct spdk_nvme_ctrlr *ctrlr);
static inline bool
_is_page_aligned(uint64_t address)
{
return (address & (PAGE_SIZE - 1)) == 0;
}
#endif /* __NVME_INTERNAL_H__ */
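
A quick standalone check of the new helper's mask test, assuming PAGE_SIZE is 0x1000 as elsewhere in the driver (illustrative only; the helper body is copied here so the snippet compiles on its own):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE 0x1000

static inline bool
_is_page_aligned(uint64_t address)
{
	return (address & (PAGE_SIZE - 1)) == 0;
}

int main(void)
{
	assert(_is_page_aligned(0x10000000));          /* on a page boundary */
	assert(!_is_page_aligned(0x10000400));         /* 1 KiB into a page */
	assert(!_is_page_aligned(0x10000000 + 0x800)); /* 2 KiB into a page */
	return 0;
}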

@ -37,7 +37,7 @@ static struct nvme_request *_nvme_ns_cmd_rw(struct spdk_nvme_ns *ns,
const struct nvme_payload *payload, uint32_t payload_offset, uint32_t md_offset,
uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn,
void *cb_arg, uint32_t opc, uint32_t io_flags,
uint16_t apptag_mask, uint16_t apptag);
uint16_t apptag_mask, uint16_t apptag, bool check_sgl);
static void
nvme_cb_complete_child(void *child_arg, const struct spdk_nvme_cpl *cpl)
@ -114,12 +114,12 @@ _nvme_add_child_request(struct spdk_nvme_ns *ns, const struct nvme_payload *payl
uint32_t payload_offset, uint32_t md_offset,
uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc,
uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag,
struct nvme_request *parent)
struct nvme_request *parent, bool check_sgl)
{
struct nvme_request *child;
child = _nvme_ns_cmd_rw(ns, payload, payload_offset, md_offset, lba, lba_count, cb_fn,
cb_arg, opc, io_flags, apptag_mask, apptag);
cb_arg, opc, io_flags, apptag_mask, apptag, check_sgl);
if (child == NULL) {
nvme_request_free_children(parent);
nvme_free_request(parent);
@ -157,7 +157,7 @@ _nvme_ns_cmd_split_request(struct spdk_nvme_ns *ns,
child = _nvme_add_child_request(ns, payload, payload_offset, md_offset,
lba, lba_count, cb_fn, cb_arg, opc,
io_flags, apptag_mask, apptag, req);
io_flags, apptag_mask, apptag, req, true);
if (child == NULL) {
return NULL;
}
@ -200,11 +200,117 @@ _nvme_ns_cmd_setup_request(struct spdk_nvme_ns *ns, struct nvme_request *req,
cmd->cdw15 = (cmd->cdw15 << 16 | apptag);
}
static struct nvme_request *
_nvme_ns_cmd_split_sgl_request(struct spdk_nvme_ns *ns,
const struct nvme_payload *payload,
uint32_t payload_offset, uint32_t md_offset,
uint64_t lba, uint32_t lba_count,
spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc,
uint32_t io_flags, struct nvme_request *req,
uint16_t apptag_mask, uint16_t apptag)
{
struct nvme_sgl_args *args;
bool start_valid, end_valid, last_sge, child_equals_parent;
uint64_t child_lba = lba;
uint32_t req_current_length = 0;
uint32_t child_length = 0;
uint32_t sge_length;
uintptr_t address;
args = &req->payload.u.sgl;
args->reset_sgl_fn(args->cb_arg, payload_offset);
args->next_sge_fn(args->cb_arg, (void **)&address, &sge_length);
while (req_current_length < req->payload_size) {
if (sge_length == 0) {
continue;
} else if (req_current_length + sge_length > req->payload_size) {
sge_length = req->payload_size - req_current_length;
}
/*
* The start of the SGE is invalid if the start address is not page aligned,
* unless it is the first SGE in the child request.
*/
start_valid = child_length == 0 || _is_page_aligned(address);
/* Boolean for whether this is the last SGE in the parent request. */
last_sge = (req_current_length + sge_length == req->payload_size);
/*
* The end of the SGE is invalid if the end address is not page aligned,
* unless it is the last SGE in the parent request.
*/
end_valid = last_sge || _is_page_aligned(address + sge_length);
/*
* This child request equals the parent request, meaning that no splitting
* was required for the parent request (the one passed into this function).
* In this case, we do not create a child request at all - we just send
* the original request as a single request at the end of this function.
*/
child_equals_parent = (child_length + sge_length == req->payload_size);
if (start_valid) {
/*
* The start of the SGE is valid, so advance the length parameters,
* to include this SGE with previous SGEs for this child request
* (if any). If it is not valid, we do not advance the length
* parameters nor get the next SGE, because we must send what has
* been collected before this SGE as a child request.
*/
child_length += sge_length;
req_current_length += sge_length;
if (req_current_length < req->payload_size) {
args->next_sge_fn(args->cb_arg, (void **)&address, &sge_length);
}
}
/*
* If one of these criteria is met, send what we have accumulated so far as a
* child request.
*/
if (!start_valid || !end_valid || !child_equals_parent) {
struct nvme_request *child;
uint32_t child_lba_count;
if ((child_length % ns->sector_size) != 0) {
return NULL;
}
child_lba_count = child_length / ns->sector_size;
/*
* Note the last parameter is set to "false" - this tells the recursive
* call to _nvme_ns_cmd_rw() to not bother with checking for SGL splitting
* since we have already verified it here.
*/
child = _nvme_add_child_request(ns, payload, payload_offset, md_offset,
child_lba, child_lba_count,
cb_fn, cb_arg, opc, io_flags,
apptag_mask, apptag, req, false);
if (child == NULL) {
return NULL;
}
payload_offset += child_length;
md_offset += child_lba_count * ns->md_size;
child_lba += child_lba_count;
child_length = 0;
}
}
if (child_length == req->payload_size) {
/* No splitting was required, so setup the whole payload as one request. */
_nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag);
}
return req;
}
static struct nvme_request *
_nvme_ns_cmd_rw(struct spdk_nvme_ns *ns, const struct nvme_payload *payload,
uint32_t payload_offset, uint32_t md_offset,
uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc,
uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag)
uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag, bool check_sgl)
{
struct nvme_request *req;
uint32_t sector_size;
@ -250,6 +356,10 @@ _nvme_ns_cmd_rw(struct spdk_nvme_ns *ns, const struct nvme_payload *payload,
return _nvme_ns_cmd_split_request(ns, payload, payload_offset, md_offset, lba, lba_count, cb_fn,
cb_arg, opc,
io_flags, req, sectors_per_max_io, 0, apptag_mask, apptag);
} else if (req->payload.type == NVME_PAYLOAD_TYPE_SGL && check_sgl &&
!(ns->ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED)) {
return _nvme_ns_cmd_split_sgl_request(ns, payload, payload_offset, md_offset, lba, lba_count,
cb_fn, cb_arg, opc, io_flags, req, apptag_mask, apptag);
}
_nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag);
@ -271,7 +381,7 @@ spdk_nvme_ns_cmd_read(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, vo
req = _nvme_ns_cmd_rw(ns, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ,
io_flags, 0,
0);
0, true);
if (req != NULL) {
return nvme_qpair_submit_request(qpair, req);
} else {
@ -295,7 +405,7 @@ spdk_nvme_ns_cmd_read_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *q
req = _nvme_ns_cmd_rw(ns, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ,
io_flags,
apptag_mask, apptag);
apptag_mask, apptag, true);
if (req != NULL) {
return nvme_qpair_submit_request(qpair, req);
} else {
@ -323,7 +433,7 @@ spdk_nvme_ns_cmd_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
payload.u.sgl.cb_arg = cb_arg;
req = _nvme_ns_cmd_rw(ns, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ,
io_flags, 0, 0);
io_flags, 0, 0, true);
if (req != NULL) {
return nvme_qpair_submit_request(qpair, req);
} else {
@ -345,7 +455,7 @@ spdk_nvme_ns_cmd_write(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
payload.md = NULL;
req = _nvme_ns_cmd_rw(ns, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE,
io_flags, 0, 0);
io_flags, 0, 0, true);
if (req != NULL) {
return nvme_qpair_submit_request(qpair, req);
} else {
@ -367,7 +477,7 @@ spdk_nvme_ns_cmd_write_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *
payload.md = metadata;
req = _nvme_ns_cmd_rw(ns, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE,
io_flags, apptag_mask, apptag);
io_flags, apptag_mask, apptag, true);
if (req != NULL) {
return nvme_qpair_submit_request(qpair, req);
} else {
@ -395,7 +505,7 @@ spdk_nvme_ns_cmd_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
payload.u.sgl.cb_arg = cb_arg;
req = _nvme_ns_cmd_rw(ns, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE,
io_flags, 0, 0);
io_flags, 0, 0, true);
if (req != NULL) {
return nvme_qpair_submit_request(qpair, req);
} else {
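
To make the splitting decision concrete, the standalone trace below (illustrative only, not driver code) mirrors the simplified start_valid / end_valid / child_equals_parent logic above for a payload shaped like the one build_io_request_8 constructs later in this commit: two 1 KiB elements, each starting 0x400 into its own page. The first element's end and the second element's start are both unaligned, so the payload ends up as two 1 KiB child requests.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 0x1000u

struct sge { uintptr_t addr; uint32_t len; };

int main(void)
{
	/* Two 1 KiB SGEs, each starting 0x400 into its own page. */
	struct sge sges[] = {
		{ 0x10000400, 0x400 },
		{ 0x20000400, 0x400 },
	};
	uint32_t payload_size = 0x800;
	uint32_t current = 0, child_len = 0, i = 0;

	while (current < payload_size) {
		struct sge *s = &sges[i];
		bool start_valid = (child_len == 0) || ((s->addr & (PAGE_SIZE - 1)) == 0);
		bool last_sge = (current + s->len == payload_size);
		bool end_valid = last_sge || (((s->addr + s->len) & (PAGE_SIZE - 1)) == 0);
		bool child_equals_parent = (child_len + s->len == payload_size);

		if (start_valid) {
			/* Attach this SGE to the child being built and move on. */
			child_len += s->len;
			current += s->len;
			i++;
		}
		if (!start_valid || !end_valid || !child_equals_parent) {
			/* Flush what has been accumulated as one child request. */
			printf("child request: %u bytes\n", child_len);
			child_len = 0;
		}
	}
	return 0;
}

In the real path each flushed child must also be a whole number of sectors (child_length % ns->sector_size == 0), which these 1024-byte children satisfy for 512-byte sectors.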

@ -1666,12 +1666,15 @@ nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvm
return -1;
}
/* Confirm that this sge is prp compatible. */
if (phys_addr & 0x3 ||
(length < remaining_transfer_len && ((phys_addr + length) & (PAGE_SIZE - 1)))) {
nvme_pcie_fail_request_bad_vtophys(qpair, tr);
return -1;
}
/*
* Any incompatible sges should have been handled up in the splitting routine,
* but assert here as an additional check.
*/
assert((phys_addr & 0x3) == 0); /* Address must be dword aligned. */
/* All SGEs except the last must end on a page boundary. */
assert((length >= remaining_transfer_len) || _is_page_aligned(phys_addr + length));
/* All SGEs except the first must start on a page boundary. */
assert((sge_count == 0) || _is_page_aligned(phys_addr));
data_transferred = nvme_min(remaining_transfer_len, length);
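
Once every SGE obeys the alignment rules these asserts enforce, the whole payload can be covered by one flat run of page addresses, which is what a PRP list holds: the first entry keeps its in-page offset (it becomes PRP1) and every later entry is a page start. A simplified sketch of that expansion (illustrative only, not the driver code; it treats the addresses as already-translated physical addresses and assumes 4 KiB pages):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 0x1000u

struct sge { uint64_t phys_addr; uint32_t len; };

static void
print_prp_entries(const struct sge *sges, int nseg)
{
	int entry = 0;

	for (int i = 0; i < nseg; i++) {
		uint64_t addr = sges[i].phys_addr;
		uint64_t end = addr + sges[i].len;

		while (addr < end) {
			/* First entry keeps its offset; later ones land on page starts. */
			printf("PRP entry %d: 0x%llx\n", entry++, (unsigned long long)addr);
			addr = (addr & ~(uint64_t)(PAGE_SIZE - 1)) + PAGE_SIZE;
		}
	}
}

int main(void)
{
	/* An SGE starting 0x200 into a page and ending on a page boundary,
	 * followed by a page-aligned two-page SGE: three PRP entries total. */
	struct sge sges[] = { { 0x100000200ULL, 0xE00 }, { 0x200000000ULL, 0x2000 } };

	print_prp_entries(sges, 2);
	return 0;
}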

@ -232,6 +232,48 @@ static void build_io_request_7(struct io_request *req)
req->iovs[0].len = 0x10000;
}
static void build_io_request_8(struct io_request *req)
{
req->nseg = 2;
/*
* 1KB for 1st sge; make sure the iov neither starts nor ends
* on a 0x1000 boundary
*/
req->iovs[0].base = spdk_zmalloc(0x1000, 0x1000, NULL);
req->iovs[0].offset = 0x400;
req->iovs[0].len = 0x400;
/*
* 1KB for 2nd sge; make sure the iov neither starts nor ends
* on a 0x1000 boundary
*/
req->iovs[1].base = spdk_zmalloc(0x1000, 0x1000, NULL);
req->iovs[1].offset = 0x400;
req->iovs[1].len = 0x400;
}
static void build_io_request_9(struct io_request *req)
{
/*
* Check that mixed PRP-compliant and non-compliant requests are handled
* properly by splitting them into subrequests.
* Construct buffers with the following layout:
*/
const size_t req_len[] = { 2048, 4096, 2048, 4096, 2048, 1024 };
const size_t req_off[] = { 0x800, 0x0, 0x0, 0x100, 0x800, 0x800 };
struct sgl_element *iovs = req->iovs;
uint32_t i;
req->nseg = sizeof(req_len) / sizeof(req_len[0]);
assert(sizeof(req_len) / sizeof(req_len[0]) == sizeof(req_off) / sizeof(req_off[0]));
for (i = 0; i < req->nseg; i++) {
iovs[i].base = spdk_zmalloc(req_off[i] + req_len[i], 0x4000, NULL);
iovs[i].offset = req_off[i];
iovs[i].len = req_len[i];
}
}
typedef void (*nvme_build_io_req_fn_t)(struct io_request *req);
static void
@ -442,7 +484,9 @@ int main(int argc, char **argv)
|| TEST(build_io_request_4)
|| TEST(build_io_request_5)
|| TEST(build_io_request_6)
|| TEST(build_io_request_7)) {
|| TEST(build_io_request_7)
|| TEST(build_io_request_8)
|| TEST(build_io_request_9)) {
#undef TEST
rc = 1;
printf("%s: failed sgl tests\n", iter->name);
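
For reference, the callbacks handed to spdk_nvme_ns_cmd_readv()/writev() for payloads like these generally look something like the sketch below (an assumed implementation for illustration, not the test's verbatim code). The new splitting path in nvme_ns_cmd.c drives exactly this reset_sgl/next_sge pair when it walks the payload.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct sgl_element {
	void *base;
	uint64_t offset;
	size_t len;
};

struct sgl_walker {
	struct sgl_element *iovs;
	uint32_t nseg;
	uint32_t idx;	/* current element */
	uint64_t skip;	/* bytes already consumed inside the current element */
};

/* Rewind the walker to sgl_offset bytes into the payload. */
static void
walker_reset_sgl(void *cb_arg, uint32_t sgl_offset)
{
	struct sgl_walker *w = cb_arg;

	w->idx = 0;
	while (w->idx < w->nseg && sgl_offset >= w->iovs[w->idx].len) {
		sgl_offset -= w->iovs[w->idx].len;
		w->idx++;
	}
	w->skip = sgl_offset;
}

/* Hand back the current element's address and remaining length, then advance. */
static int
walker_next_sge(void *cb_arg, void **address, uint32_t *length)
{
	struct sgl_walker *w = cb_arg;
	struct sgl_element *e = &w->iovs[w->idx];

	*address = (uint8_t *)e->base + e->offset + w->skip;
	*length = (uint32_t)(e->len - w->skip);
	w->skip = 0;
	w->idx++;
	return 0;
}

int main(void)
{
	/* Two 1 KiB elements, each 0x400 into its own buffer, as in request 8. */
	struct sgl_element iovs[2] = {
		{ .base = (void *)0x10000000, .offset = 0x400, .len = 0x400 },
		{ .base = (void *)0x20000000, .offset = 0x400, .len = 0x400 },
	};
	struct sgl_walker w = { .iovs = iovs, .nseg = 2 };
	void *addr;
	uint32_t len;

	walker_reset_sgl(&w, 0);
	while (w.idx < w.nseg) {
		walker_next_sge(&w, &addr, &len);
		printf("sge: %p, %u bytes\n", addr, len);
	}
	return 0;
}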

@ -58,6 +58,16 @@ static void nvme_request_reset_sgl(void *cb_arg, uint32_t sgl_offset)
static int nvme_request_next_sge(void *cb_arg, void **address, uint32_t *length)
{
uint32_t *lba_count = cb_arg;
/*
* We need to set address to something here, since the SGL splitting code will
* use it to determine PRP compatibility. Just use a rather arbitrary address
* for now - these tests will not actually cause data to be read from or written
* to this address.
*/
*address = (void *)(uintptr_t)0x10000000;
*length = *lba_count;
return 0;
}
@ -187,6 +197,11 @@ prepare_for_test(struct spdk_nvme_ns *ns, struct spdk_nvme_ctrlr *ctrlr,
uint32_t stripe_size)
{
ctrlr->max_xfer_size = max_xfer_size;
/*
* Clear the flags field - we especially want to make sure the SGL_SUPPORTED flag is not set
* so that we test the SGL splitting path.
*/
ctrlr->flags = 0;
memset(ns, 0, sizeof(*ns));
ns->ctrlr = ctrlr;
ns->sector_size = sector_size;
@ -583,11 +598,14 @@ test_nvme_ns_cmd_readv(void)
struct spdk_nvme_qpair qpair;
int rc = 0;
void *cb_arg;
uint32_t lba_count = 256;
uint32_t sector_size = 512;
uint64_t sge_length = lba_count * sector_size;
cb_arg = malloc(512);
prepare_for_test(&ns, &ctrlr, &qpair, 512, 128 * 1024, 0);
rc = spdk_nvme_ns_cmd_readv(&ns, &qpair, 0x1000, 256, NULL, cb_arg, 0, nvme_request_reset_sgl,
nvme_request_next_sge);
prepare_for_test(&ns, &ctrlr, &qpair, sector_size, 128 * 1024, 0);
rc = spdk_nvme_ns_cmd_readv(&ns, &qpair, 0x1000, lba_count, NULL, &sge_length, 0,
nvme_request_reset_sgl, nvme_request_next_sge);
SPDK_CU_ASSERT_FATAL(rc == 0);
SPDK_CU_ASSERT_FATAL(g_request != NULL);
@ -595,7 +613,7 @@ test_nvme_ns_cmd_readv(void)
CU_ASSERT(g_request->payload.type == NVME_PAYLOAD_TYPE_SGL);
CU_ASSERT(g_request->payload.u.sgl.reset_sgl_fn == nvme_request_reset_sgl);
CU_ASSERT(g_request->payload.u.sgl.next_sge_fn == nvme_request_next_sge);
CU_ASSERT(g_request->payload.u.sgl.cb_arg == cb_arg);
CU_ASSERT(g_request->payload.u.sgl.cb_arg == &sge_length);
CU_ASSERT(g_request->cmd.nsid == ns.id);
rc = spdk_nvme_ns_cmd_readv(&ns, &qpair, 0x1000, 256, NULL, cb_arg, 0, nvme_request_reset_sgl,
@ -614,12 +632,14 @@ test_nvme_ns_cmd_writev(void)
struct spdk_nvme_qpair qpair;
int rc = 0;
void *cb_arg;
uint32_t lba_count = 256;
uint32_t sector_size = 512;
uint64_t sge_length = lba_count * sector_size;
cb_arg = malloc(512);
prepare_for_test(&ns, &ctrlr, &qpair, 512, 128 * 1024, 0);
rc = spdk_nvme_ns_cmd_writev(&ns, &qpair, 0x1000, 256, NULL, cb_arg, 0,
nvme_request_reset_sgl,
nvme_request_next_sge);
prepare_for_test(&ns, &ctrlr, &qpair, sector_size, 128 * 1024, 0);
rc = spdk_nvme_ns_cmd_writev(&ns, &qpair, 0x1000, lba_count, NULL, &sge_length, 0,
nvme_request_reset_sgl, nvme_request_next_sge);
SPDK_CU_ASSERT_FATAL(rc == 0);
SPDK_CU_ASSERT_FATAL(g_request != NULL);
@ -627,7 +647,7 @@ test_nvme_ns_cmd_writev(void)
CU_ASSERT(g_request->payload.type == NVME_PAYLOAD_TYPE_SGL);
CU_ASSERT(g_request->payload.u.sgl.reset_sgl_fn == nvme_request_reset_sgl);
CU_ASSERT(g_request->payload.u.sgl.next_sge_fn == nvme_request_next_sge);
CU_ASSERT(g_request->payload.u.sgl.cb_arg == cb_arg);
CU_ASSERT(g_request->payload.u.sgl.cb_arg == &sge_length);
CU_ASSERT(g_request->cmd.nsid == ns.id);
rc = spdk_nvme_ns_cmd_writev(&ns, &qpair, 0x1000, 256, NULL, cb_arg, 0,