Clean up uio-related code to use struct nvme_request and nvme_ctrlr_submit_io_request().

While here, also fix the case where a uio has more than one iovec.
NVMe's definition of SGEs (called PRPs) only allows the first SGE to
start on a non-page boundary.  The simplest way to handle this is to
construct a temporary uio for each iovec and submit an NVMe request
for each.

Sponsored by:	Intel
jimharris 2012-10-18 00:40:40 +00:00
parent c9e224f9c9
commit 9becceac55
5 changed files with 118 additions and 86 deletions
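
To make the PRP constraint in the commit message concrete: a PRP-based NVMe command can only describe a buffer whose pieces, after the first, all begin on page boundaries (and, except for the last, also end on them). The sketch below is an editor's illustration, not part of the commit; the helper name and EXAMPLE_PAGE_SIZE are made up for the example. Any iovec list that fails this check has to be split into one command per iovec, which is what the new loop in nvme_ns_physio (last file below) does.

#include <stdbool.h>
#include <stdint.h>
#include <sys/uio.h>

#define EXAMPLE_PAGE_SIZE	4096	/* illustrative; the driver uses PAGE_SIZE */

/*
 * Editor's sketch: returns true if the iovec list could be expressed as a
 * single PRP-based command.  Only the first element may start mid-page and
 * only the last element may end mid-page.
 */
static bool
iovecs_fit_single_prp_command(const struct iovec *iov, int iovcnt)
{
        uintptr_t addr;
        int i;

        for (i = 0; i < iovcnt; i++) {
                addr = (uintptr_t)iov[i].iov_base;
                if (i > 0 && (addr & (EXAMPLE_PAGE_SIZE - 1)) != 0)
                        return (false);
                if (i < iovcnt - 1 &&
                    ((addr + iov[i].iov_len) & (EXAMPLE_PAGE_SIZE - 1)) != 0)
                        return (false);
        }
        return (true);
}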


@@ -230,15 +230,11 @@ nvme_dump_completion(struct nvme_completion *cpl)
void
nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
{
struct nvme_tracker *tr;
struct nvme_qpair *qpair;
struct nvme_tracker *tr = arg;
uint32_t cur_nseg;
KASSERT(error == 0, ("nvme_payload_map error != 0\n"));
tr = (struct nvme_tracker *)arg;
qpair = tr->qpair;
/*
* Note that we specified PAGE_SIZE for alignment and max
* segment size when creating the bus dma tags. So here
@@ -259,7 +255,7 @@ nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
}
}
nvme_qpair_submit_cmd(qpair, tr);
nvme_qpair_submit_cmd(tr->qpair, tr);
}
static int


@@ -833,12 +833,21 @@ nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
tr->req = req;
if (req->payload_size > 0) {
err = bus_dmamap_load(tr->qpair->dma_tag, tr->payload_dma_map,
req->payload, req->payload_size,
nvme_payload_map, tr, 0);
if (req->uio == NULL) {
if (req->payload_size > 0) {
err = bus_dmamap_load(tr->qpair->dma_tag,
tr->payload_dma_map, req->payload,
req->payload_size,
nvme_payload_map, tr, 0);
if (err != 0)
panic("bus_dmamap_load returned non-zero!\n");
} else
nvme_qpair_submit_cmd(tr->qpair, tr);
} else {
err = bus_dmamap_load_uio(tr->qpair->dma_tag,
tr->payload_dma_map, req->uio,
nvme_payload_map_uio, tr, 0);
if (err != 0)
panic("bus_dmamap_load returned non-zero!\n");
} else
nvme_qpair_submit_cmd(tr->qpair, tr);
}
}


@@ -101,6 +101,7 @@ struct nvme_request {
struct nvme_command cmd;
void *payload;
uint32_t payload_size;
struct uio *uio;
nvme_cb_fn_t cb_fn;
void *cb_arg;
SLIST_ENTRY(nvme_request) slist;
@@ -333,6 +334,8 @@ void nvme_ctrlr_cmd_asynchronous_event_request(struct nvme_controller *ctrlr,
void nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg,
int error);
void nvme_payload_map_uio(void *arg, bus_dma_segment_t *seg, int nseg,
bus_size_t mapsize, int error);
int nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev);
int nvme_ctrlr_reset(struct nvme_controller *ctrlr);
@@ -392,6 +395,22 @@ nvme_allocate_request(void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn,
return (req);
}
static __inline struct nvme_request *
nvme_allocate_request_uio(struct uio *uio, nvme_cb_fn_t cb_fn, void *cb_arg)
{
struct nvme_request *req;
req = uma_zalloc(nvme_request_zone, M_NOWAIT | M_ZERO);
if (req == NULL)
return (NULL);
req->uio = uio;
req->cb_fn = cb_fn;
req->cb_arg = cb_arg;
return (req);
}
#define nvme_free_request(req) uma_zfree(nvme_request_zone, req)
#endif /* __NVME_PRIVATE_H__ */
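
For orientation, an editor's sketch (not part of the commit) of how the new allocator is meant to be used: pair nvme_allocate_request_uio() with nvme_ctrlr_submit_io_request(), which now performs the bus_dmamap_load_uio() when req->uio is set. The function name submit_uio_sketch is hypothetical; the calls it makes are the ones added by this commit.

static int
submit_uio_sketch(struct nvme_controller *ctrlr, struct uio *uio,
    nvme_cb_fn_t done_fn, void *done_arg)
{
        struct nvme_request *req;

        req = nvme_allocate_request_uio(uio, done_fn, done_arg);
        if (req == NULL)
                return (ENOMEM);

        /*
         * The caller fills in req->cmd here (opcode, nsid, starting LBA in
         * cdw10/11, zero-based block count in cdw12) before submitting.
         */
        nvme_ctrlr_submit_io_request(ctrlr, req);
        return (0);
}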


@@ -163,7 +163,7 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
/* nvme_qpair_submit_cmd() will release the lock. */
nvme_qpair_submit_cmd(qpair, tr);
else {
if (req->payload_size > 0)
if (req->payload_size > 0 || req->uio != NULL)
bus_dmamap_unload(qpair->dma_tag,
tr->payload_dma_map);


@@ -38,8 +38,10 @@ static void
nvme_uio_done(void *arg, const struct nvme_completion *status)
{
struct mtx *mtx;
struct uio *uio = arg;
/* TODO: update uio flags based on status */
if (status->sf_sc == 0 && status->sf_sct == 0)
uio->uio_resid = 0;
mtx = mtx_pool_find(mtxpool_sleep, arg);
mtx_lock(mtx);
@@ -47,33 +49,17 @@ nvme_uio_done(void *arg, const struct nvme_completion *status)
mtx_unlock(mtx);
}
static struct nvme_tracker *
nvme_allocate_tracker_uio(struct nvme_controller *ctrlr, struct uio *uio,
struct nvme_request *req)
{
struct nvme_tracker *tr;
struct nvme_qpair *qpair;
if (ctrlr->per_cpu_io_queues)
qpair = &ctrlr->ioq[curcpu];
else
qpair = &ctrlr->ioq[0];
tr = nvme_qpair_allocate_tracker(qpair);
if (tr == NULL)
return (NULL);
tr->qpair = qpair;
tr->req = req;
return (tr);
}
static void
void
nvme_payload_map_uio(void *arg, bus_dma_segment_t *seg, int nseg,
bus_size_t mapsize, int error)
{
struct nvme_tracker *tr = arg;
/*
* Now that we know the actual size of the uio, divide it by the
* sector size that we stored in cdw12.
*/
tr->req->cmd.cdw12 = (mapsize / tr->req->cmd.cdw12)-1;
nvme_payload_map(arg, seg, nseg, error);
}
@@ -81,20 +67,12 @@ static int
nvme_read_uio(struct nvme_namespace *ns, struct uio *uio)
{
struct nvme_request *req;
struct nvme_tracker *tr;
struct nvme_command *cmd;
int err, i;
uint64_t lba, iosize = 0;
uint64_t lba;
for (i = 0; i < uio->uio_iovcnt; i++) {
iosize += uio->uio_iov[i].iov_len;
}
req = nvme_allocate_request_uio(uio, nvme_uio_done, uio);
req = nvme_allocate_request(NULL, iosize, nvme_uio_done, uio);
tr = nvme_allocate_tracker_uio(ns->ctrlr, uio, req);
if (tr == NULL)
if (req == NULL)
return (ENOMEM);
cmd = &req->cmd;
@@ -103,13 +81,16 @@ nvme_read_uio(struct nvme_namespace *ns, struct uio *uio)
lba = uio->uio_offset / nvme_ns_get_sector_size(ns);
*(uint64_t *)&cmd->cdw10 = lba;
/*
* Store the sector size in cdw12 (where the LBA count normally goes).
* We'll adjust cdw12 in the map_uio callback based on the mapsize
* parameter. This allows us to not have to store the namespace
* in the request simply to get the sector size in the map_uio
* callback.
*/
cmd->cdw12 = nvme_ns_get_sector_size(ns);
cmd->cdw12 = (iosize / nvme_ns_get_sector_size(ns))-1;
err = bus_dmamap_load_uio(tr->qpair->dma_tag, tr->payload_dma_map, uio,
nvme_payload_map_uio, tr, 0);
KASSERT(err == 0, ("bus_dmamap_load_uio returned non-zero!\n"));
nvme_ctrlr_submit_io_request(ns->ctrlr, req);
return (0);
}
@@ -118,20 +99,12 @@ static int
nvme_write_uio(struct nvme_namespace *ns, struct uio *uio)
{
struct nvme_request *req;
struct nvme_tracker *tr;
struct nvme_command *cmd;
int err, i;
uint64_t lba, iosize = 0;
uint64_t lba;
for (i = 0; i < uio->uio_iovcnt; i++) {
iosize += uio->uio_iov[i].iov_len;
}
req = nvme_allocate_request_uio(uio, nvme_uio_done, uio);
req = nvme_allocate_request(NULL, iosize, nvme_uio_done, uio);
tr = nvme_allocate_tracker_uio(ns->ctrlr, uio, req);
if (tr == NULL)
if (req == NULL)
return (ENOMEM);
cmd = &req->cmd;
@@ -140,13 +113,16 @@ nvme_write_uio(struct nvme_namespace *ns, struct uio *uio)
lba = uio->uio_offset / nvme_ns_get_sector_size(ns);
*(uint64_t *)&cmd->cdw10 = lba;
/*
* Store the sector size in cdw12 (where the LBA count normally goes).
* We'll adjust cdw12 in the map_uio callback based on the mapsize
* parameter. This allows us to not have to store the namespace
* in the request simply to get the sector size in the map_uio
* callback.
*/
cmd->cdw12 = nvme_ns_get_sector_size(ns);
cmd->cdw12 = (iosize / nvme_ns_get_sector_size(ns))-1;
err = bus_dmamap_load_uio(tr->qpair->dma_tag, tr->payload_dma_map, uio,
nvme_payload_map_uio, tr, 0);
KASSERT(err == 0, ("bus_dmamap_load_uio returned non-zero!\n"));
nvme_ctrlr_submit_io_request(ns->ctrlr, req);
return (0);
}
@@ -154,9 +130,11 @@ nvme_write_uio(struct nvme_namespace *ns, struct uio *uio)
int
nvme_ns_physio(struct cdev *dev, struct uio *uio, int ioflag)
{
struct uio uio_tmp;
struct iovec uio_iov_tmp;
struct nvme_namespace *ns;
struct mtx *mtx;
int err;
int i, nvme_err, physio_err = 0;
#if __FreeBSD_version > 900017
int ref;
#endif
@@ -164,7 +142,7 @@ nvme_ns_physio(struct cdev *dev, struct uio *uio, int ioflag)
PHOLD(curproc);
ns = dev->si_drv1;
mtx = mtx_pool_find(mtxpool_sleep, uio);
mtx = mtx_pool_find(mtxpool_sleep, &uio_tmp);
#if __FreeBSD_version > 900017
dev_refthread(dev, &ref);
@@ -172,15 +150,48 @@ nvme_ns_physio(struct cdev *dev, struct uio *uio, int ioflag)
dev_refthread(dev);
#endif
mtx_lock(mtx);
if (uio->uio_rw == UIO_READ)
err = nvme_read_uio(ns, uio);
else
err = nvme_write_uio(ns, uio);
/*
* NVM Express doesn't really support true SGLs. All SG elements
* must be PAGE_SIZE, except for the first and last element.
* Because of this, we need to break up each iovec into a separate
* NVMe command - otherwise we could end up with sub-PAGE_SIZE
* elements in the middle of an SGL which is not allowed.
*/
uio_tmp.uio_iov = &uio_iov_tmp;
uio_tmp.uio_iovcnt = 1;
uio_tmp.uio_offset = uio->uio_offset;
uio_tmp.uio_segflg = uio->uio_segflg;
uio_tmp.uio_rw = uio->uio_rw;
uio_tmp.uio_td = uio->uio_td;
if (err == 0)
msleep(uio, mtx, PRIBIO, "nvme_physio", 0);
mtx_unlock(mtx);
for (i = 0; i < uio->uio_iovcnt; i++) {
uio_iov_tmp.iov_base = uio->uio_iov[i].iov_base;
uio_iov_tmp.iov_len = uio->uio_iov[i].iov_len;
uio_tmp.uio_resid = uio_iov_tmp.iov_len;
mtx_lock(mtx);
if (uio->uio_rw == UIO_READ)
nvme_err = nvme_read_uio(ns, &uio_tmp);
else
nvme_err = nvme_write_uio(ns, &uio_tmp);
if (nvme_err == 0)
msleep(&uio_tmp, mtx, PRIBIO, "nvme_physio", 0);
mtx_unlock(mtx);
if (uio_tmp.uio_resid == 0) {
uio->uio_resid -= uio_iov_tmp.iov_len;
uio->uio_offset += uio_iov_tmp.iov_len;
} else {
physio_err = EFAULT;
break;
}
uio_tmp.uio_offset += uio_iov_tmp.iov_len;
}
#if __FreeBSD_version > 900017
dev_relthread(dev, ref);
@@ -188,9 +199,6 @@ nvme_ns_physio(struct cdev *dev, struct uio *uio, int ioflag)
dev_relthread(dev);
#endif
if (err == 0)
uio->uio_resid = 0;
PRELE(curproc);
return (0);
return (physio_err);
}
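
One detail worth spelling out (editor's note, not part of the commit): the NVMe read/write command encodes the transfer length in cdw12 as a zero-based count of logical blocks, which is why nvme_read_uio() and nvme_write_uio() above now compute cdw12 directly from the summed iovec length. A worked example with illustrative values:

        uint32_t sector_size = 512;     /* nvme_ns_get_sector_size(ns) */
        uint64_t iosize = 8192;         /* total bytes in the uio's iovecs */
        uint32_t cdw12 = (iosize / sector_size) - 1;    /* 16 blocks -> NLB field of 15 */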