Clean up uio-related code to use struct nvme_request and nvme_ctrlr_submit_io_request().

While here, also fix the case where a uio has more than one iovec.
NVMe's definition of SGEs (called PRPs) only allows the first SGE to
start on a non-page boundary.  The simplest way to handle this is to
construct a temporary uio for each iovec and submit an NVMe request
for each.

Sponsored by:	Intel
jimharris 2012-10-18 00:40:40 +00:00
parent c9e224f9c9
commit 9becceac55
5 changed files with 118 additions and 86 deletions
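
To make the PRP constraint in the commit message concrete: a PRP-based NVMe command can only describe a buffer whose pieces, after the first, all begin on page boundaries (and, except for the last, also end on them). The sketch below is an editor's illustration, not part of the commit; the helper name and EXAMPLE_PAGE_SIZE are made up for the example. Any iovec list that fails this check has to be split into one command per iovec, which is what the new loop in nvme_ns_physio (last file below) does.

#include <stdbool.h>
#include <stdint.h>
#include <sys/uio.h>

#define EXAMPLE_PAGE_SIZE	4096	/* illustrative; the driver uses PAGE_SIZE */

/*
 * Editor's sketch: returns true if the iovec list could be expressed as a
 * single PRP-based command.  Only the first element may start mid-page and
 * only the last element may end mid-page.
 */
static bool
iovecs_fit_single_prp_command(const struct iovec *iov, int iovcnt)
{
        uintptr_t addr;
        int i;

        for (i = 0; i < iovcnt; i++) {
                addr = (uintptr_t)iov[i].iov_base;
                if (i > 0 && (addr & (EXAMPLE_PAGE_SIZE - 1)) != 0)
                        return (false);
                if (i < iovcnt - 1 &&
                    ((addr + iov[i].iov_len) & (EXAMPLE_PAGE_SIZE - 1)) != 0)
                        return (false);
        }
        return (true);
}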


@@ -230,15 +230,11 @@ nvme_dump_completion(struct nvme_completion *cpl)
void
nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
{
struct nvme_tracker *tr;
struct nvme_qpair *qpair;
struct nvme_tracker *tr = arg;
uint32_t cur_nseg;
KASSERT(error == 0, ("nvme_payload_map error != 0\n"));
tr = (struct nvme_tracker *)arg;
qpair = tr->qpair;
/*
* Note that we specified PAGE_SIZE for alignment and max
* segment size when creating the bus dma tags. So here
@@ -259,7 +255,7 @@ nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
}
}
nvme_qpair_submit_cmd(qpair, tr);
nvme_qpair_submit_cmd(tr->qpair, tr);
}
static int


@@ -833,12 +833,21 @@ nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
tr->req = req;
if (req->payload_size > 0) {
err = bus_dmamap_load(tr->qpair->dma_tag, tr->payload_dma_map,
req->payload, req->payload_size,
nvme_payload_map, tr, 0);
if (req->uio == NULL) {
if (req->payload_size > 0) {
err = bus_dmamap_load(tr->qpair->dma_tag,
tr->payload_dma_map, req->payload,
req->payload_size,
nvme_payload_map, tr, 0);
if (err != 0)
panic("bus_dmamap_load returned non-zero!\n");
} else
nvme_qpair_submit_cmd(tr->qpair, tr);
} else {
err = bus_dmamap_load_uio(tr->qpair->dma_tag,
tr->payload_dma_map, req->uio,
nvme_payload_map_uio, tr, 0);
if (err != 0)
panic("bus_dmamap_load returned non-zero!\n");
} else
nvme_qpair_submit_cmd(tr->qpair, tr);
}
}


@@ -101,6 +101,7 @@ struct nvme_request {
struct nvme_command cmd;
void *payload;
uint32_t payload_size;
struct uio *uio;
nvme_cb_fn_t cb_fn;
void *cb_arg;
SLIST_ENTRY(nvme_request) slist;
@@ -333,6 +334,8 @@ void nvme_ctrlr_cmd_asynchronous_event_request(struct nvme_controller *ctrlr,
void nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg,
int error);
void nvme_payload_map_uio(void *arg, bus_dma_segment_t *seg, int nseg,
bus_size_t mapsize, int error);
int nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev);
int nvme_ctrlr_reset(struct nvme_controller *ctrlr);
@@ -392,6 +395,22 @@ nvme_allocate_request(void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn,
return (req);
}
static __inline struct nvme_request *
nvme_allocate_request_uio(struct uio *uio, nvme_cb_fn_t cb_fn, void *cb_arg)
{
struct nvme_request *req;
req = uma_zalloc(nvme_request_zone, M_NOWAIT | M_ZERO);
if (req == NULL)
return (NULL);
req->uio = uio;
req->cb_fn = cb_fn;
req->cb_arg = cb_arg;
return (req);
}
#define nvme_free_request(req) uma_zfree(nvme_request_zone, req)
#endif /* __NVME_PRIVATE_H__ */
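
For orientation, an editor's sketch (not part of the commit) of how the new allocator is meant to be used: pair nvme_allocate_request_uio() with nvme_ctrlr_submit_io_request(), which now performs the bus_dmamap_load_uio() when req->uio is set. The function name submit_uio_sketch is hypothetical; the calls it makes are the ones added by this commit.

static int
submit_uio_sketch(struct nvme_controller *ctrlr, struct uio *uio,
    nvme_cb_fn_t done_fn, void *done_arg)
{
        struct nvme_request *req;

        req = nvme_allocate_request_uio(uio, done_fn, done_arg);
        if (req == NULL)
                return (ENOMEM);

        /*
         * The caller fills in req->cmd here (opcode, nsid, starting LBA in
         * cdw10/11, zero-based block count in cdw12) before submitting.
         */
        nvme_ctrlr_submit_io_request(ctrlr, req);
        return (0);
}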


@@ -163,7 +163,7 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
/* nvme_qpair_submit_cmd() will release the lock. */
nvme_qpair_submit_cmd(qpair, tr);
else {
if (req->payload_size > 0)
if (req->payload_size > 0 || req->uio != NULL)
bus_dmamap_unload(qpair->dma_tag,
tr->payload_dma_map);


@@ -38,8 +38,10 @@ static void
nvme_uio_done(void *arg, const struct nvme_completion *status)
{
struct mtx *mtx;
struct uio *uio = arg;
/* TODO: update uio flags based on status */
if (status->sf_sc == 0 && status->sf_sct == 0)
uio->uio_resid = 0;
mtx = mtx_pool_find(mtxpool_sleep, arg);
mtx_lock(mtx);
@@ -47,33 +49,17 @@ nvme_uio_done(void *arg, const struct nvme_completion *status)
mtx_unlock(mtx);
}
static struct nvme_tracker *
nvme_allocate_tracker_uio(struct nvme_controller *ctrlr, struct uio *uio,
struct nvme_request *req)
{
struct nvme_tracker *tr;
struct nvme_qpair *qpair;
if (ctrlr->per_cpu_io_queues)
qpair = &ctrlr->ioq[curcpu];
else
qpair = &ctrlr->ioq[0];
tr = nvme_qpair_allocate_tracker(qpair);
if (tr == NULL)
return (NULL);
tr->qpair = qpair;
tr->req = req;
return (tr);
}
static void
void
nvme_payload_map_uio(void *arg, bus_dma_segment_t *seg, int nseg,
bus_size_t mapsize, int error)
{
struct nvme_tracker *tr = arg;
/*
* Now that we know the actual size of the uio, divide it by the
* sector size that we stored in cdw12.
*/
tr->req->cmd.cdw12 = (mapsize / tr->req->cmd.cdw12)-1;
nvme_payload_map(arg, seg, nseg, error);
}
@@ -81,20 +67,12 @@ static int
nvme_read_uio(struct nvme_namespace *ns, struct uio *uio)
{
struct nvme_request *req;
struct nvme_tracker *tr;
struct nvme_command *cmd;
int err, i;
uint64_t lba, iosize = 0;
uint64_t lba;
for (i = 0; i < uio->uio_iovcnt; i++) {
iosize += uio->uio_iov[i].iov_len;
}
req = nvme_allocate_request_uio(uio, nvme_uio_done, uio);
req = nvme_allocate_request(NULL, iosize, nvme_uio_done, uio);
tr = nvme_allocate_tracker_uio(ns->ctrlr, uio, req);
if (tr == NULL)
if (req == NULL)
return (ENOMEM);
cmd = &req->cmd;
@@ -103,13 +81,16 @@ nvme_read_uio(struct nvme_namespace *ns, struct uio *uio)
lba = uio->uio_offset / nvme_ns_get_sector_size(ns);
*(uint64_t *)&cmd->cdw10 = lba;
/*
* Store the sector size in cdw12 (where the LBA count normally goes).
* We'll adjust cdw12 in the map_uio callback based on the mapsize
* parameter. This allows us to not have to store the namespace
* in the request simply to get the sector size in the map_uio
* callback.
*/
cmd->cdw12 = nvme_ns_get_sector_size(ns);
cmd->cdw12 = (iosize / nvme_ns_get_sector_size(ns))-1;
err = bus_dmamap_load_uio(tr->qpair->dma_tag, tr->payload_dma_map, uio,
nvme_payload_map_uio, tr, 0);
KASSERT(err == 0, ("bus_dmamap_load_uio returned non-zero!\n"));
nvme_ctrlr_submit_io_request(ns->ctrlr, req);
return (0);
}
@@ -118,20 +99,12 @@ static int
nvme_write_uio(struct nvme_namespace *ns, struct uio *uio)
{
struct nvme_request *req;
struct nvme_tracker *tr;
struct nvme_command *cmd;
int err, i;
uint64_t lba, iosize = 0;
uint64_t lba;
for (i = 0; i < uio->uio_iovcnt; i++) {
iosize += uio->uio_iov[i].iov_len;
}
req = nvme_allocate_request_uio(uio, nvme_uio_done, uio);
req = nvme_allocate_request(NULL, iosize, nvme_uio_done, uio);
tr = nvme_allocate_tracker_uio(ns->ctrlr, uio, req);
if (tr == NULL)
if (req == NULL)
return (ENOMEM);
cmd = &req->cmd;
@@ -140,13 +113,16 @@ nvme_write_uio(struct nvme_namespace *ns, struct uio *uio)
lba = uio->uio_offset / nvme_ns_get_sector_size(ns);
*(uint64_t *)&cmd->cdw10 = lba;
/*
* Store the sector size in cdw12 (where the LBA count normally goes).
* We'll adjust cdw12 in the map_uio callback based on the mapsize
* parameter. This allows us to not have to store the namespace
* in the request simply to get the sector size in the map_uio
* callback.
*/
cmd->cdw12 = nvme_ns_get_sector_size(ns);
cmd->cdw12 = (iosize / nvme_ns_get_sector_size(ns))-1;
err = bus_dmamap_load_uio(tr->qpair->dma_tag, tr->payload_dma_map, uio,
nvme_payload_map_uio, tr, 0);
KASSERT(err == 0, ("bus_dmamap_load_uio returned non-zero!\n"));
nvme_ctrlr_submit_io_request(ns->ctrlr, req);
return (0);
}
@@ -154,9 +130,11 @@ nvme_write_uio(struct nvme_namespace *ns, struct uio *uio)
int
nvme_ns_physio(struct cdev *dev, struct uio *uio, int ioflag)
{
struct uio uio_tmp;
struct iovec uio_iov_tmp;
struct nvme_namespace *ns;
struct mtx *mtx;
int err;
int i, nvme_err, physio_err = 0;
#if __FreeBSD_version > 900017
int ref;
#endif
@@ -164,7 +142,7 @@ nvme_ns_physio(struct cdev *dev, struct uio *uio, int ioflag)
PHOLD(curproc);
ns = dev->si_drv1;
mtx = mtx_pool_find(mtxpool_sleep, uio);
mtx = mtx_pool_find(mtxpool_sleep, &uio_tmp);
#if __FreeBSD_version > 900017
dev_refthread(dev, &ref);
@@ -172,15 +150,48 @@ nvme_ns_physio(struct cdev *dev, struct uio *uio, int ioflag)
dev_refthread(dev);
#endif
mtx_lock(mtx);
if (uio->uio_rw == UIO_READ)
err = nvme_read_uio(ns, uio);
else
err = nvme_write_uio(ns, uio);
/*
* NVM Express doesn't really support true SGLs. All SG elements
* must be PAGE_SIZE, except for the first and last element.
* Because of this, we need to break up each iovec into a separate
* NVMe command - otherwise we could end up with sub-PAGE_SIZE
* elements in the middle of an SGL which is not allowed.
*/
uio_tmp.uio_iov = &uio_iov_tmp;
uio_tmp.uio_iovcnt = 1;
uio_tmp.uio_offset = uio->uio_offset;
uio_tmp.uio_segflg = uio->uio_segflg;
uio_tmp.uio_rw = uio->uio_rw;
uio_tmp.uio_td = uio->uio_td;
if (err == 0)
msleep(uio, mtx, PRIBIO, "nvme_physio", 0);
mtx_unlock(mtx);
for (i = 0; i < uio->uio_iovcnt; i++) {
uio_iov_tmp.iov_base = uio->uio_iov[i].iov_base;
uio_iov_tmp.iov_len = uio->uio_iov[i].iov_len;
uio_tmp.uio_resid = uio_iov_tmp.iov_len;
mtx_lock(mtx);
if (uio->uio_rw == UIO_READ)
nvme_err = nvme_read_uio(ns, &uio_tmp);
else
nvme_err = nvme_write_uio(ns, &uio_tmp);
if (nvme_err == 0)
msleep(&uio_tmp, mtx, PRIBIO, "nvme_physio", 0);
mtx_unlock(mtx);
if (uio_tmp.uio_resid == 0) {
uio->uio_resid -= uio_iov_tmp.iov_len;
uio->uio_offset += uio_iov_tmp.iov_len;
} else {
physio_err = EFAULT;
break;
}
uio_tmp.uio_offset += uio_iov_tmp.iov_len;
}
#if __FreeBSD_version > 900017
dev_relthread(dev, ref);
@@ -188,9 +199,6 @@ nvme_ns_physio(struct cdev *dev, struct uio *uio, int ioflag)
dev_relthread(dev);
#endif
if (err == 0)
uio->uio_resid = 0;
PRELE(curproc);
return (0);
return (physio_err);
}
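
One detail worth spelling out (editor's note, not part of the commit): the NVMe read/write command encodes the transfer length in cdw12 as a zero-based count of logical blocks, which is why nvme_read_uio() and nvme_write_uio() above now compute cdw12 directly from the summed iovec length. A worked example with illustrative values:

        uint32_t sector_size = 512;     /* nvme_ns_get_sector_size(ns) */
        uint64_t iosize = 8192;         /* total bytes in the uio's iovecs */
        uint32_t cdw12 = (iosize / sector_size) - 1;    /* 16 blocks -> NLB field of 15 */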