nvmf/rdma: Use contig buffers for multi SGL payload

Currently we allocate a buffer per SGL descriptor. That can
lead to a problem when we use an NVMe bdev with a PRP
controller and the length of the 1st SGL descriptor is not a
multiple of the block size, i.e. the initiator may send PRP1
(which is SGL[0]) whose end address is page aligned while its
start address is not. This is allowed by the spec. But when we
read such data into a local buffer, the start of the buffer is
page aligned while its end is not. That violates the PRP
requirements and we can't handle such a request. However, if we
use a contiguous buffer to write both PRP1 and PRP2 (SGL[0] and
SGL[1]), we avoid this problem.
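
For illustration only, a minimal standalone C sketch of the
argument above (the helper iovs_expressible_as_prp, the constant
PRP_PAGE_SIZE and the buffers are hypothetical names, not SPDK
code): with one buffer per descriptor the payload boundary lands
in the middle of a page, which a PRP list cannot describe, while
a single contiguous buffer always can.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/uio.h>

#define PRP_PAGE_SIZE 4096u

/* PRP rules: only the first entry may start at a non-zero offset
 * into a page; every later entry must start on a page boundary and
 * only the last entry may end off a page boundary. */
static bool
iovs_expressible_as_prp(const struct iovec *iov, int iovcnt)
{
    for (int i = 0; i < iovcnt; i++) {
        uintptr_t start = (uintptr_t)iov[i].iov_base;
        uintptr_t end = start + iov[i].iov_len;

        if (i > 0 && (start & (PRP_PAGE_SIZE - 1)) != 0) {
            return false;
        }
        if (i < iovcnt - 1 && (end & (PRP_PAGE_SIZE - 1)) != 0) {
            return false;
        }
    }
    return true;
}

int
main(void)
{
    /* Host sends SGL[0] of 3584 bytes (its end is page aligned, its
     * start is not) followed by SGL[1] of 4096 bytes. */
    static char buf0[2 * PRP_PAGE_SIZE] __attribute__((aligned(PRP_PAGE_SIZE)));
    static char buf1[PRP_PAGE_SIZE] __attribute__((aligned(PRP_PAGE_SIZE)));

    /* One local buffer per descriptor: each starts page aligned, so
     * the first one ends mid-page and no valid PRP list exists. */
    struct iovec per_desc[2] = {
        { .iov_base = buf0, .iov_len = 3584 },
        { .iov_base = buf1, .iov_len = PRP_PAGE_SIZE },
    };
    /* One contiguous buffer for the whole payload: PRP1 = buf0,
     * PRP2 = buf0 + 4096, which is valid. */
    struct iovec contig[1] = {
        { .iov_base = buf0, .iov_len = 3584 + PRP_PAGE_SIZE },
    };

    printf("per-descriptor buffers PRP-able: %d\n", iovs_expressible_as_prp(per_desc, 2)); /* 0 */
    printf("contiguous buffer PRP-able:      %d\n", iovs_expressible_as_prp(contig, 1));   /* 1 */
    return 0;
}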

Some existing unit tests were updated and one new test was added.

Fixes github issue #1853

Change-Id: Ib2d56112b7b25e235d17bbc6df8dce4dc556e12d
Signed-off-by: Alexey Marchuk <alexeymar@mellanox.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/7259
Community-CI: Broadcom CI
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Authored by Alexey Marchuk on 2021-04-02 08:46:40 +03:00, committed by Tomasz Zawadzki
parent c07a6a949b
commit 019a5361a5
2 changed files with 94 additions and 24 deletions

View File

@@ -267,6 +267,9 @@ struct spdk_nvmf_rdma_request {
enum spdk_nvmf_rdma_request_state state;
/* Data offset in req.iov */
uint32_t offset;
struct spdk_nvmf_rdma_recv *recv;
struct {
@@ -1440,13 +1443,12 @@ nvmf_rdma_fill_wr_sgl(struct spdk_nvmf_rdma_poll_group *rgroup,
uint32_t total_length,
uint32_t num_extra_wrs)
{
struct spdk_nvmf_request *req = &rdma_req->req;
struct spdk_rdma_memory_translation mem_translation;
struct spdk_dif_ctx *dif_ctx = NULL;
struct ibv_sge *sg_ele;
struct iovec *iov;
uint32_t remaining_data_block = 0;
uint32_t offset = 0, lkey, remaining;
uint32_t lkey, remaining;
int rc;
if (spdk_unlikely(rdma_req->req.dif.dif_insert_or_strip)) {
@@ -1457,7 +1459,7 @@ nvmf_rdma_fill_wr_sgl(struct spdk_nvmf_rdma_poll_group *rgroup,
wr->num_sge = 0;
while (total_length && (num_extra_wrs || wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES)) {
iov = &req->iov[rdma_req->iovpos];
iov = &rdma_req->req.iov[rdma_req->iovpos];
rc = spdk_rdma_get_translation(device->map, iov->iov_base, iov->iov_len, &mem_translation);
if (spdk_unlikely(rc)) {
return false;
@@ -1465,19 +1467,27 @@ nvmf_rdma_fill_wr_sgl(struct spdk_nvmf_rdma_poll_group *rgroup,
lkey = spdk_rdma_memory_translation_get_lkey(&mem_translation);
sg_ele = &wr->sg_list[wr->num_sge];
remaining = spdk_min((uint32_t)iov->iov_len - rdma_req->offset, total_length);
if (spdk_likely(!dif_ctx)) {
sg_ele->lkey = lkey;
sg_ele->addr = (uintptr_t)(iov->iov_base);
sg_ele->length = (uint32_t)iov->iov_len;
sg_ele->addr = (uintptr_t)iov->iov_base + rdma_req->offset;
sg_ele->length = remaining;
SPDK_DEBUGLOG(rdma, "sge[%d] %p addr 0x%"PRIx64", len %u\n", wr->num_sge, sg_ele, sg_ele->addr,
sg_ele->length);
rdma_req->offset += sg_ele->length;
total_length -= sg_ele->length;
wr->num_sge++;
if (rdma_req->offset == iov->iov_len) {
rdma_req->offset = 0;
rdma_req->iovpos++;
}
} else {
uint32_t data_block_size = dif_ctx->block_size - dif_ctx->md_size;
uint32_t md_size = dif_ctx->md_size;
uint32_t sge_len;
remaining = (uint32_t)iov->iov_len - offset;
while (remaining) {
if (wr->num_sge >= SPDK_NVMF_MAX_SGL_ENTRIES) {
if (num_extra_wrs > 0 && wr->next) {
@@ -1490,19 +1500,23 @@ nvmf_rdma_fill_wr_sgl(struct spdk_nvmf_rdma_poll_group *rgroup,
}
}
sg_ele->lkey = lkey;
sg_ele->addr = (uintptr_t)((char *)iov->iov_base + offset);
sg_ele->addr = (uintptr_t)((char *)iov->iov_base + rdma_req->offset);
sge_len = spdk_min(remaining, remaining_data_block);
sg_ele->length = sge_len;
SPDK_DEBUGLOG(rdma, "sge[%d] %p addr 0x%"PRIx64", len %u\n", wr->num_sge, sg_ele, sg_ele->addr,
sg_ele->length);
remaining -= sge_len;
remaining_data_block -= sge_len;
offset += sge_len;
rdma_req->offset += sge_len;
total_length -= sge_len;
sg_ele++;
wr->num_sge++;
if (remaining_data_block == 0) {
/* skip metadata */
offset += md_size;
rdma_req->offset += md_size;
total_length -= md_size;
/* Metadata that do not fit this IO buffer will be included in the next IO buffer */
remaining -= spdk_min(remaining, md_size);
remaining_data_block = data_block_size;
@@ -1511,13 +1525,11 @@ nvmf_rdma_fill_wr_sgl(struct spdk_nvmf_rdma_poll_group *rgroup,
if (remaining == 0) {
/* By subtracting the size of the last IOV from the offset, we ensure that we skip
the remaining metadata bits at the beginning of the next buffer */
offset -= iov->iov_len;
rdma_req->offset -= spdk_min(iov->iov_len, rdma_req->offset);
rdma_req->iovpos++;
}
}
}
total_length -= req->iov[rdma_req->iovpos].iov_len;
rdma_req->iovpos++;
}
if (total_length) {
@@ -1621,7 +1633,7 @@ nvmf_rdma_request_fill_iovs_multi_sgl(struct spdk_nvmf_rdma_transport *rtranspor
struct spdk_nvmf_request *req = &rdma_req->req;
struct spdk_nvme_sgl_descriptor *inline_segment, *desc;
uint32_t num_sgl_descriptors;
uint32_t lengths[SPDK_NVMF_MAX_SGL_ENTRIES];
uint32_t lengths[SPDK_NVMF_MAX_SGL_ENTRIES], total_length = 0;
uint32_t i;
int rc;
@@ -1635,10 +1647,6 @@ nvmf_rdma_request_fill_iovs_multi_sgl(struct spdk_nvmf_rdma_transport *rtranspor
num_sgl_descriptors = inline_segment->unkeyed.length / sizeof(struct spdk_nvme_sgl_descriptor);
assert(num_sgl_descriptors <= SPDK_NVMF_MAX_SGL_ENTRIES);
if (nvmf_request_alloc_wrs(rtransport, rdma_req, num_sgl_descriptors - 1) != 0) {
return -ENOMEM;
}
desc = (struct spdk_nvme_sgl_descriptor *)rdma_req->recv->buf + inline_segment->address;
for (i = 0; i < num_sgl_descriptors; i++) {
if (spdk_likely(!req->dif.dif_insert_or_strip)) {
@@ -1648,11 +1656,22 @@ nvmf_rdma_request_fill_iovs_multi_sgl(struct spdk_nvmf_rdma_transport *rtranspor
lengths[i] = spdk_dif_get_length_with_md(desc->keyed.length, &req->dif.dif_ctx);
req->dif.elba_length += lengths[i];
}
total_length += lengths[i];
desc++;
}
rc = spdk_nvmf_request_get_buffers_multi(req, &rgroup->group, &rtransport->transport,
lengths, num_sgl_descriptors);
if (total_length > rtransport->transport.opts.max_io_size) {
SPDK_ERRLOG("Multi SGL length 0x%x exceeds max io size 0x%x\n",
total_length, rtransport->transport.opts.max_io_size);
req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
return -EINVAL;
}
if (nvmf_request_alloc_wrs(rtransport, rdma_req, num_sgl_descriptors - 1) != 0) {
return -ENOMEM;
}
rc = spdk_nvmf_request_get_buffers(req, &rgroup->group, &rtransport->transport, total_length);
if (rc != 0) {
nvmf_rdma_request_free_data(rdma_req, rtransport);
return rc;
@@ -1673,8 +1692,6 @@ nvmf_rdma_request_fill_iovs_multi_sgl(struct spdk_nvmf_rdma_transport *rtranspor
goto err_exit;
}
current_wr->num_sge = 0;
rc = nvmf_rdma_fill_wr_sgl(rgroup, device, rdma_req, current_wr, lengths[i], 0);
if (rc != 0) {
rc = -ENOMEM;
@@ -1851,6 +1868,7 @@ _nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req,
rdma_req->req.data = NULL;
rdma_req->rsp.wr.next = NULL;
rdma_req->data.wr.next = NULL;
rdma_req->offset = 0;
memset(&rdma_req->req.dif, 0, sizeof(rdma_req->req.dif));
rqpair->qd--;

View File

@@ -132,6 +132,7 @@ static void reset_nvmf_rdma_request(struct spdk_nvmf_rdma_request *rdma_req)
rdma_req->data.wr.num_sge = 0;
rdma_req->data.wr.wr.rdma.remote_addr = 0;
rdma_req->data.wr.wr.rdma.rkey = 0;
rdma_req->offset = 0;
memset(&rdma_req->req.dif, 0, sizeof(rdma_req->req.dif));
for (i = 0; i < SPDK_NVMF_MAX_SGL_ENTRIES; i++) {
@@ -162,6 +163,8 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
struct spdk_nvme_sgl_descriptor sgl_desc[SPDK_NVMF_MAX_SGL_ENTRIES] = {{0}};
struct spdk_nvmf_rdma_request_data data;
int rc, i;
uint32_t sgl_length;
uintptr_t aligned_buffer_address;
data.wr.sg_list = data.sgl;
STAILQ_INIT(&group.group.buf_cache);
@@ -362,7 +365,7 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
CU_ASSERT(rc == 0);
CU_ASSERT(rdma_req.req.data_from_pool == true);
CU_ASSERT(rdma_req.req.length == rtransport.transport.opts.io_unit_size * 16);
CU_ASSERT(rdma_req.req.iovcnt == 17);
CU_ASSERT(rdma_req.req.iovcnt == 16);
CU_ASSERT(rdma_req.data.wr.num_sge == 16);
for (i = 0; i < 15; i++) {
CU_ASSERT(rdma_req.data.sgl[i].length == rtransport.transport.opts.io_unit_size);
@@ -378,6 +381,39 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
CU_ASSERT(data.wr.num_sge == 1);
CU_ASSERT(data.wr.next == &rdma_req.rsp.wr);
/* part 4: 2 SGL descriptors, each length is transport buffer / 2
* 1 transport buffers should be allocated */
reset_nvmf_rdma_request(&rdma_req);
aligned_buffer_address = ((uintptr_t)(&data) + NVMF_DATA_BUFFER_MASK) & ~NVMF_DATA_BUFFER_MASK;
sgl->unkeyed.length = 2 * sizeof(struct spdk_nvme_sgl_descriptor);
sgl_length = rtransport.transport.opts.io_unit_size / 2;
for (i = 0; i < 2; i++) {
sgl_desc[i].keyed.length = sgl_length;
sgl_desc[i].address = 0x4000 + i * sgl_length;
}
rc = nvmf_rdma_request_parse_sgl(&rtransport, &device, &rdma_req);
CU_ASSERT(rc == 0);
CU_ASSERT(rdma_req.req.data_from_pool == true);
CU_ASSERT(rdma_req.req.length == rtransport.transport.opts.io_unit_size);
CU_ASSERT(rdma_req.req.iovcnt == 1);
CU_ASSERT(rdma_req.data.sgl[0].length == sgl_length);
/* We mocked mempool_get to return address of data variable. Mempool is used
* to get both additional WRs and data buffers, so data points to &data */
CU_ASSERT(rdma_req.data.sgl[0].addr == aligned_buffer_address);
CU_ASSERT(rdma_req.data.wr.wr.rdma.rkey == 0x44);
CU_ASSERT(rdma_req.data.wr.wr.rdma.remote_addr == 0x4000);
CU_ASSERT(rdma_req.data.wr.num_sge == 1);
CU_ASSERT(rdma_req.data.wr.next == &data.wr);
CU_ASSERT(data.wr.wr.rdma.rkey == 0x44);
CU_ASSERT(data.wr.wr.rdma.remote_addr == 0x4000 + sgl_length);
CU_ASSERT(data.sgl[0].length == sgl_length);
CU_ASSERT(data.sgl[0].addr == aligned_buffer_address + sgl_length);
CU_ASSERT(data.wr.num_sge == 1);
/* Test 4: use PG buffer cache */
sgl->generic.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
sgl->keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
@@ -1106,10 +1142,26 @@ test_spdk_nvmf_rdma_request_parse_sgl_with_md(void)
CU_ASSERT(rdma_req.data.wr.num_sge == 16);
CU_ASSERT(rdma_req.data.wr.wr.rdma.rkey == 0xEEEE);
CU_ASSERT(rdma_req.data.wr.wr.rdma.remote_addr == 0xFFFF);
for (i = 0; i < 15; ++i) {
CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == (uintptr_t)aligned_buffer + i * (data_bs + md_size));
CU_ASSERT(rdma_req.data.wr.sg_list[i].length == data_bs);
CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == RDMA_UT_LKEY);
}
/* 8192 - (512 + 8) * 15 = 392 */
CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == (uintptr_t)aligned_buffer + i * (data_bs + md_size));
CU_ASSERT(rdma_req.data.wr.sg_list[i].length == 392);
CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == RDMA_UT_LKEY);
/* additional wr from pool */
CU_ASSERT(rdma_req.data.wr.next == (void *)&data2->wr);
CU_ASSERT(rdma_req.data.wr.next->num_sge == 1);
CU_ASSERT(rdma_req.data.wr.next->next == &rdma_req.rsp.wr);
/* 2nd IO buffer */
CU_ASSERT(data2->wr.sg_list[0].addr == (uintptr_t)aligned_buffer);
CU_ASSERT(data2->wr.sg_list[0].length == 120);
CU_ASSERT(data2->wr.sg_list[0].lkey == RDMA_UT_LKEY);
/* Part 8: simple I/O, data with metadata do not fit to 1 io_buffer */
MOCK_SET(spdk_mempool_get, (void *)0x2000);