rdma: replace improperly aligned buffers in requests.

It is a very rare thing for a buffer to be split over two memory
regions. In fact, it is only possible in dpdk versions where
--match-allocations is not passed as a startup parameter to dpdk but
dynamic memory allocation is enabled.

By adding a small helper function, we avoid failing an I/O because it
was assigned one of these improperly aligned buffers. Also, we try to
remove the buffer from circulation so that it doesn't get picked up
again by another request.

Also, add a unit test to catch this case.

Change-Id: Ia09865c2f77160a960571665b29c4533b11758ae
Signed-off-by: Seth Howell <seth.howell@intel.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/467446
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Paul Luse <paul.e.luse@intel.com>
This commit is contained in:
Seth Howell 2019-09-04 13:28:25 -07:00 committed by Ben Walker
parent 98233769f4
commit 8126509c4f
2 changed files with 93 additions and 9 deletions

View File

@ -450,11 +450,19 @@ struct spdk_nvmf_rdma_poll_group_stat {
}; };
struct spdk_nvmf_rdma_poll_group { struct spdk_nvmf_rdma_poll_group {
struct spdk_nvmf_transport_poll_group group; struct spdk_nvmf_transport_poll_group group;
TAILQ_HEAD(, spdk_nvmf_rdma_poller) pollers; /* Requests that are waiting to obtain a data buffer */
struct spdk_nvmf_rdma_poll_group_stat stat; TAILQ_HEAD(, spdk_nvmf_rdma_poller) pollers;
struct spdk_nvmf_rdma_poll_group_stat stat;
/*
* buffers which are split across multiple RDMA
* memory regions cannot be used by this transport.
*/
STAILQ_HEAD(, spdk_nvmf_transport_pg_cache_buf) retired_bufs;
}; };
/* Assuming rdma_cm uses just one protection domain per ibv_context. */ /* Assuming rdma_cm uses just one protection domain per ibv_context. */
@ -1456,6 +1464,34 @@ nvmf_request_alloc_wrs(struct spdk_nvmf_rdma_transport *rtransport,
return 0; return 0;
} }
/* This function is used in the rare case that we have a buffer split over multiple memory regions. */
static int
nvmf_rdma_replace_buffer(struct spdk_nvmf_rdma_poll_group *rgroup, void **buf)
{
struct spdk_nvmf_transport_poll_group *group = &rgroup->group;
struct spdk_nvmf_transport *transport = group->transport;
struct spdk_nvmf_transport_pg_cache_buf *old_buf;
void *new_buf;
if (!(STAILQ_EMPTY(&group->buf_cache))) {
group->buf_cache_count--;
new_buf = STAILQ_FIRST(&group->buf_cache);
STAILQ_REMOVE_HEAD(&group->buf_cache, link);
assert(*buf != NULL);
} else {
new_buf = spdk_mempool_get(transport->data_buf_pool);
}
if (*buf == NULL) {
return -ENOMEM;
}
old_buf = *buf;
STAILQ_INSERT_HEAD(&rgroup->retired_bufs, old_buf, link);
*buf = new_buf;
return 0;
}
static int static int
nvmf_rdma_fill_buffers(struct spdk_nvmf_rdma_transport *rtransport, nvmf_rdma_fill_buffers(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvmf_rdma_poll_group *rgroup, struct spdk_nvmf_rdma_poll_group *rgroup,
@ -1483,9 +1519,13 @@ nvmf_rdma_fill_buffers(struct spdk_nvmf_rdma_transport *rtransport,
(uint64_t)req->iov[req->iovcnt].iov_base, &translation_len); (uint64_t)req->iov[req->iovcnt].iov_base, &translation_len);
} }
if (translation_len < req->iov[req->iovcnt].iov_len) { /* This is a very rare case that can occur when using DPDK version < 19.05 */
SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions\n"); if (spdk_unlikely(translation_len < req->iov[req->iovcnt].iov_len)) {
return -EINVAL; SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions. Removing it from circulation.\n");
if (nvmf_rdma_replace_buffer(rgroup, &req->buffers[req->iovcnt]) == -ENOMEM) {
return -ENOMEM;
}
continue;
} }
length -= req->iov[req->iovcnt].iov_len; length -= req->iov[req->iovcnt].iov_len;
@ -2930,6 +2970,7 @@ spdk_nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport)
} }
TAILQ_INIT(&rgroup->pollers); TAILQ_INIT(&rgroup->pollers);
STAILQ_INIT(&rgroup->retired_bufs);
pthread_mutex_lock(&rtransport->lock); pthread_mutex_lock(&rtransport->lock);
TAILQ_FOREACH(device, &rtransport->devices, link) { TAILQ_FOREACH(device, &rtransport->devices, link) {
@ -3012,6 +3053,7 @@ spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
struct spdk_nvmf_rdma_poll_group *rgroup; struct spdk_nvmf_rdma_poll_group *rgroup;
struct spdk_nvmf_rdma_poller *poller, *tmp; struct spdk_nvmf_rdma_poller *poller, *tmp;
struct spdk_nvmf_rdma_qpair *qpair, *tmp_qpair; struct spdk_nvmf_rdma_qpair *qpair, *tmp_qpair;
struct spdk_nvmf_transport_pg_cache_buf *buf, *tmp_buf;
rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
@ -3019,6 +3061,12 @@ spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
return; return;
} }
/* free all retired buffers back to the transport so we don't short the mempool. */
STAILQ_FOREACH_SAFE(buf, &rgroup->retired_bufs, link, tmp_buf) {
STAILQ_REMOVE(&rgroup->retired_bufs, buf, spdk_nvmf_transport_pg_cache_buf, link);
spdk_mempool_put(group->transport->data_buf_pool, buf);
}
TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) { TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) {
TAILQ_REMOVE(&rgroup->pollers, poller, link); TAILQ_REMOVE(&rgroup->pollers, poller, link);

View File

@ -37,6 +37,7 @@
#include "nvmf/rdma.c" #include "nvmf/rdma.c"
uint64_t g_mr_size; uint64_t g_mr_size;
uint64_t g_mr_next_size;
struct ibv_mr g_rdma_mr; struct ibv_mr g_rdma_mr;
#define RDMA_UT_UNITS_IN_MAX_IO 16 #define RDMA_UT_UNITS_IN_MAX_IO 16
@ -135,6 +136,9 @@ spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t
{ {
if (g_mr_size != 0) { if (g_mr_size != 0) {
*(uint32_t *)size = g_mr_size; *(uint32_t *)size = g_mr_size;
if (g_mr_next_size != 0) {
g_mr_size = g_mr_next_size;
}
} }
return (uint64_t)&g_rdma_mr; return (uint64_t)&g_rdma_mr;
@ -177,12 +181,16 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
struct spdk_nvmf_transport_pg_cache_buf bufs[4]; struct spdk_nvmf_transport_pg_cache_buf bufs[4];
struct spdk_nvme_sgl_descriptor sgl_desc[SPDK_NVMF_MAX_SGL_ENTRIES] = {{0}}; struct spdk_nvme_sgl_descriptor sgl_desc[SPDK_NVMF_MAX_SGL_ENTRIES] = {{0}};
struct spdk_nvmf_rdma_request_data data; struct spdk_nvmf_rdma_request_data data;
struct spdk_nvmf_transport_pg_cache_buf buffer;
struct spdk_nvmf_transport_pg_cache_buf *buffer_ptr;
int rc, i; int rc, i;
data.wr.sg_list = data.sgl; data.wr.sg_list = data.sgl;
STAILQ_INIT(&group.group.buf_cache); STAILQ_INIT(&group.group.buf_cache);
group.group.buf_cache_size = 0; group.group.buf_cache_size = 0;
group.group.buf_cache_count = 0; group.group.buf_cache_count = 0;
group.group.transport = &rtransport.transport;
STAILQ_INIT(&group.retired_bufs);
poller.group = &group; poller.group = &group;
rqpair.poller = &poller; rqpair.poller = &poller;
rqpair.max_send_sge = SPDK_NVMF_MAX_SGL_ENTRIES; rqpair.max_send_sge = SPDK_NVMF_MAX_SGL_ENTRIES;
@ -268,7 +276,6 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
CU_ASSERT(rdma_req.data.wr.sg_list[0].length == 0); CU_ASSERT(rdma_req.data.wr.sg_list[0].length == 0);
CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == 0); CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == 0);
rdma_req.recv->buf = (void *)0xDDDD; rdma_req.recv->buf = (void *)0xDDDD;
/* Test 2: sgl type: keyed data block subtype: offset (in capsule data) */ /* Test 2: sgl type: keyed data block subtype: offset (in capsule data) */
sgl->generic.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; sgl->generic.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
@ -408,7 +415,6 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
} }
/* part 1: use the four buffers from the pg cache */ /* part 1: use the four buffers from the pg cache */
group.group.buf_cache_size = 4; group.group.buf_cache_size = 4;
group.group.buf_cache_count = 4; group.group.buf_cache_count = 4;
MOCK_SET(spdk_mempool_get, (void *)0x2000); MOCK_SET(spdk_mempool_get, (void *)0x2000);
@ -432,8 +438,8 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
~NVMF_DATA_BUFFER_MASK)); ~NVMF_DATA_BUFFER_MASK));
CU_ASSERT(rdma_req.data.wr.sg_list[i].length == rtransport.transport.opts.io_unit_size); CU_ASSERT(rdma_req.data.wr.sg_list[i].length == rtransport.transport.opts.io_unit_size);
} }
/* part 2: now that we have used the buffers from the cache, try again. We should get mempool buffers. */
/* part 2: now that we have used the buffers from the cache, try again. We should get mempool buffers. */
reset_nvmf_rdma_request(&rdma_req); reset_nvmf_rdma_request(&rdma_req);
rc = spdk_nvmf_rdma_request_parse_sgl(&rtransport, &device, &rdma_req); rc = spdk_nvmf_rdma_request_parse_sgl(&rtransport, &device, &rdma_req);
@ -482,6 +488,36 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == 0x2000); CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == 0x2000);
CU_ASSERT(rdma_req.data.wr.sg_list[i].length == rtransport.transport.opts.io_unit_size); CU_ASSERT(rdma_req.data.wr.sg_list[i].length == rtransport.transport.opts.io_unit_size);
} }
reset_nvmf_rdma_request(&rdma_req);
/* Test 5 dealing with a buffer split over two Memory Regions */
MOCK_SET(spdk_mempool_get, (void *)&buffer);
sgl->generic.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
sgl->keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
sgl->keyed.length = rtransport.transport.opts.io_unit_size / 2;
g_mr_size = rtransport.transport.opts.io_unit_size / 4;
g_mr_next_size = rtransport.transport.opts.io_unit_size / 2;
rc = spdk_nvmf_rdma_request_parse_sgl(&rtransport, &device, &rdma_req);
SPDK_CU_ASSERT_FATAL(rc == 0);
CU_ASSERT(rdma_req.req.data_from_pool == true);
CU_ASSERT(rdma_req.req.length == rtransport.transport.opts.io_unit_size / 2);
CU_ASSERT((uint64_t)rdma_req.req.data == (((uint64_t)&buffer + NVMF_DATA_BUFFER_MASK) &
~NVMF_DATA_BUFFER_MASK));
CU_ASSERT(rdma_req.data.wr.num_sge == 1);
CU_ASSERT(rdma_req.data.wr.wr.rdma.rkey == 0xEEEE);
CU_ASSERT(rdma_req.data.wr.wr.rdma.remote_addr == 0xFFFF);
CU_ASSERT(rdma_req.req.buffers[0] == &buffer);
CU_ASSERT(rdma_req.data.wr.sg_list[0].addr == (((uint64_t)&buffer + NVMF_DATA_BUFFER_MASK) &
~NVMF_DATA_BUFFER_MASK));
CU_ASSERT(rdma_req.data.wr.sg_list[0].length == rtransport.transport.opts.io_unit_size / 2);
CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == g_rdma_mr.lkey);
buffer_ptr = STAILQ_FIRST(&group.retired_bufs);
CU_ASSERT(buffer_ptr == &buffer);
STAILQ_REMOVE(&group.retired_bufs, buffer_ptr, spdk_nvmf_transport_pg_cache_buf, link);
CU_ASSERT(STAILQ_EMPTY(&group.retired_bufs));
reset_nvmf_rdma_request(&rdma_req);
} }
static struct spdk_nvmf_rdma_recv * static struct spdk_nvmf_rdma_recv *