memory: forbid registering a memory region more than once

Removed the reference count from the registrations map.

Although technically supported, registering a single memory
region more than once had a lot of unhandled cases and could
easily lead to a segfault.

RDMA maps require all memory to be unregistered in the same
chunks the memory was registered, which is often impossible
to achieve if a region was registered more than once:

1. register region    0x0 - 0x3 -> it gets mapped to
                                   a single ibv_mr
2. register region    0x1 - 0x2 -> nothing happens, this region
                                   is already registered
3. unregister region  0x0 - 0x3 -> 0x0-0x1 gets unregistered as
                                   one region. 0x2-0x3 gets
                                   unregistered as another
                                   (leading to segfault in the
                                   the current RDMA implementation)

The problem is that the last two regions share the same ibv_mr,
which SPDK tries to free twice. The second free causes a segfault.
vtophys map handles this case by registering each 2MB chunk
separately, but this solution cannot be applied for RDMA, as
NICs put a limitation (~2048) on the number of regions registered.

Another option is to keep a refcount of each ibv_mr allocated,
and free it only when the entire region was unregistered from the
SPDK mem map. This is however very tricky and RDMAmojo mentions
that freeing a memory buffer before unregistering its ibv_mr
may lead to a segfault.

Change-Id: I545c56e24ffa55bda211dea22aeb8a55d9631fe5
Signed-off-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com>
Reviewed-on: https://review.gerrithub.io/426085
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Chandler-Test-Pool: SPDK Automated Test System <sys_sgsw@intel.com>
This commit is contained in:
Darek Stojaczyk 2018-09-19 13:36:22 +02:00 committed by Jim Harris
parent 6bcd929500
commit a33e0943b0
2 changed files with 46 additions and 55 deletions

View File

@ -57,8 +57,8 @@
#define MAP_256TB_IDX(vfn_2mb) ((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb) ((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
/* Low 16 bits for the reference count */
#define REG_MAP_REF_MASK 0xffff
/* Page is registered */
#define REG_MAP_REGISTERED (1ULL << 62)
/* A notification region barrier. The 2MB translation entry that's marked
* with this flag must be unregistered separately. This allows contiguous
@ -98,9 +98,8 @@ struct spdk_mem_map {
/* Registrations map. The 64 bit translations are bit fields with the
* following layout (starting with the low bits):
* 0 - 15 : reference count
* 16 - 62 : reserved
* 63 : flags
* 0 - 61 : reserved
* 62 - 63 : flags
*/
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(, spdk_mem_map) g_spdk_mem_maps = TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
@ -148,7 +147,7 @@ spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_acti
}
for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REF_MASK) > 0 &&
if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
(contig_start == UINT64_MAX ||
(map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
/* Rebuild the virtual address from the indexes */
@ -209,7 +208,7 @@ err_unregister:
}
for (; idx_1gb < UINT64_MAX; idx_1gb--) {
if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REF_MASK) > 0 &&
if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
(contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
/* Rebuild the virtual address from the indexes */
uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
@ -318,6 +317,7 @@ spdk_mem_register(void *vaddr, size_t len)
int rc;
void *seg_vaddr;
size_t seg_len;
uint64_t reg;
if ((uintptr_t)vaddr & ~MASK_256TB) {
DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
@ -330,45 +330,39 @@ spdk_mem_register(void *vaddr, size_t len)
return -EINVAL;
}
if (len == 0) {
return 0;
}
pthread_mutex_lock(&g_spdk_mem_map_mutex);
seg_vaddr = vaddr;
seg_len = len;
while (seg_len > 0) {
reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
if (reg & REG_MAP_REGISTERED) {
pthread_mutex_unlock(&g_spdk_mem_map_mutex);
return -EBUSY;
}
seg_vaddr += VALUE_2MB;
seg_len -= VALUE_2MB;
}
seg_vaddr = vaddr;
seg_len = 0;
while (len > 0) {
uint64_t reg;
reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
seg_len == 0 ? (reg + 1) | REG_MAP_NOTIFY_START : reg + 1);
if ((reg & REG_MAP_REF_MASK) > 0 || (reg & REG_MAP_NOTIFY_START)) {
if (seg_len > 0) {
TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
if (rc != 0) {
pthread_mutex_unlock(&g_spdk_mem_map_mutex);
return rc;
}
}
}
seg_vaddr = vaddr + VALUE_2MB;
seg_len = 0;
} else {
seg_len += VALUE_2MB;
}
seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
seg_len += VALUE_2MB;
vaddr += VALUE_2MB;
len -= VALUE_2MB;
}
if (seg_len > 0) {
TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
if (rc != 0) {
pthread_mutex_unlock(&g_spdk_mem_map_mutex);
return rc;
}
TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
if (rc != 0) {
pthread_mutex_unlock(&g_spdk_mem_map_mutex);
return rc;
}
}
@ -402,7 +396,7 @@ spdk_mem_unregister(void *vaddr, size_t len)
seg_len = len;
while (seg_len > 0) {
reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
if ((reg & REG_MAP_REF_MASK) == 0) {
if ((reg & REG_MAP_REGISTERED) == 0) {
pthread_mutex_unlock(&g_spdk_mem_map_mutex);
return -EINVAL;
}
@ -414,20 +408,14 @@ spdk_mem_unregister(void *vaddr, size_t len)
seg_len = 0;
while (len > 0) {
reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
if ((reg & REG_MAP_REF_MASK) == 1) {
/* clear any flags */
reg = 1;
}
spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, reg - 1);
spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);
if ((reg & REG_MAP_REF_MASK) > 1 || (reg & REG_MAP_NOTIFY_START)) {
if (seg_len > 0) {
TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
if (rc != 0) {
pthread_mutex_unlock(&g_spdk_mem_map_mutex);
return rc;
}
if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
if (rc != 0) {
pthread_mutex_unlock(&g_spdk_mem_map_mutex);
return rc;
}
}

View File

@ -373,14 +373,17 @@ test_mem_map_registration(void)
/* Register an overlapping address range */
rc = spdk_mem_register((void *)0, 3 * VALUE_2MB);
CU_ASSERT(rc == 0);
CU_ASSERT(rc == -EBUSY);
/*
* Unregister the middle page of the larger region.
* It was set twice, so unregister it twice.
*/
/* Unregister a 2MB page */
rc = spdk_mem_unregister((void *)VALUE_2MB, VALUE_2MB);
CU_ASSERT(rc == 0);
/* Register non overlapping address range */
rc = spdk_mem_register((void *)0, 3 * VALUE_2MB);
CU_ASSERT(rc == 0);
/* Unregister the middle page of the larger region. */
rc = spdk_mem_unregister((void *)VALUE_2MB, VALUE_2MB);
CU_ASSERT(rc == 0);