memory: do not merge adjacent registrations

Prevents us from unregistering two or more separately
registered regions with a single notification.

This fixes an ibv_mr leak in RDMA. When multiple registrations
were unregistered with a single notification, only the first
ibv_mr would be freed and the remaining memory could still
remain DMA-able.

As of now, unregistering multiple complete regions with
a single unregister call is not possible. It will be
implemented later, after the rest of the code is cleaned up.
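
For illustration only (this example is not part of the commit; buf is
a hypothetical 2MB-aligned buffer and VALUE_2MB stands for the 2MB
page size used internally), the intended behavior now looks like:

	/* Two adjacent regions, registered separately. */
	spdk_mem_register(buf, 2 * VALUE_2MB);
	spdk_mem_register(buf + 2 * VALUE_2MB, VALUE_2MB);

	/* Each region must be unregistered with its own call, and each
	 * call triggers its own unregister notification, so e.g. the
	 * RDMA mem map can free the matching ibv_mr for each region.
	 */
	spdk_mem_unregister(buf, 2 * VALUE_2MB);
	spdk_mem_unregister(buf + 2 * VALUE_2MB, VALUE_2MB);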

Change-Id: I7d61867fa61fd7a4a8a644ff45cab17125d63e1b
Signed-off-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com>
Reviewed-on: https://review.gerrithub.io/425555
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Chandler-Test-Pool: SPDK Automated Test System <sys_sgsw@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>

lib/env_dpdk/memory.c

@@ -57,6 +57,15 @@
#define MAP_256TB_IDX(vfn_2mb) ((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb) ((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
/* Low 16 bits for the reference count */
#define REG_MAP_REF_MASK 0xffff
/* A notification region barrier. The 2MB translation entry that's marked
* with this flag must be unregistered separately. This allows contiguous
* regions to be unregistered in the same chunks they were registered.
*/
#define REG_MAP_NOTIFY_START (1ULL << 63)
/* Translation of a single 2MB page. */
struct map_2mb {
uint64_t translation_2mb;
@@ -87,6 +96,12 @@ struct spdk_mem_map {
TAILQ_ENTRY(spdk_mem_map) tailq;
};
/* Registrations map. The 64 bit translations are bit fields with the
* following layout (starting with the low bits):
* 0 - 15 : reference count
* 16 - 62 : reserved
* 63 : flags
*/
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(, spdk_mem_map) g_spdk_mem_maps = TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
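A hedged sketch of decoding such a packed registration word (the helper
names below are illustrative and not part of this commit; assumes
<stdint.h> and <stdbool.h>):

static inline uint16_t
reg_map_ref_count(uint64_t reg)
{
	/* Bits 0-15 hold the reference count. */
	return (uint16_t)(reg & REG_MAP_REF_MASK);
}

static inline bool
reg_map_is_notify_start(uint64_t reg)
{
	/* Bit 63 marks the first 2MB page of a separately registered region. */
	return (reg & REG_MAP_NOTIFY_START) != 0;
}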
@@ -133,7 +148,9 @@ spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_acti
}
for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
if (map_1gb->map[idx_1gb].translation_2mb != 0) {
if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REF_MASK) > 0 &&
(contig_start == UINT64_MAX ||
(map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
/* Rebuild the virtual address from the indexes */
uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
@@ -152,6 +169,11 @@ spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_acti
if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
goto err_unregister;
}
/* This page might be part of a neighbouring region, so process
 * it again. The loop will increment idx_1gb right back.
 */
idx_1gb--;
}
contig_start = UINT64_MAX;
}
@@ -187,7 +209,8 @@ err_unregister:
}
for (; idx_1gb < UINT64_MAX; idx_1gb--) {
if (map_1gb->map[idx_1gb].translation_2mb != 0) {
if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REF_MASK) > 0 &&
(contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
/* Rebuild the virtual address from the indexes */
uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
@@ -202,6 +225,7 @@ err_unregister:
SPDK_MEM_MAP_NOTIFY_UNREGISTER,
(void *)contig_start,
contig_end - contig_start + VALUE_2MB);
idx_1gb++;
}
contig_end = UINT64_MAX;
}
@@ -311,13 +335,13 @@ spdk_mem_register(void *vaddr, size_t len)
seg_vaddr = vaddr;
seg_len = 0;
while (len > 0) {
uint64_t ref_count;
uint64_t reg;
/* In g_mem_reg_map, the "translation" is the reference count */
ref_count = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, ref_count + 1);
reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
seg_len == 0 ? (reg + 1) | REG_MAP_NOTIFY_START : reg + 1);
if (ref_count > 0) {
if ((reg & REG_MAP_REF_MASK) > 0 || (reg & REG_MAP_NOTIFY_START)) {
if (seg_len > 0) {
TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
@@ -359,7 +383,7 @@ spdk_mem_unregister(void *vaddr, size_t len)
int rc;
void *seg_vaddr;
size_t seg_len;
uint64_t ref_count;
uint64_t reg;
if ((uintptr_t)vaddr & ~MASK_256TB) {
DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
@@ -377,8 +401,8 @@ spdk_mem_unregister(void *vaddr, size_t len)
seg_vaddr = vaddr;
seg_len = len;
while (seg_len > 0) {
ref_count = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
if (ref_count == 0) {
reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
if ((reg & REG_MAP_REF_MASK) == 0) {
pthread_mutex_unlock(&g_spdk_mem_map_mutex);
return -EINVAL;
}
@@ -389,11 +413,14 @@ spdk_mem_unregister(void *vaddr, size_t len)
seg_vaddr = vaddr;
seg_len = 0;
while (len > 0) {
/* In g_mem_reg_map, the "translation" is the reference count */
ref_count = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, ref_count - 1);
reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
if ((reg & REG_MAP_REF_MASK) == 1) {
/* clear any flags */
reg = 1;
}
spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, reg - 1);
if (ref_count > 1) {
if ((reg & REG_MAP_REF_MASK) > 1 || (reg & REG_MAP_NOTIFY_START)) {
if (seg_len > 0) {
TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
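
To make the walk's behavior concrete, an illustrative trace (not from
the commit) for three adjacent regions of 2, 1, and 3 pages; the
REG_MAP_NOTIFY_START flag on each region's first page terminates the
previous contiguous run, so the walk emits one notification per
original registration:

/* Registered regions (2MB pages):  [A A][B][C C C]
 * Unregister notifications now:    (A, 4MB), (B, 2MB), (C, 6MB)
 * Before this change:              one merged (A, 12MB) notification
 */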

test/env/memory/memory_ut.c

@@ -122,6 +122,30 @@ test_mem_map_notify_fail(void *cb_ctx, struct spdk_mem_map *map,
return 0;
}
static int
test_mem_map_notify_checklen(void *cb_ctx, struct spdk_mem_map *map,
enum spdk_mem_map_notify_action action, void *vaddr, size_t size)
{
size_t *len_arr = cb_ctx;
/*
* This is a test requirement - the len array we use to verify
* pages are valid is only so large.
*/
SPDK_CU_ASSERT_FATAL((uintptr_t)vaddr < (VALUE_2MB * PAGE_ARRAY_SIZE));
switch (action) {
case SPDK_MEM_MAP_NOTIFY_REGISTER:
assert(size == len_arr[(uintptr_t)vaddr / VALUE_2MB]);
break;
case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
CU_ASSERT(size == len_arr[(uintptr_t)vaddr / VALUE_2MB]);
break;
}
return 0;
}
static int
test_check_regions_contiguous(uint64_t addr1, uint64_t addr2)
{
@@ -143,6 +167,11 @@ struct spdk_mem_map_ops test_map_ops_notify_fail = {
.are_contiguous = NULL
};
struct spdk_mem_map_ops test_map_ops_notify_checklen = {
.notify_cb = test_mem_map_notify_checklen,
.are_contiguous = NULL
};
static void
test_mem_map_alloc_free(void)
{
@@ -367,6 +396,38 @@ test_mem_map_registration(void)
CU_ASSERT(map == NULL);
}
static void
test_mem_map_registration_adjacent(void)
{
struct spdk_mem_map *map;
uint64_t default_translation = 0xDEADBEEF0BADF00D;
uintptr_t vaddr;
unsigned i;
size_t notify_len[PAGE_ARRAY_SIZE] = {0};
size_t chunk_len[] = { 2, 1, 3, 2, 1, 1 };
vaddr = 0;
for (i = 0; i < SPDK_COUNTOF(chunk_len); i++) {
notify_len[vaddr / VALUE_2MB] = chunk_len[i] * VALUE_2MB;
spdk_mem_register((void *)vaddr, notify_len[vaddr / VALUE_2MB]);
vaddr += notify_len[vaddr / VALUE_2MB];
}
/* Verify the memory is translated in the same chunks it was registered */
map = spdk_mem_map_alloc(default_translation,
&test_map_ops_notify_checklen, notify_len);
SPDK_CU_ASSERT_FATAL(map != NULL);
spdk_mem_map_free(&map);
CU_ASSERT(map == NULL);
vaddr = 0;
for (i = 0; i < SPDK_COUNTOF(chunk_len); i++) {
notify_len[vaddr / VALUE_2MB] = chunk_len[i] * VALUE_2MB;
spdk_mem_unregister((void *)vaddr, notify_len[vaddr / VALUE_2MB]);
vaddr += notify_len[vaddr / VALUE_2MB];
}
}
int
main(int argc, char **argv)
{
@@ -398,7 +459,8 @@ main(int argc, char **argv)
if (
CU_add_test(suite, "alloc and free memory map", test_mem_map_alloc_free) == NULL ||
CU_add_test(suite, "mem map translation", test_mem_map_translation) == NULL ||
CU_add_test(suite, "mem map registration", test_mem_map_registration) == NULL
CU_add_test(suite, "mem map registration", test_mem_map_registration) == NULL ||
CU_add_test(suite, "mem map adjacent registrations", test_mem_map_registration_adjacent) == NULL
) {
CU_cleanup_registry();
return CU_get_error();