eal: introduce memory management wrappers

Introduce OS-independent wrappers for memory management operations used
across DPDK and specifically in common code of EAL:

* rte_mem_map()
* rte_mem_unmap()
* rte_mem_page_size()
* rte_mem_lock()

Windows uses different APIs for memory mapping and reservation, while
Unices reserve memory by mapping it. Introduce EAL private functions to
support memory reservation in common code:

* eal_mem_reserve()
* eal_mem_free()
* eal_mem_set_dump()

Wrappers follow POSIX semantics limited to DPDK tasks, but their
signatures deliberately differ from the POSIX ones to be safer and more
expressive. New symbols are internal. Being thin wrappers, they require
no special maintenance.

Signed-off-by: Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>
This commit is contained in:
Dmitry Kozlyuk 2020-06-15 03:43:45 +03:00 committed by Thomas Monjalon
parent 176bb37ca6
commit c4b89ecb64
10 changed files with 381 additions and 65 deletions

View File

@ -5,15 +5,16 @@
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
#include <sys/mman.h>
#include <stdint.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <rte_common.h>
#include <rte_log.h>
#include <rte_eal_paging.h>
#include <rte_errno.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_spinlock.h>
#include <rte_tailq.h>
@ -90,12 +91,9 @@ resize_and_map(int fd, void *addr, size_t len)
return -1;
}
map_addr = mmap(addr, len, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, fd, 0);
map_addr = rte_mem_map(addr, len, RTE_PROT_READ | RTE_PROT_WRITE,
RTE_MAP_SHARED | RTE_MAP_FORCE_ADDRESS, fd, 0);
if (map_addr != addr) {
RTE_LOG(ERR, EAL, "mmap() failed: %s\n", strerror(errno));
/* pass errno up the chain */
rte_errno = errno;
return -1;
}
return 0;
@ -733,7 +731,7 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
return -1;
}
page_sz = sysconf(_SC_PAGESIZE);
page_sz = rte_mem_page_size();
if (page_sz == (size_t)-1) {
free(ma);
return -1;
@ -754,11 +752,13 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
if (internal_config.no_shconf) {
/* remap virtual area as writable */
void *new_data = mmap(data, mmap_len, PROT_READ | PROT_WRITE,
MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, fd, 0);
if (new_data == MAP_FAILED) {
static const int flags = RTE_MAP_FORCE_ADDRESS |
RTE_MAP_PRIVATE | RTE_MAP_ANONYMOUS;
void *new_data = rte_mem_map(data, mmap_len,
RTE_PROT_READ | RTE_PROT_WRITE, flags, fd, 0);
if (new_data == NULL) {
RTE_LOG(DEBUG, EAL, "%s(): couldn't remap anonymous memory: %s\n",
__func__, strerror(errno));
__func__, rte_strerror(rte_errno));
goto fail;
}
} else {
@ -820,7 +820,7 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
return 0;
fail:
if (data)
munmap(data, mmap_len);
rte_mem_unmap(data, mmap_len);
if (fd >= 0)
close(fd);
free(ma);
@ -858,7 +858,7 @@ rte_fbarray_attach(struct rte_fbarray *arr)
return -1;
}
page_sz = sysconf(_SC_PAGESIZE);
page_sz = rte_mem_page_size();
if (page_sz == (size_t)-1) {
free(ma);
return -1;
@ -909,7 +909,7 @@ rte_fbarray_attach(struct rte_fbarray *arr)
return 0;
fail:
if (data)
munmap(data, mmap_len);
rte_mem_unmap(data, mmap_len);
if (fd >= 0)
close(fd);
free(ma);
@ -937,8 +937,7 @@ rte_fbarray_detach(struct rte_fbarray *arr)
* really do anything about it, things will blow up either way.
*/
size_t page_sz = sysconf(_SC_PAGESIZE);
size_t page_sz = rte_mem_page_size();
if (page_sz == (size_t)-1)
return -1;
@ -957,7 +956,7 @@ rte_fbarray_detach(struct rte_fbarray *arr)
goto out;
}
munmap(arr->data, mmap_len);
rte_mem_unmap(arr->data, mmap_len);
/* area is unmapped, close fd and remove the tailq entry */
if (tmp->fd >= 0)
@ -992,8 +991,7 @@ rte_fbarray_destroy(struct rte_fbarray *arr)
* really do anything about it, things will blow up either way.
*/
size_t page_sz = sysconf(_SC_PAGESIZE);
size_t page_sz = rte_mem_page_size();
if (page_sz == (size_t)-1)
return -1;
@ -1042,7 +1040,7 @@ rte_fbarray_destroy(struct rte_fbarray *arr)
}
close(fd);
}
munmap(arr->data, mmap_len);
rte_mem_unmap(arr->data, mmap_len);
/* area is unmapped, remove the tailq entry */
TAILQ_REMOVE(&mem_area_tailq, tmp, next);

View File

@ -11,13 +11,13 @@
#include <string.h>
#include <unistd.h>
#include <inttypes.h>
#include <sys/mman.h>
#include <sys/queue.h>
#include <rte_fbarray.h>
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_eal_paging.h>
#include <rte_errno.h>
#include <rte_log.h>
@ -40,18 +40,10 @@
static void *next_baseaddr;
static uint64_t system_page_sz;
#ifdef RTE_EXEC_ENV_LINUX
#define RTE_DONTDUMP MADV_DONTDUMP
#elif defined RTE_EXEC_ENV_FREEBSD
#define RTE_DONTDUMP MADV_NOCORE
#else
#error "madvise doesn't support this OS"
#endif
#define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
void *
eal_get_virtual_area(void *requested_addr, size_t *size,
size_t page_sz, int flags, int mmap_flags)
size_t page_sz, int flags, int reserve_flags)
{
bool addr_is_hint, allow_shrink, unmap, no_align;
uint64_t map_sz;
@ -59,9 +51,7 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
uint8_t try = 0;
if (system_page_sz == 0)
system_page_sz = sysconf(_SC_PAGESIZE);
mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;
system_page_sz = rte_mem_page_size();
RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
@ -105,24 +95,24 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
return NULL;
}
mapped_addr = mmap(requested_addr, (size_t)map_sz, PROT_NONE,
mmap_flags, -1, 0);
if (mapped_addr == MAP_FAILED && allow_shrink)
mapped_addr = eal_mem_reserve(
requested_addr, (size_t)map_sz, reserve_flags);
if ((mapped_addr == NULL) && allow_shrink)
*size -= page_sz;
if (mapped_addr != MAP_FAILED && addr_is_hint &&
mapped_addr != requested_addr) {
if ((mapped_addr != NULL) && addr_is_hint &&
(mapped_addr != requested_addr)) {
try++;
next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz);
if (try <= MAX_MMAP_WITH_DEFINED_ADDR_TRIES) {
/* hint was not used. Try with another offset */
munmap(mapped_addr, map_sz);
mapped_addr = MAP_FAILED;
eal_mem_free(mapped_addr, map_sz);
mapped_addr = NULL;
requested_addr = next_baseaddr;
}
}
} while ((allow_shrink || addr_is_hint) &&
mapped_addr == MAP_FAILED && *size > 0);
(mapped_addr == NULL) && (*size > 0));
/* align resulting address - if map failed, we will ignore the value
* anyway, so no need to add additional checks.
@ -132,20 +122,17 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
if (*size == 0) {
RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
strerror(errno));
rte_errno = errno;
rte_strerror(rte_errno));
return NULL;
} else if (mapped_addr == MAP_FAILED) {
} else if (mapped_addr == NULL) {
RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
strerror(errno));
/* pass errno up the call chain */
rte_errno = errno;
rte_strerror(rte_errno));
return NULL;
} else if (requested_addr != NULL && !addr_is_hint &&
aligned_addr != requested_addr) {
RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
requested_addr, aligned_addr);
munmap(mapped_addr, map_sz);
eal_mem_free(mapped_addr, map_sz);
rte_errno = EADDRNOTAVAIL;
return NULL;
} else if (requested_addr != NULL && addr_is_hint &&
@ -161,7 +148,7 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
aligned_addr, *size);
if (unmap) {
munmap(mapped_addr, map_sz);
eal_mem_free(mapped_addr, map_sz);
} else if (!no_align) {
void *map_end, *aligned_end;
size_t before_len, after_len;
@ -179,19 +166,17 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
/* unmap space before aligned mmap address */
before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr);
if (before_len > 0)
munmap(mapped_addr, before_len);
eal_mem_free(mapped_addr, before_len);
/* unmap space after aligned end mmap address */
after_len = RTE_PTR_DIFF(map_end, aligned_end);
if (after_len > 0)
munmap(aligned_end, after_len);
eal_mem_free(aligned_end, after_len);
}
if (!unmap) {
/* Exclude these pages from a core dump. */
if (madvise(aligned_addr, *size, RTE_DONTDUMP) != 0)
RTE_LOG(DEBUG, EAL, "madvise failed: %s\n",
strerror(errno));
eal_mem_set_dump(aligned_addr, *size, false);
}
return aligned_addr;
@ -547,10 +532,10 @@ rte_eal_memdevice_init(void)
int
rte_mem_lock_page(const void *virt)
{
unsigned long virtual = (unsigned long)virt;
int page_size = getpagesize();
unsigned long aligned = (virtual & ~(page_size - 1));
return mlock((void *)aligned, page_size);
uintptr_t virtual = (uintptr_t)virt;
size_t page_size = rte_mem_page_size();
uintptr_t aligned = RTE_PTR_ALIGN_FLOOR(virtual, page_size);
return rte_mem_lock((void *)aligned, page_size);
}
int

View File

@ -11,6 +11,7 @@
#include <rte_dev.h>
#include <rte_lcore.h>
#include <rte_memory.h>
/**
* Structure storing internal configuration (per-lcore)
@ -202,6 +203,24 @@ int rte_eal_alarm_init(void);
*/
int rte_eal_check_module(const char *module_name);
/**
 * Options accepted by eal_mem_reserve(), combined with bitwise OR.
 */
enum eal_mem_reserve_flags {
	/** Reserve hugepages. Some platforms may not support this. */
	EAL_RESERVE_HUGEPAGES = 0x1,

	/**
	 * Insist on reserving memory exactly at the requested address.
	 * Depending on the implementation this can be destructive.
	 *
	 * @see RTE_MAP_FORCE_ADDRESS for description of possible consequences
	 * (although implementations are not required to use it).
	 */
	EAL_RESERVE_FORCE_ADDRESS = 0x2
};
/**
* Get virtual area of specified size from the OS.
*
@ -215,8 +234,8 @@ int rte_eal_check_module(const char *module_name);
* Page size on which to align requested virtual area.
* @param flags
* EAL_VIRTUAL_AREA_* flags.
* @param mmap_flags
* Extra flags passed directly to mmap().
* @param reserve_flags
* Extra flags passed directly to eal_mem_reserve().
*
* @return
* Virtual area address if successful.
@ -233,7 +252,7 @@ int rte_eal_check_module(const char *module_name);
/**< immediately unmap reserved virtual area. */
void *
eal_get_virtual_area(void *requested_addr, size_t *size,
size_t page_sz, int flags, int mmap_flags);
size_t page_sz, int flags, int reserve_flags);
/**
* Get cpu core_id.
@ -493,4 +512,57 @@ eal_file_lock(int fd, enum eal_flock_op op, enum eal_flock_mode mode);
int
eal_file_truncate(int fd, ssize_t size);
/**
 * Reserve a region of virtual memory.
 *
 * Use eal_mem_free() to free reserved memory.
 *
 * The Unix implementation reserves the region as a private anonymous
 * mapping without any access permissions.
 *
 * @param requested_addr
 *  A desired reservation address which must be page-aligned.
 *  The system might not respect it.
 *  NULL means the address will be chosen by the system.
 * @param size
 *  Reservation size. Must be a multiple of system page size.
 * @param flags
 *  Reservation options, a combination of eal_mem_reserve_flags.
 * @return
 *  Starting address of the reserved area on success, NULL on failure.
 *  Callers must not access this memory until remapping it.
 *  On failure the Unix implementation sets rte_errno: to ENOTSUP when
 *  EAL_RESERVE_HUGEPAGES is requested but MAP_HUGETLB is not available
 *  at build time, otherwise to the error reported by mmap().
 */
void *
eal_mem_reserve(void *requested_addr, size_t size, int flags);
/**
 * Free memory obtained by eal_mem_reserve() or eal_mem_alloc().
 *
 * If *virt* and *size* describe a part of the reserved region,
 * only this part of the region is freed (accurately up to the system
 * page size). If *virt* points to allocated memory, *size* must match
 * the one specified on allocation. The behavior is undefined
 * if the memory pointed by *virt* is obtained from another source
 * than listed above.
 *
 * This function returns nothing, so callers cannot detect a failure.
 * The Unix implementation unmaps the range and, if that fails, only
 * emits a DEBUG log message and sets rte_errno.
 *
 * NOTE(review): eal_mem_alloc() is not declared in the part of this
 * header visible here - confirm that it exists on all platforms.
 *
 * @param virt
 *  A virtual address in a region previously reserved.
 * @param size
 *  Number of bytes to unreserve.
 */
void
eal_mem_free(void *virt, size_t size);
/**
 * Configure memory region inclusion into dumps.
 *
 * The Unix implementation applies the setting with madvise().
 *
 * @param virt
 *  Starting address of the region.
 * @param size
 *  Size of the region.
 * @param dump
 *  True to include memory into dumps, false to exclude.
 * @return
 *  0 on success, (-1) on failure and rte_errno is set.
 */
int
eal_mem_set_dump(void *virt, size_t size, bool dump);
#endif /* _EAL_PRIVATE_H_ */

View File

@ -77,6 +77,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_reciprocal.c
# from unix dir
SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_file.c
SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_unix_memory.c
# from arch dir
SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_cpuflags.c

View File

@ -0,0 +1,98 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2020 Dmitry Kozlyuk
*/
#include <stdint.h>
#include <rte_compat.h>
/**
* @file
* @internal
*
* Wrappers for OS facilities related to memory paging, used across DPDK.
*/
/** Access permissions of a mapping; values can be combined with OR. */
enum rte_mem_prot {
	/** Read access. */
	RTE_PROT_READ = 0x1,
	/** Write access. */
	RTE_PROT_WRITE = 0x2,
	/** Code execution. */
	RTE_PROT_EXECUTE = 0x4
};
/** Mapping options for rte_mem_map(); values can be combined with OR. */
enum rte_map_flags {
	/** Other processes can see changes made to the mapped memory. */
	RTE_MAP_SHARED = 0x1,

	/** No regular file backs the mapping. */
	RTE_MAP_ANONYMOUS = 0x2,

	/** Copy-on-write mapping, other processes do not see changes. */
	RTE_MAP_PRIVATE = 0x4,

	/**
	 * Map exactly at the requested address. Use with caution:
	 * to satisfy the request an implementation may remove all other
	 * mappings in the requested region. It is not obliged to do so,
	 * however, hence mapping with this flag may fail.
	 */
	RTE_MAP_FORCE_ADDRESS = 0x8
};
/**
 * Map a portion of an opened file or the page file into memory.
 *
 * This function is similar to POSIX mmap(3) with common MAP_ANONYMOUS
 * extension, except for the return value: failure is reported as NULL
 * rather than MAP_FAILED.
 *
 * @param requested_addr
 *  Desired virtual address for mapping. Can be NULL to let OS choose.
 * @param size
 *  Size of the mapping in bytes.
 * @param prot
 *  Protection flags, a combination of rte_mem_prot values.
 * @param flags
 *  Additional mapping flags, a combination of rte_map_flags.
 * @param fd
 *  Mapped file descriptor. Can be negative for anonymous mapping.
 * @param offset
 *  Offset of the mapped region in fd. Must be 0 for anonymous mappings.
 * @return
 *  Mapped address or NULL on failure and rte_errno is set to OS error.
 */
__rte_internal
void *
rte_mem_map(void *requested_addr, size_t size, int prot, int flags,
int fd, size_t offset);
/**
 * OS-independent implementation of POSIX munmap(3).
 *
 * @param virt
 *  Starting address of the region to unmap.
 * @param size
 *  Size of the region in bytes.
 * @return
 *  0 on success, negative on failure. The Unix implementation returns
 *  the result of munmap() and sets rte_errno on failure.
 */
__rte_internal
int
rte_mem_unmap(void *virt, size_t size);
/**
 * Get system page size. This function never fails.
 *
 * NOTE(review): callers in this change still compare the result with
 * (size_t)-1, and the Unix implementation stores the sysconf() result
 * without checking it - confirm whether "never fails" really holds.
 *
 * @return
 *   Page size in bytes.
 */
__rte_internal
size_t
rte_mem_page_size(void);
/**
 * Lock in physical memory all pages crossed by the address region.
 *
 * @param virt
 *   Base virtual address of the region.
 * @param size
 *   Size of the region.
 * @return
 *   0 on success, negative on error.
 *   The Unix implementation also sets rte_errno on error.
 *
 * @see rte_mem_page_size() to retrieve the page size.
 * @see rte_mem_lock_page() to lock an entire single page.
 */
__rte_internal
int
rte_mem_lock(const void *virt, size_t size);

View File

@ -84,6 +84,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_reciprocal.c
# from unix dir
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_file.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_unix_memory.c
# from arch dir
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_cpuflags.c

View File

@ -630,7 +630,7 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
mapped:
munmap(addr, alloc_sz);
unmapped:
flags = MAP_FIXED;
flags = EAL_RESERVE_FORCE_ADDRESS;
new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
if (new_addr != addr) {
if (new_addr != NULL)
@ -687,8 +687,7 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
return -1;
}
if (madvise(ms->addr, ms->len, MADV_DONTDUMP) != 0)
RTE_LOG(DEBUG, EAL, "madvise failed: %s\n", strerror(errno));
eal_mem_set_dump(ms->addr, ms->len, false);
exit_early = false;

View File

@ -387,3 +387,12 @@ EXPERIMENTAL {
rte_trace_regexp;
rte_trace_save;
};
INTERNAL {
global:
rte_mem_lock;
rte_mem_map;
rte_mem_page_size;
rte_mem_unmap;
};

View File

@ -0,0 +1,152 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2020 Dmitry Kozlyuk
*/
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <rte_eal_paging.h>
#include <rte_errno.h>
#include <rte_log.h>
#include "eal_private.h"
/*
 * Platform-specific madvise() advice values used by eal_mem_set_dump():
 * EAL_DONTDUMP is passed to exclude a region from dumps and EAL_DODUMP
 * to include it. Other execution environments are rejected at build time.
 */
#ifdef RTE_EXEC_ENV_LINUX
#define EAL_DONTDUMP MADV_DONTDUMP
#define EAL_DODUMP MADV_DODUMP
#elif defined RTE_EXEC_ENV_FREEBSD
#define EAL_DONTDUMP MADV_NOCORE
#define EAL_DODUMP MADV_CORE
#else
#error "madvise doesn't support this OS"
#endif
/**
 * Call mmap() with system (not RTE) flags and convert its failure
 * indication to the NULL + rte_errno convention used by the wrappers.
 *
 * @param requested_addr
 *   Address passed to mmap() unchanged.
 * @param size
 *   Length of the mapping in bytes.
 * @param prot
 *   System protection flags (PROT_*).
 * @param flags
 *   System mapping flags (MAP_*).
 * @param fd
 *   File descriptor passed to mmap() unchanged.
 * @param offset
 *   Offset in *fd* passed to mmap() unchanged.
 * @return
 *   Mapped address on success. NULL on failure, with rte_errno set to
 *   the errno value left by mmap().
 */
static void *
mem_map(void *requested_addr, size_t size, int prot, int flags,
	int fd, size_t offset)
{
	void *virt = mmap(requested_addr, size, prot, flags, fd, offset);

	if (virt == MAP_FAILED) {
		/* Logging may overwrite errno, so capture it first. */
		int err = errno;

		RTE_LOG(DEBUG, EAL,
			"Cannot mmap(%p, 0x%zx, 0x%x, 0x%x, %d, 0x%zx): %s\n",
			requested_addr, size, prot, flags, fd, offset,
			strerror(err));
		rte_errno = err;
		return NULL;
	}
	return virt;
}
/**
 * Call munmap() and report its failure through rte_errno.
 *
 * @param virt
 *   Starting address of the region to unmap.
 * @param size
 *   Size of the region in bytes.
 * @return
 *   The munmap() return value. When it is negative, rte_errno is set
 *   to the errno value left by munmap().
 */
static int
mem_unmap(void *virt, size_t size)
{
	int ret = munmap(virt, size);

	if (ret < 0) {
		/* Logging may overwrite errno, so capture it first. */
		int err = errno;

		RTE_LOG(DEBUG, EAL, "Cannot munmap(%p, 0x%zx): %s\n",
			virt, size, strerror(err));
		rte_errno = err;
	}
	return ret;
}
/*
 * Reserve address space as a private anonymous mapping with no access
 * permissions. Hugepage reservation is possible only where MAP_HUGETLB
 * exists; elsewhere such a request fails with ENOTSUP.
 */
void *
eal_mem_reserve(void *requested_addr, size_t size, int flags)
{
	int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS;

#ifdef MAP_HUGETLB
	if ((flags & EAL_RESERVE_HUGEPAGES) != 0)
		mmap_flags |= MAP_HUGETLB;
#else
	if ((flags & EAL_RESERVE_HUGEPAGES) != 0) {
		rte_errno = ENOTSUP;
		return NULL;
	}
#endif

	if ((flags & EAL_RESERVE_FORCE_ADDRESS) != 0)
		mmap_flags |= MAP_FIXED;

	return mem_map(requested_addr, size, PROT_NONE, mmap_flags, -1, 0);
}
/*
 * Release a range of reserved address space by unmapping it.
 * There is no way to report a failure to the caller, so the result
 * of the unmap helper is deliberately dropped.
 */
void
eal_mem_free(void *virt, size_t size)
{
	(void)mem_unmap(virt, size);
}
/**
 * Include a memory region into dumps or exclude it using madvise().
 *
 * @param virt
 *   Starting address of the region.
 * @param size
 *   Size of the region.
 * @param dump
 *   True to pass EAL_DODUMP to madvise(), false to pass EAL_DONTDUMP.
 * @return
 *   The madvise() return value. When it is non-zero, rte_errno is set
 *   to the errno value left by madvise().
 */
int
eal_mem_set_dump(void *virt, size_t size, bool dump)
{
	int flags = dump ? EAL_DODUMP : EAL_DONTDUMP;
	int ret = madvise(virt, size, flags);

	if (ret) {
		/*
		 * Report the error of this madvise() call: rte_errno is not
		 * assigned yet and may hold a stale value, and logging may
		 * overwrite errno, so capture it before anything else.
		 */
		int err = errno;

		RTE_LOG(DEBUG, EAL, "madvise(%p, %#zx, %d) failed: %s\n",
			virt, size, flags, strerror(err));
		rte_errno = err;
	}
	return ret;
}
/* Translate a combination of rte_mem_prot values to system PROT_* flags. */
static int
mem_rte_to_sys_prot(int prot)
{
	int translated = PROT_NONE;

	translated |= (prot & RTE_PROT_READ) ? PROT_READ : 0;
	translated |= (prot & RTE_PROT_WRITE) ? PROT_WRITE : 0;
	translated |= (prot & RTE_PROT_EXECUTE) ? PROT_EXEC : 0;

	return translated;
}
/*
 * Translate RTE protection and mapping flags to their system
 * counterparts, then hand the request over to the mmap() helper.
 */
void *
rte_mem_map(void *requested_addr, size_t size, int prot, int flags,
	int fd, size_t offset)
{
	const int mmap_prot = mem_rte_to_sys_prot(prot);
	int mmap_flags = 0;

	mmap_flags |= (flags & RTE_MAP_SHARED) ? MAP_SHARED : 0;
	mmap_flags |= (flags & RTE_MAP_ANONYMOUS) ? MAP_ANONYMOUS : 0;
	mmap_flags |= (flags & RTE_MAP_PRIVATE) ? MAP_PRIVATE : 0;
	mmap_flags |= (flags & RTE_MAP_FORCE_ADDRESS) ? MAP_FIXED : 0;

	return mem_map(requested_addr, size, mmap_prot, mmap_flags,
		fd, offset);
}
/* Public entry point for unmapping; forwards to the munmap() helper. */
int
rte_mem_unmap(void *virt, size_t size)
{
	const int status = mem_unmap(virt, size);

	return status;
}
/*
 * Return the system page size as reported by sysconf(_SC_PAGESIZE).
 * The value is queried on the first call and kept in a static variable
 * for subsequent calls.
 */
size_t
rte_mem_page_size(void)
{
	static size_t cached_size;

	if (cached_size != 0)
		return cached_size;

	cached_size = sysconf(_SC_PAGESIZE);
	return cached_size;
}
/*
 * Lock the region with mlock(). On failure the errno value
 * is copied to rte_errno and the mlock() result is returned as is.
 */
int
rte_mem_lock(const void *virt, size_t size)
{
	const int rc = mlock(virt, size);

	if (rc == 0)
		return rc;

	rte_errno = errno;
	return rc;
}

View File

@ -3,4 +3,5 @@
sources += files(
'eal_file.c',
'eal_unix_memory.c',
)