mem: replace memseg with memseg lists

Before, we were aggregating multiple pages into one memseg, so the
number of memsegs was small. Now, each page gets its own memseg,
so the list of memsegs is huge. To accommodate the new memseg list
size and to keep the under-the-hood workings sane, the memseg list
is now not just a single list, but multiple lists. To be precise,
each hugepage size available on the system gets one or more memseg
lists, per socket.

In order to support dynamic memory allocation, we reserve all
memory in advance (unless we're in 32-bit legacy mode, in which
case we do not preallocate memory). That is, we do an anonymous
mmap() of the entire maximum size of memory per hugepage size, per
socket (which is limited to either RTE_MAX_MEMSEG_PER_TYPE pages or
RTE_MAX_MEM_MB_PER_TYPE megabytes worth of memory, whichever is the
smaller one), split over multiple lists (which are limited to
either RTE_MAX_MEMSEG_PER_LIST memsegs or RTE_MAX_MEM_MB_PER_LIST
megabytes per list, whichever is the smaller one). There is also
a global limit of CONFIG_RTE_MAX_MEM_MB megabytes, which is mainly
used for 32-bit targets to limit amounts of preallocated memory,
but can be used to place an upper limit on total amount of VA
memory that can be allocated by DPDK application.

So, for each hugepage size, we get (by default) up to 128G worth
of memory, per socket, split into chunks of up to 32G in size.
The address space is claimed at the start, in eal_common_memory.c.
The actual page allocation code is in eal_memalloc.c (Linux-only),
and largely consists of copied EAL memory init code.

Pages in the list are also indexed by address. That is, in order
to figure out where the page belongs, one can simply look at base
address for a memseg list. Similarly, figuring out IOVA address
of a memzone is a matter of finding the right memseg list, getting
offset and dividing by page size to get the appropriate memseg.

This commit also removes the rte_eal_get_physmem_layout() call,
according to deprecation notice [1], and removes that deprecation
notice as well.

On 32-bit targets due to limited VA space, DPDK will no longer
spread memory to different sockets like before. Instead, it will
(by default) allocate all of the memory on socket where master
lcore is. To override this behavior, --socket-mem must be used.

The rest of the changes are really ripple effects from the memseg
change - heap changes, compile fixes, and rewrites to support
fbarray-backed memseg lists. Due to the earlier switch to _walk()
functions, most of the changes are simple fixes; however, some
of the _walk() calls were switched to memseg list walk where
it made sense to do so.

Additionally, we are also switching locks from flock() to fcntl().
Down the line, we will be introducing single-file segments option,
and we cannot use flock() locks to lock parts of the file. Therefore,
we will use fcntl() locks for legacy mem as well, in case someone is
unfortunate enough to accidentally start legacy mem primary process
alongside an already working non-legacy mem-based primary process.

[1] http://dpdk.org/dev/patchwork/patch/34002/

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Tested-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Tested-by: Hemant Agrawal <hemant.agrawal@nxp.com>
Tested-by: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
This commit is contained in:
Anatoly Burakov 2018-04-11 13:30:24 +01:00 committed by Thomas Monjalon
parent c44d09811b
commit 66cc45e293
37 changed files with 1593 additions and 593 deletions

View File

@ -61,7 +61,20 @@ CONFIG_RTE_CACHE_LINE_SIZE=64
CONFIG_RTE_LIBRTE_EAL=y CONFIG_RTE_LIBRTE_EAL=y
CONFIG_RTE_MAX_LCORE=128 CONFIG_RTE_MAX_LCORE=128
CONFIG_RTE_MAX_NUMA_NODES=8 CONFIG_RTE_MAX_NUMA_NODES=8
CONFIG_RTE_MAX_MEMSEG=256 CONFIG_RTE_MAX_MEMSEG_LISTS=64
# each memseg list will be limited to either RTE_MAX_MEMSEG_PER_LIST pages
# or RTE_MAX_MEM_MB_PER_LIST megabytes worth of memory, whichever is smaller
CONFIG_RTE_MAX_MEMSEG_PER_LIST=8192
CONFIG_RTE_MAX_MEM_MB_PER_LIST=32768
# a "type" is a combination of page size and NUMA node. total number of memseg
# lists per type will be limited to either RTE_MAX_MEMSEG_PER_TYPE pages (split
# over multiple lists of RTE_MAX_MEMSEG_PER_LIST pages), or
# RTE_MAX_MEM_MB_PER_TYPE megabytes of memory (split over multiple lists of
# RTE_MAX_MEM_MB_PER_LIST), whichever is smaller
CONFIG_RTE_MAX_MEMSEG_PER_TYPE=32768
CONFIG_RTE_MAX_MEM_MB_PER_TYPE=131072
# global maximum usable amount of VA, in megabytes
CONFIG_RTE_MAX_MEM_MB=524288
CONFIG_RTE_MAX_MEMZONE=2560 CONFIG_RTE_MAX_MEMZONE=2560
CONFIG_RTE_MAX_TAILQ=32 CONFIG_RTE_MAX_TAILQ=32
CONFIG_RTE_ENABLE_ASSERT=n CONFIG_RTE_ENABLE_ASSERT=n

View File

@ -46,3 +46,6 @@ CONFIG_RTE_LIBRTE_PMD_ZUC=n
# AVP PMD is not supported on 32-bit # AVP PMD is not supported on 32-bit
# #
CONFIG_RTE_LIBRTE_AVP_PMD=n CONFIG_RTE_LIBRTE_AVP_PMD=n
# 32-bit doesn't break up memory in lists, but does have VA allocation limit
CONFIG_RTE_MAX_MEM_MB=2048

View File

@ -51,3 +51,6 @@ CONFIG_RTE_LIBRTE_PMD_ZUC=n
# AVP PMD is not supported on 32-bit # AVP PMD is not supported on 32-bit
# #
CONFIG_RTE_LIBRTE_AVP_PMD=n CONFIG_RTE_LIBRTE_AVP_PMD=n
# 32-bit doesn't break up memory in lists, but does have VA allocation limit
CONFIG_RTE_MAX_MEM_MB=2048

View File

@ -26,3 +26,6 @@ CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
# AVP PMD is not supported on 32-bit # AVP PMD is not supported on 32-bit
# #
CONFIG_RTE_LIBRTE_AVP_PMD=n CONFIG_RTE_LIBRTE_AVP_PMD=n
# 32-bit doesn't break up memory in lists, but does have VA allocation limit
CONFIG_RTE_MAX_MEM_MB=2048

View File

@ -21,7 +21,12 @@
/****** library defines ********/ /****** library defines ********/
/* EAL defines */ /* EAL defines */
#define RTE_MAX_MEMSEG 512 #define RTE_MAX_MEMSEG_LISTS 128
#define RTE_MAX_MEMSEG_PER_LIST 8192
#define RTE_MAX_MEM_MB_PER_LIST 32768
#define RTE_MAX_MEMSEG_PER_TYPE 32768
#define RTE_MAX_MEM_MB_PER_TYPE 65536
#define RTE_MAX_MEM_MB 524288
#define RTE_MAX_MEMZONE 2560 #define RTE_MAX_MEMZONE 2560
#define RTE_MAX_TAILQ 32 #define RTE_MAX_TAILQ 32
#define RTE_LOG_DP_LEVEL RTE_LOG_INFO #define RTE_LOG_DP_LEVEL RTE_LOG_INFO

View File

@ -38,15 +38,6 @@ Deprecation Notices
success and failure, respectively. This will change to 1 and 0 for true and success and failure, respectively. This will change to 1 and 0 for true and
false, respectively, to make use of the function more intuitive. false, respectively, to make use of the function more intuitive.
* eal: due to internal data layout reorganization, there will be changes to
several structures and functions as a result of coming changes to support
memory hotplug in v18.05.
``rte_eal_get_physmem_layout`` will be deprecated and removed in subsequent
releases.
``rte_mem_config`` contents will change due to switch to memseg lists.
``rte_memzone`` member ``memseg_id`` will no longer serve any useful purpose
and will be removed.
* eal: a new set of mbuf mempool ops name APIs for user, platform and best * eal: a new set of mbuf mempool ops name APIs for user, platform and best
mempool names have been defined in ``rte_mbuf`` in v18.02. The uses of mempool names have been defined in ``rte_mbuf`` in v18.02. The uses of
``rte_eal_mbuf_default_mempool_ops`` shall be replaced by ``rte_eal_mbuf_default_mempool_ops`` shall be replaced by

View File

@ -190,7 +190,8 @@ static int vfio_map_irq_region(struct fslmc_vfio_group *group)
} }
static int static int
fslmc_vfio_map(const struct rte_memseg *ms, void *arg) fslmc_vfio_map(const struct rte_memseg_list *msl __rte_unused,
const struct rte_memseg *ms, void *arg)
{ {
int *n_segs = arg; int *n_segs = arg;
struct fslmc_vfio_group *group; struct fslmc_vfio_group *group;
@ -232,18 +233,11 @@ fslmc_vfio_map(const struct rte_memseg *ms, void *arg)
int rte_fslmc_vfio_dmamap(void) int rte_fslmc_vfio_dmamap(void)
{ {
const struct rte_memseg *memseg;
int i = 0; int i = 0;
if (is_dma_done) if (is_dma_done)
return 0; return 0;
memseg = rte_eal_get_physmem_layout();
if (memseg == NULL) {
DPAA2_BUS_ERR("Cannot get physical layout");
return -ENODEV;
}
if (rte_memseg_walk(fslmc_vfio_map, &i) < 0) if (rte_memseg_walk(fslmc_vfio_map, &i) < 0)
return -1; return -1;

View File

@ -274,7 +274,7 @@ static phys_addr_t dpaa2_mem_vtop(uint64_t vaddr)
if (dpaa2_virt_mode) if (dpaa2_virt_mode)
return vaddr; return vaddr;
memseg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr); memseg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
if (memseg) if (memseg)
return memseg->phys_addr + RTE_PTR_DIFF(vaddr, memseg->addr); return memseg->phys_addr + RTE_PTR_DIFF(vaddr, memseg->addr);
return (size_t)NULL; return (size_t)NULL;

View File

@ -117,9 +117,10 @@ rte_pci_unmap_device(struct rte_pci_device *dev)
} }
static int static int
find_max_end_va(const struct rte_memseg *ms, void *arg) find_max_end_va(const struct rte_memseg_list *msl, void *arg)
{ {
void *end_va = RTE_PTR_ADD(ms->addr, ms->len); size_t sz = msl->memseg_arr.len * msl->page_sz;
void *end_va = RTE_PTR_ADD(msl->base_va, sz);
void **max_va = arg; void **max_va = arg;
if (*max_va < end_va) if (*max_va < end_va)
@ -132,10 +133,11 @@ pci_find_max_end_va(void)
{ {
void *va = NULL; void *va = NULL;
rte_memseg_walk(find_max_end_va, &va); rte_memseg_list_walk(find_max_end_va, &va);
return va; return va;
} }
/* parse one line of the "resource" sysfs file (note that the 'line' /* parse one line of the "resource" sysfs file (note that the 'line'
* string is modified) * string is modified)
*/ */

View File

@ -95,7 +95,7 @@ dpaa_mem_vtop(void *vaddr)
{ {
const struct rte_memseg *ms; const struct rte_memseg *ms;
ms = rte_mem_virt2memseg(vaddr); ms = rte_mem_virt2memseg(vaddr, NULL);
if (ms) if (ms)
return ms->iova + RTE_PTR_DIFF(vaddr, ms->addr); return ms->iova + RTE_PTR_DIFF(vaddr, ms->addr);
return (size_t)NULL; return (size_t)NULL;

View File

@ -141,10 +141,10 @@ mlx4_mr_get(struct priv *priv, struct rte_mempool *mp)
(void *)mp, (void *)start, (void *)end, (void *)mp, (void *)start, (void *)end,
(size_t)(end - start)); (size_t)(end - start));
/* Round start and end to page boundary if found in memory segments. */ /* Round start and end to page boundary if found in memory segments. */
ms = rte_mem_virt2memseg((void *)start); ms = rte_mem_virt2memseg((void *)start, NULL);
if (ms != NULL) if (ms != NULL)
start = RTE_ALIGN_FLOOR(start, ms->hugepage_sz); start = RTE_ALIGN_FLOOR(start, ms->hugepage_sz);
ms = rte_mem_virt2memseg((void *)end); ms = rte_mem_virt2memseg((void *)end, NULL);
if (ms != NULL) if (ms != NULL)
end = RTE_ALIGN_CEIL(end, ms->hugepage_sz); end = RTE_ALIGN_CEIL(end, ms->hugepage_sz);

View File

@ -478,7 +478,8 @@ static struct rte_pci_driver mlx5_driver;
static void *uar_base; static void *uar_base;
static int static int
find_lower_va_bound(const struct rte_memseg *ms, void *arg) find_lower_va_bound(const struct rte_memseg_list *msl __rte_unused,
const struct rte_memseg *ms, void *arg)
{ {
void **addr = arg; void **addr = arg;

View File

@ -262,10 +262,10 @@ mlx5_mr_new(struct rte_eth_dev *dev, struct rte_mempool *mp)
mr->end = end; mr->end = end;
/* Round start and end to page boundary if found in memory segments. */ /* Round start and end to page boundary if found in memory segments. */
ms = rte_mem_virt2memseg((void *)start); ms = rte_mem_virt2memseg((void *)start, NULL);
if (ms != NULL) if (ms != NULL)
start = RTE_ALIGN_FLOOR(start, ms->hugepage_sz); start = RTE_ALIGN_FLOOR(start, ms->hugepage_sz);
ms = rte_mem_virt2memseg((void *)end); ms = rte_mem_virt2memseg((void *)end, NULL);
if (ms != NULL) if (ms != NULL)
end = RTE_ALIGN_CEIL(end, ms->hugepage_sz); end = RTE_ALIGN_CEIL(end, ms->hugepage_sz);

View File

@ -75,7 +75,8 @@ struct walk_arg {
uint32_t region_nr; uint32_t region_nr;
}; };
static int static int
add_memory_region(const struct rte_memseg *ms, size_t len, void *arg) add_memory_region(const struct rte_memseg_list *msl __rte_unused,
const struct rte_memseg *ms, size_t len, void *arg)
{ {
struct walk_arg *wa = arg; struct walk_arg *wa = arg;
struct vhost_memory_region *mr; struct vhost_memory_region *mr;
@ -95,7 +96,6 @@ add_memory_region(const struct rte_memseg *ms, size_t len, void *arg)
return 0; return 0;
} }
/* By default, vhost kernel module allows 64 regions, but DPDK allows /* By default, vhost kernel module allows 64 regions, but DPDK allows
* 256 segments. As a relief, below function merges those virtually * 256 segments. As a relief, below function merges those virtually
* adjacent memsegs into one region. * adjacent memsegs into one region.

View File

@ -64,8 +64,8 @@ static int mem_cfg_fd = -1;
static struct flock wr_lock = { static struct flock wr_lock = {
.l_type = F_WRLCK, .l_type = F_WRLCK,
.l_whence = SEEK_SET, .l_whence = SEEK_SET,
.l_start = offsetof(struct rte_mem_config, memseg), .l_start = offsetof(struct rte_mem_config, memsegs),
.l_len = sizeof(early_mem_config.memseg), .l_len = sizeof(early_mem_config.memsegs),
}; };
/* Address of global and public configuration */ /* Address of global and public configuration */
@ -430,11 +430,11 @@ eal_parse_args(int argc, char **argv)
} }
static int static int
check_socket(const struct rte_memseg *ms, void *arg) check_socket(const struct rte_memseg_list *msl, void *arg)
{ {
int *socket_id = arg; int *socket_id = arg;
if (ms->socket_id == *socket_id) if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0)
return 1; return 1;
return 0; return 0;
@ -447,10 +447,11 @@ eal_check_mem_on_local_socket(void)
socket_id = rte_lcore_to_socket_id(rte_config.master_lcore); socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);
if (rte_memseg_walk(check_socket, &socket_id) == 0) if (rte_memseg_list_walk(check_socket, &socket_id) == 0)
RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n"); RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n");
} }
static int static int
sync_func(__attribute__((unused)) void *arg) sync_func(__attribute__((unused)) void *arg)
{ {
@ -561,7 +562,6 @@ rte_eal_init(int argc, char **argv)
rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class(); rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class();
if (internal_config.no_hugetlbfs == 0 && if (internal_config.no_hugetlbfs == 0 &&
internal_config.process_type != RTE_PROC_SECONDARY &&
eal_hugepage_info_init() < 0) { eal_hugepage_info_init() < 0) {
rte_eal_init_alert("Cannot get hugepage information."); rte_eal_init_alert("Cannot get hugepage information.");
rte_errno = EACCES; rte_errno = EACCES;

View File

@ -47,12 +47,18 @@ eal_hugepage_info_init(void)
struct hugepage_info *hpi = &internal_config.hugepage_info[0]; struct hugepage_info *hpi = &internal_config.hugepage_info[0];
struct hugepage_info *tmp_hpi; struct hugepage_info *tmp_hpi;
internal_config.num_hugepage_sizes = 1;
/* nothing more to be done for secondary */
if (rte_eal_process_type() == RTE_PROC_SECONDARY)
return 0;
sysctl_size = sizeof(num_buffers); sysctl_size = sizeof(num_buffers);
error = sysctlbyname("hw.contigmem.num_buffers", &num_buffers, error = sysctlbyname("hw.contigmem.num_buffers", &num_buffers,
&sysctl_size, NULL, 0); &sysctl_size, NULL, 0);
if (error != 0) { if (error != 0) {
RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.num_buffers"); RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.num_buffers\n");
return -1; return -1;
} }
@ -61,7 +67,7 @@ eal_hugepage_info_init(void)
&sysctl_size, NULL, 0); &sysctl_size, NULL, 0);
if (error != 0) { if (error != 0) {
RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.buffer_size"); RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.buffer_size\n");
return -1; return -1;
} }
@ -81,22 +87,21 @@ eal_hugepage_info_init(void)
RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dKB\n", RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dKB\n",
num_buffers, (int)(buffer_size>>10)); num_buffers, (int)(buffer_size>>10));
internal_config.num_hugepage_sizes = 1;
hpi->hugedir = CONTIGMEM_DEV; hpi->hugedir = CONTIGMEM_DEV;
hpi->hugepage_sz = buffer_size; hpi->hugepage_sz = buffer_size;
hpi->num_pages[0] = num_buffers; hpi->num_pages[0] = num_buffers;
hpi->lock_descriptor = fd; hpi->lock_descriptor = fd;
tmp_hpi = create_shared_memory(eal_hugepage_info_path(), tmp_hpi = create_shared_memory(eal_hugepage_info_path(),
sizeof(struct hugepage_info)); sizeof(internal_config.hugepage_info));
if (tmp_hpi == NULL ) { if (tmp_hpi == NULL ) {
RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
return -1; return -1;
} }
memcpy(tmp_hpi, hpi, sizeof(struct hugepage_info)); memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info));
if ( munmap(tmp_hpi, sizeof(struct hugepage_info)) < 0) { if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
return -1; return -1;
} }

View File

@ -6,6 +6,8 @@
#include <sys/types.h> #include <sys/types.h>
#include <sys/sysctl.h> #include <sys/sysctl.h>
#include <inttypes.h> #include <inttypes.h>
#include <errno.h>
#include <string.h>
#include <fcntl.h> #include <fcntl.h>
#include <rte_eal.h> #include <rte_eal.h>
@ -41,37 +43,135 @@ rte_eal_hugepage_init(void)
struct rte_mem_config *mcfg; struct rte_mem_config *mcfg;
uint64_t total_mem = 0; uint64_t total_mem = 0;
void *addr; void *addr;
unsigned i, j, seg_idx = 0; unsigned int i, j, seg_idx = 0;
/* get pointer to global configuration */ /* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config; mcfg = rte_eal_get_configuration()->mem_config;
/* for debug purposes, hugetlbfs can be disabled */ /* for debug purposes, hugetlbfs can be disabled */
if (internal_config.no_hugetlbfs) { if (internal_config.no_hugetlbfs) {
addr = malloc(internal_config.memory); struct rte_memseg_list *msl;
mcfg->memseg[0].iova = (rte_iova_t)(uintptr_t)addr; struct rte_fbarray *arr;
mcfg->memseg[0].addr = addr; struct rte_memseg *ms;
mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K; uint64_t page_sz;
mcfg->memseg[0].len = internal_config.memory; int n_segs, cur_seg;
mcfg->memseg[0].socket_id = 0;
/* create a memseg list */
msl = &mcfg->memsegs[0];
page_sz = RTE_PGSIZE_4K;
n_segs = internal_config.memory / page_sz;
if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs,
sizeof(struct rte_memseg))) {
RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
return -1;
}
addr = mmap(NULL, internal_config.memory,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
if (addr == MAP_FAILED) {
RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
strerror(errno));
return -1;
}
msl->base_va = addr;
msl->page_sz = page_sz;
msl->socket_id = 0;
/* populate memsegs. each memseg is 1 page long */
for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
arr = &msl->memseg_arr;
ms = rte_fbarray_get(arr, cur_seg);
if (rte_eal_iova_mode() == RTE_IOVA_VA)
ms->iova = (uintptr_t)addr;
else
ms->iova = RTE_BAD_IOVA;
ms->addr = addr;
ms->hugepage_sz = page_sz;
ms->len = page_sz;
ms->socket_id = 0;
rte_fbarray_set_used(arr, cur_seg);
addr = RTE_PTR_ADD(addr, page_sz);
}
return 0; return 0;
} }
/* map all hugepages and sort them */ /* map all hugepages and sort them */
for (i = 0; i < internal_config.num_hugepage_sizes; i ++){ for (i = 0; i < internal_config.num_hugepage_sizes; i ++){
struct hugepage_info *hpi; struct hugepage_info *hpi;
uint64_t page_sz, mem_needed;
unsigned int n_pages, max_pages;
hpi = &internal_config.hugepage_info[i]; hpi = &internal_config.hugepage_info[i];
for (j = 0; j < hpi->num_pages[0]; j++) { page_sz = hpi->hugepage_sz;
max_pages = hpi->num_pages[0];
mem_needed = RTE_ALIGN_CEIL(internal_config.memory - total_mem,
page_sz);
n_pages = RTE_MIN(mem_needed / page_sz, max_pages);
for (j = 0; j < n_pages; j++) {
struct rte_memseg_list *msl;
struct rte_fbarray *arr;
struct rte_memseg *seg; struct rte_memseg *seg;
int msl_idx, ms_idx;
rte_iova_t physaddr; rte_iova_t physaddr;
int error; int error;
size_t sysctl_size = sizeof(physaddr); size_t sysctl_size = sizeof(physaddr);
char physaddr_str[64]; char physaddr_str[64];
addr = mmap(NULL, hpi->hugepage_sz, PROT_READ|PROT_WRITE, for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
MAP_SHARED, hpi->lock_descriptor, msl_idx++) {
j * EAL_PAGE_SIZE); bool empty;
msl = &mcfg->memsegs[msl_idx];
arr = &msl->memseg_arr;
if (msl->page_sz != page_sz)
continue;
empty = arr->count == 0;
/* we need 1, plus hole if not empty */
ms_idx = rte_fbarray_find_next_n_free(arr,
0, 1 + (empty ? 1 : 0));
/* memseg list is full? */
if (ms_idx < 0)
continue;
/* leave some space between memsegs, they are
* not IOVA contiguous, so they shouldn't be VA
* contiguous either.
*/
if (!empty)
ms_idx++;
break;
}
if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n",
RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE),
RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE));
return -1;
}
arr = &msl->memseg_arr;
seg = rte_fbarray_get(arr, ms_idx);
addr = RTE_PTR_ADD(msl->base_va,
(size_t)msl->page_sz * ms_idx);
/* address is already mapped in memseg list, so using
* MAP_FIXED here is safe.
*/
addr = mmap(addr, page_sz, PROT_READ|PROT_WRITE,
MAP_SHARED | MAP_FIXED,
hpi->lock_descriptor,
j * EAL_PAGE_SIZE);
if (addr == MAP_FAILED) { if (addr == MAP_FAILED) {
RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n", RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n",
j, hpi->hugedir); j, hpi->hugedir);
@ -88,23 +188,53 @@ rte_eal_hugepage_init(void)
return -1; return -1;
} }
seg = &mcfg->memseg[seg_idx++];
seg->addr = addr; seg->addr = addr;
seg->iova = physaddr; seg->iova = physaddr;
seg->hugepage_sz = hpi->hugepage_sz; seg->hugepage_sz = page_sz;
seg->len = hpi->hugepage_sz; seg->len = page_sz;
seg->nchannel = mcfg->nchannel; seg->nchannel = mcfg->nchannel;
seg->nrank = mcfg->nrank; seg->nrank = mcfg->nrank;
seg->socket_id = 0; seg->socket_id = 0;
rte_fbarray_set_used(arr, ms_idx);
RTE_LOG(INFO, EAL, "Mapped memory segment %u @ %p: physaddr:0x%" RTE_LOG(INFO, EAL, "Mapped memory segment %u @ %p: physaddr:0x%"
PRIx64", len %zu\n", PRIx64", len %zu\n",
seg_idx, addr, physaddr, hpi->hugepage_sz); seg_idx, addr, physaddr, page_sz);
if (total_mem >= internal_config.memory ||
seg_idx >= RTE_MAX_MEMSEG) total_mem += seg->len;
break;
} }
if (total_mem >= internal_config.memory)
break;
} }
if (total_mem < internal_config.memory) {
RTE_LOG(ERR, EAL, "Couldn't reserve requested memory, "
"requested: %" PRIu64 "M "
"available: %" PRIu64 "M\n",
internal_config.memory >> 20, total_mem >> 20);
return -1;
}
return 0;
}
/* Context handed to attach_segment() through the memseg walk's opaque argument. */
struct attach_walk_args {
/* file descriptor that attach_segment() passes to mmap() for every segment */
int fd_hugepage;
/* running segment counter: multiplied by EAL_PAGE_SIZE to form the mmap()
 * file offset, and incremented after each successful mapping
 */
int seg_idx;
};
static int
attach_segment(const struct rte_memseg_list *msl __rte_unused,
const struct rte_memseg *ms, void *arg)
{
struct attach_walk_args *wa = arg;
void *addr;
addr = mmap(ms->addr, ms->len, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, wa->fd_hugepage,
wa->seg_idx * EAL_PAGE_SIZE);
if (addr == MAP_FAILED || addr != ms->addr)
return -1;
wa->seg_idx++;
return 0; return 0;
} }
@ -113,8 +243,7 @@ rte_eal_hugepage_attach(void)
{ {
const struct hugepage_info *hpi; const struct hugepage_info *hpi;
int fd_hugepage_info, fd_hugepage = -1; int fd_hugepage_info, fd_hugepage = -1;
unsigned i = 0; unsigned int i;
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
/* Obtain a file descriptor for hugepage_info */ /* Obtain a file descriptor for hugepage_info */
fd_hugepage_info = open(eal_hugepage_info_path(), O_RDONLY); fd_hugepage_info = open(eal_hugepage_info_path(), O_RDONLY);
@ -124,41 +253,43 @@ rte_eal_hugepage_attach(void)
} }
/* Map the shared hugepage_info into the process address spaces */ /* Map the shared hugepage_info into the process address spaces */
hpi = mmap(NULL, sizeof(struct hugepage_info), PROT_READ, MAP_PRIVATE, hpi = mmap(NULL, sizeof(internal_config.hugepage_info),
fd_hugepage_info, 0); PROT_READ, MAP_PRIVATE, fd_hugepage_info, 0);
if (hpi == MAP_FAILED) { if (hpi == MAP_FAILED) {
RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path()); RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path());
goto error; goto error;
} }
/* Obtain a file descriptor for contiguous memory */ for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
fd_hugepage = open(hpi->hugedir, O_RDWR); const struct hugepage_info *cur_hpi = &hpi[i];
if (fd_hugepage < 0) { struct attach_walk_args wa;
RTE_LOG(ERR, EAL, "Could not open %s\n", hpi->hugedir);
goto error;
}
/* Map the contiguous memory into each memory segment */ memset(&wa, 0, sizeof(wa));
for (i = 0; i < hpi->num_pages[0]; i++) {
void *addr; /* Obtain a file descriptor for contiguous memory */
struct rte_memseg *seg = &mcfg->memseg[i]; fd_hugepage = open(cur_hpi->hugedir, O_RDWR);
if (fd_hugepage < 0) {
RTE_LOG(ERR, EAL, "Could not open %s\n",
cur_hpi->hugedir);
goto error;
}
wa.fd_hugepage = fd_hugepage;
wa.seg_idx = 0;
addr = mmap(seg->addr, hpi->hugepage_sz, PROT_READ|PROT_WRITE, /* Map the contiguous memory into each memory segment */
MAP_SHARED|MAP_FIXED, fd_hugepage, if (rte_memseg_walk(attach_segment, &wa) < 0) {
i * EAL_PAGE_SIZE);
if (addr == MAP_FAILED || addr != seg->addr) {
RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n", RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n",
i, hpi->hugedir); wa.seg_idx, cur_hpi->hugedir);
goto error; goto error;
} }
close(fd_hugepage);
fd_hugepage = -1;
} }
/* hugepage_info is no longer required */ /* hugepage_info is no longer required */
munmap((void *)(uintptr_t)hpi, sizeof(struct hugepage_info)); munmap((void *)(uintptr_t)hpi, sizeof(internal_config.hugepage_info));
close(fd_hugepage_info); close(fd_hugepage_info);
close(fd_hugepage);
return 0; return 0;
error: error:

View File

@ -13,6 +13,7 @@
#include <sys/mman.h> #include <sys/mman.h>
#include <sys/queue.h> #include <sys/queue.h>
#include <rte_fbarray.h>
#include <rte_memory.h> #include <rte_memory.h>
#include <rte_eal.h> #include <rte_eal.h>
#include <rte_eal_memconfig.h> #include <rte_eal_memconfig.h>
@ -30,6 +31,8 @@
* which is a multiple of hugepage size. * which is a multiple of hugepage size.
*/ */
#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
static uint64_t baseaddr_offset; static uint64_t baseaddr_offset;
static uint64_t system_page_sz; static uint64_t system_page_sz;
@ -120,15 +123,394 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
return aligned_addr; return aligned_addr;
} }
/* static uint64_t
* Return a pointer to a read-only table of struct rte_physmem_desc get_mem_amount(uint64_t page_sz, uint64_t max_mem)
* elements, containing the layout of all addressable physical
* memory. The last element of the table contains a NULL address.
*/
const struct rte_memseg *
rte_eal_get_physmem_layout(void)
{ {
return rte_eal_get_configuration()->mem_config->memseg; uint64_t area_sz, max_pages;
/* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */
max_pages = RTE_MAX_MEMSEG_PER_LIST;
max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem);
area_sz = RTE_MIN(page_sz * max_pages, max_mem);
/* make sure the list isn't smaller than the page size */
area_sz = RTE_MAX(area_sz, page_sz);
return RTE_ALIGN(area_sz, page_sz);
}
/*
 * Initialise the bookkeeping of a single memseg list.
 *
 * The list capacity is derived from get_mem_amount(page_sz, max_mem) and the
 * backing fbarray is named after the page size (in kB), the socket and the
 * per-type list index. No virtual address space is reserved here: base_va is
 * left NULL for the caller to fill in later.
 *
 * Returns 0 on success, -1 if the fbarray could not be created.
 */
static int
alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
		uint64_t max_mem, int socket_id, int type_msl_idx)
{
	char fbarray_name[RTE_FBARRAY_NAME_LEN];
	const uint64_t list_bytes = get_mem_amount(page_sz, max_mem);
	const int n_segs = list_bytes / page_sz;
	int rc;

	snprintf(fbarray_name, sizeof(fbarray_name), MEMSEG_LIST_FMT,
			page_sz >> 10, socket_id, type_msl_idx);

	rc = rte_fbarray_init(&msl->memseg_arr, fbarray_name, n_segs,
			sizeof(struct rte_memseg));
	if (rc != 0) {
		RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
			rte_strerror(rte_errno));
		return -1;
	}

	/* VA space is attached separately, so the list starts without one */
	msl->base_va = NULL;
	msl->socket_id = socket_id;
	msl->page_sz = page_sz;

	RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
			(size_t)page_sz >> 10, socket_id);

	return 0;
}
/*
 * Reserve the virtual address range that backs a memseg list.
 *
 * The requested length is page_sz * memseg_arr.len bytes; the list's current
 * base_va is passed to eal_get_virtual_area() as the requested address. On
 * success the returned address is stored back into msl->base_va.
 *
 * Returns 0 on success, -1 (after logging the reason) on failure.
 */
static int
alloc_va_space(struct rte_memseg_list *msl)
{
#ifdef RTE_ARCH_PPC_64
	const int map_flags = MAP_HUGETLB;
#else
	const int map_flags = 0;
#endif
	const uint64_t pgsz = msl->page_sz;
	size_t va_len = pgsz * msl->memseg_arr.len;
	void *va;

	va = eal_get_virtual_area(msl->base_va, &va_len, pgsz, 0, map_flags);
	if (va != NULL) {
		msl->base_va = va;
		return 0;
	}

	/* reservation failed: report the specific cause when it is known */
	if (rte_errno != EADDRNOTAVAIL) {
		RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
		return -1;
	}

	RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
		(unsigned long long)va_len, msl->base_va);
	return -1;
}
/*
 * Create memseg lists and reserve their VA space for a 32-bit primary process.
 *
 * Nothing is done (and 0 is returned) when hugetlbfs is disabled or when
 * legacy memory mode is selected. Otherwise the global VA budget of
 * RTE_MAX_MEM_MB megabytes is distributed as follows:
 *  - with --socket-mem, every socket that requested memory gets its request
 *    plus an equal share of whatever is left of the budget;
 *  - without it, the whole budget goes to the master lcore's socket.
 * For each selected socket, memseg lists are allocated per hugepage size (in
 * the order hugepage sizes are stored in internal_config) until either the
 * socket's share or RTE_MAX_MEMSEG_PER_TYPE segments is reached.
 *
 * Returns 0 on success, -1 if the request exceeds the budget, the memseg
 * list table is exhausted, or a list / its VA space cannot be allocated.
 */
static int __rte_unused
memseg_primary_init_32(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int active_sockets, hpi_idx, msl_idx = 0;
	unsigned int socket_id, i;
	unsigned int master_lcore_socket;
	struct rte_memseg_list *msl;
	uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem;
	uint64_t max_mem;

	/* no-huge does not need this at all */
	if (internal_config.no_hugetlbfs)
		return 0;

	/* this is a giant hack, but desperate times call for desperate
	 * measures. in legacy 32-bit mode, we cannot preallocate VA space,
	 * because having upwards of 2 gigabytes of VA space already mapped will
	 * interfere with our ability to map and sort hugepages.
	 *
	 * therefore, in legacy 32-bit mode, we will be initializing memseg
	 * lists much later - in eal_memory.c, right after we unmap all the
	 * unneeded pages. this will not affect secondary processes, as those
	 * should be able to mmap the space without (too many) problems.
	 */
	if (internal_config.legacy_mem)
		return 0;

	/* 32-bit mode is a very special case. we cannot know in advance where
	 * the user will want to allocate their memory, so we have to do some
	 * heuristics.
	 */
	active_sockets = 0;
	total_requested_mem = 0;
	if (internal_config.force_sockets) {
		for (i = 0; i < rte_socket_count(); i++) {
			uint64_t mem;

			socket_id = rte_socket_id_by_idx(i);
			mem = internal_config.socket_mem[socket_id];

			if (mem == 0)
				continue;

			active_sockets++;
			total_requested_mem += mem;
		}
	} else {
		total_requested_mem = internal_config.memory;
	}

	/* the budget is the global VA limit, not the per-type limit: the
	 * per-type limit is far larger than what a 32-bit address space can
	 * hold, so using it would disable the check below and oversize the
	 * "extra" memory handed to each socket.
	 */
	max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
	if (total_requested_mem > max_mem) {
		RTE_LOG(ERR, EAL, "Invalid parameters: 32-bit process can at most use %uM of memory\n",
				(unsigned int)(max_mem >> 20));
		return -1;
	}
	total_extra_mem = max_mem - total_requested_mem;
	extra_mem_per_socket = active_sockets == 0 ? total_extra_mem :
			total_extra_mem / active_sockets;

	/* the allocation logic is a little bit convoluted, but here's how it
	 * works, in a nutshell:
	 *  - if user hasn't specified on which sockets to allocate memory via
	 *    --socket-mem, we allocate all of our memory on master core socket.
	 *  - if user has specified sockets to allocate memory on, there may be
	 *    some "unused" memory left (e.g. if user has specified --socket-mem
	 *    such that not all memory adds up to 2 gigabytes), so add it to all
	 *    sockets that are in use equally.
	 *
	 * page sizes are sorted by size in descending order, so we can safely
	 * assume that we dispense with bigger page sizes first.
	 */

	/* the master lcore's socket does not change while we iterate */
	master_lcore_socket = rte_lcore_to_socket_id(
			rte_eal_get_configuration()->master_lcore);

	/* create memseg lists */
	for (i = 0; i < rte_socket_count(); i++) {
		int hp_sizes = (int) internal_config.num_hugepage_sizes;
		uint64_t max_socket_mem, cur_socket_mem;
		bool skip;

		socket_id = rte_socket_id_by_idx(i);

#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
		if (socket_id > 0)
			break;
#endif

		/* if we didn't specifically request memory on this socket */
		skip = active_sockets != 0 &&
				internal_config.socket_mem[socket_id] == 0;
		/* ...or if we didn't specifically request memory on *any*
		 * socket, and this is not master lcore
		 */
		skip |= active_sockets == 0 && socket_id != master_lcore_socket;

		if (skip) {
			RTE_LOG(DEBUG, EAL, "Will not preallocate memory on socket %u\n",
					socket_id);
			continue;
		}

		/* max amount of memory on this socket */
		max_socket_mem = (active_sockets != 0 ?
					internal_config.socket_mem[socket_id] :
					internal_config.memory) +
					extra_mem_per_socket;
		cur_socket_mem = 0;

		for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) {
			uint64_t max_pagesz_mem, cur_pagesz_mem = 0;
			uint64_t hugepage_sz;
			struct hugepage_info *hpi;
			int type_msl_idx, max_segs, total_segs = 0;

			hpi = &internal_config.hugepage_info[hpi_idx];
			hugepage_sz = hpi->hugepage_sz;

			max_segs = RTE_MAX_MEMSEG_PER_TYPE;
			max_pagesz_mem = max_socket_mem - cur_socket_mem;

			/* make it multiple of page size */
			max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem,
					hugepage_sz);

			RTE_LOG(DEBUG, EAL, "Attempting to preallocate "
					"%" PRIu64 "M on socket %i\n",
					max_pagesz_mem >> 20, socket_id);

			type_msl_idx = 0;
			while (cur_pagesz_mem < max_pagesz_mem &&
					total_segs < max_segs) {
				if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
					RTE_LOG(ERR, EAL,
						"No more space in memseg lists, please increase %s\n",
						RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
					return -1;
				}

				msl = &mcfg->memsegs[msl_idx++];

				if (alloc_memseg_list(msl, hugepage_sz,
						max_pagesz_mem, socket_id,
						type_msl_idx))
					return -1;

				total_segs += msl->memseg_arr.len;
				cur_pagesz_mem = total_segs * hugepage_sz;
				type_msl_idx++;

				if (alloc_va_space(msl)) {
					RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
					return -1;
				}
			}
			cur_socket_mem += cur_pagesz_mem;
		}
	}

	return 0;
}
/*
 * Create and reserve the memseg lists for a primary process.
 *
 * For every hugepage size, and for every socket, memseg lists are allocated
 * one after another until either the per-type memory budget or the per-type
 * segment limit (RTE_MAX_MEMSEG_PER_TYPE) is reached. The per-type budget is
 * the smaller of RTE_MAX_MEM_MB_PER_TYPE and whatever is still left of the
 * global RTE_MAX_MEM_MB limit. VA space is reserved for each list right
 * after it has been allocated.
 *
 * When RTE_EAL_NUMA_AWARE_HUGEPAGES is not defined, the socket loop stops at
 * the first socket whose ID is greater than zero.
 *
 * Returns 0 on success (including the no-hugetlbfs case, where nothing is
 * done), or -1 if we run out of memseg list slots or an allocation fails.
 */
static int __rte_unused
memseg_primary_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	uint64_t mem_limit, mem_claimed;
	int next_msl = 0;
	int hp, sock;

	/* no-huge does not need this at all */
	if (internal_config.no_hugetlbfs)
		return 0;

	mem_limit = (uint64_t)RTE_MAX_MEM_MB << 20;
	mem_claimed = 0;

	/* create memseg lists */
	for (hp = 0; hp < (int) internal_config.num_hugepage_sizes; hp++) {
		const uint64_t pagesz =
			internal_config.hugepage_info[hp].hugepage_sz;

		for (sock = 0; sock < (int) rte_socket_count(); sock++) {
			const int socket_id = rte_socket_id_by_idx(sock);
			const int seg_cap = RTE_MAX_MEMSEG_PER_TYPE;
			uint64_t type_budget, type_claimed = 0;
			int type_segs = 0;
			int type_msl_idx;

#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
			if (socket_id > 0)
				break;
#endif
			/* never hand out more than what is left globally */
			type_budget = RTE_MIN(mem_limit - mem_claimed,
				(uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20);

			for (type_msl_idx = 0;
					type_claimed < type_budget &&
					type_segs < seg_cap;
					type_msl_idx++) {
				struct rte_memseg_list *msl;

				if (next_msl >= RTE_MAX_MEMSEG_LISTS) {
					RTE_LOG(ERR, EAL,
						"No more space in memseg lists, please increase %s\n",
						RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
					return -1;
				}

				/* take the next free slot in the global table */
				msl = &mcfg->memsegs[next_msl];
				next_msl++;

				/* size the list by what this type may still use */
				if (alloc_memseg_list(msl, pagesz,
						type_budget - type_claimed,
						socket_id, type_msl_idx))
					return -1;

				type_segs += msl->memseg_arr.len;
				type_claimed = type_segs * pagesz;

				if (alloc_va_space(msl)) {
					RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
					return -1;
				}
			}
			mem_claimed += type_claimed;
		}
	}
	return 0;
}
static int
memseg_secondary_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int msl_idx = 0;
struct rte_memseg_list *msl;
for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
msl = &mcfg->memsegs[msl_idx];
/* skip empty memseg lists */
if (msl->memseg_arr.len == 0)
continue;
if (rte_fbarray_attach(&msl->memseg_arr)) {
RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n");
return -1;
}
/* preallocate VA space */
if (alloc_va_space(msl)) {
RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n");
return -1;
}
}
return 0;
}
/*
 * Translate a virtual address into the memseg that covers it, looking only
 * inside the given memseg list.
 *
 * The list spans [base_va, base_va + page_sz * memseg_arr.len); the memseg
 * index is the offset of addr from base_va divided by the page size.
 *
 * addr: virtual address to look up.
 * msl:  memseg list to search. May be NULL: rte_mem_virt2memseg() passes the
 *       result of rte_mem_virt2memseg_list() straight through, and that
 *       lookup yields NULL for an address outside every list.
 *
 * Returns NULL if msl is NULL or addr lies outside the list's VA range,
 * otherwise whatever rte_fbarray_get() returns for the computed index.
 */
static struct rte_memseg *
virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
	const struct rte_fbarray *arr;
	void *start, *end;
	int ms_idx;

	/* no list to search in: report "not found" instead of dereferencing */
	if (msl == NULL)
		return NULL;

	/* a memseg list was specified, check if it's the right one */
	start = msl->base_va;
	end = RTE_PTR_ADD(start, (size_t)msl->page_sz * msl->memseg_arr.len);

	if (addr < start || addr >= end)
		return NULL;

	/* now, calculate index */
	arr = &msl->memseg_arr;
	ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz;
	return rte_fbarray_get(arr, ms_idx);
}
/*
 * Find the memseg list whose VA range contains the given address.
 *
 * Every slot of the memseg list table is checked in order; a list covers
 * [base_va, base_va + page_sz * memseg_arr.len).
 *
 * Returns the first matching list, or NULL if no list contains addr.
 */
static struct rte_memseg_list *
virt2memseg_list(const void *addr)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int idx;

	for (idx = 0; idx < RTE_MAX_MEMSEG_LISTS; idx++) {
		struct rte_memseg_list *cand = &mcfg->memsegs[idx];
		void *lo = cand->base_va;
		void *hi = RTE_PTR_ADD(lo,
				(size_t)cand->page_sz * cand->memseg_arr.len);

		if (addr >= lo && addr < hi)
			return cand;
	}

	/* if we didn't find our memseg list */
	return NULL;
}
/*
 * Public entry point for looking up the memseg list that a virtual address
 * belongs to. Simply forwards to virt2memseg_list().
 *
 * addr: virtual address to look up.
 *
 * Returns the memseg list containing addr, or NULL if no list contains it.
 */
__rte_experimental struct rte_memseg_list *
rte_mem_virt2memseg_list(const void *addr)
{
	return virt2memseg_list(addr);
}
struct virtiova { struct virtiova {
@ -136,7 +518,8 @@ struct virtiova {
void *virt; void *virt;
}; };
static int static int
find_virt(const struct rte_memseg *ms, void *arg) find_virt(const struct rte_memseg_list *msl __rte_unused,
const struct rte_memseg *ms, void *arg)
{ {
struct virtiova *vi = arg; struct virtiova *vi = arg;
if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) { if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) {
@ -147,6 +530,19 @@ find_virt(const struct rte_memseg *ms, void *arg)
} }
return 0; return 0;
} }
/*
 * Contiguous-walk callback used to resolve an IOVA to a virtual address.
 *
 * arg points to a struct virtiova carrying the IOVA being searched for. If
 * that IOVA falls inside [ms->iova, ms->iova + len), the matching virtual
 * address is stored in the struct and the walk is stopped.
 *
 * Returns 1 when the address was found (stop the walk), 0 to keep walking.
 */
static int
find_virt_legacy(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, size_t len, void *arg)
{
	struct virtiova *query = arg;
	size_t delta;

	/* not within this contiguous area, move on to the next one */
	if (query->iova < ms->iova || query->iova >= (ms->iova + len))
		return 0;

	delta = query->iova - ms->iova;
	query->virt = RTE_PTR_ADD(ms->addr, delta);

	/* stop the walk */
	return 1;
}
__rte_experimental void * __rte_experimental void *
rte_mem_iova2virt(rte_iova_t iova) rte_mem_iova2virt(rte_iova_t iova)
@ -156,54 +552,30 @@ rte_mem_iova2virt(rte_iova_t iova)
memset(&vi, 0, sizeof(vi)); memset(&vi, 0, sizeof(vi));
vi.iova = iova; vi.iova = iova;
rte_memseg_walk(find_virt, &vi); /* for legacy mem, we can get away with scanning VA-contiguous segments,
* as we know they are PA-contiguous as well
*/
if (internal_config.legacy_mem)
rte_memseg_contig_walk(find_virt_legacy, &vi);
else
rte_memseg_walk(find_virt, &vi);
return vi.virt; return vi.virt;
} }
struct virtms {
const void *virt;
struct rte_memseg *ms;
};
static int
find_memseg(const struct rte_memseg *ms, void *arg)
{
struct virtms *vm = arg;
if (arg >= ms->addr && arg < RTE_PTR_ADD(ms->addr, ms->len)) {
struct rte_memseg *memseg, *found_ms;
int idx;
memseg = rte_eal_get_configuration()->mem_config->memseg;
idx = ms - memseg;
found_ms = &memseg[idx];
vm->ms = found_ms;
return 1;
}
return 0;
}
__rte_experimental struct rte_memseg * __rte_experimental struct rte_memseg *
rte_mem_virt2memseg(const void *addr) rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{ {
struct virtms vm; return virt2memseg(addr, msl != NULL ? msl :
rte_mem_virt2memseg_list(addr));
memset(&vm, 0, sizeof(vm));
vm.virt = addr;
rte_memseg_walk(find_memseg, &vm);
return vm.ms;
} }
static int static int
physmem_size(const struct rte_memseg *ms, void *arg) physmem_size(const struct rte_memseg_list *msl, void *arg)
{ {
uint64_t *total_len = arg; uint64_t *total_len = arg;
*total_len += ms->len; *total_len += msl->memseg_arr.count * msl->page_sz;
return 0; return 0;
} }
@ -214,32 +586,39 @@ rte_eal_get_physmem_size(void)
{ {
uint64_t total_len = 0; uint64_t total_len = 0;
rte_memseg_walk(physmem_size, &total_len); rte_memseg_list_walk(physmem_size, &total_len);
return total_len; return total_len;
} }
static int static int
dump_memseg(const struct rte_memseg *ms, void *arg) dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
void *arg)
{ {
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int i = ms - mcfg->memseg; int msl_idx, ms_idx;
FILE *f = arg; FILE *f = arg;
if (i < 0 || i >= RTE_MAX_MEMSEG) msl_idx = msl - mcfg->memsegs;
if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
return -1; return -1;
fprintf(f, "Segment %u: IOVA:0x%"PRIx64", len:%zu, " ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
if (ms_idx < 0)
return -1;
fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, "
"virt:%p, socket_id:%"PRId32", " "virt:%p, socket_id:%"PRId32", "
"hugepage_sz:%"PRIu64", nchannel:%"PRIx32", " "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
"nrank:%"PRIx32"\n", i, "nrank:%"PRIx32"\n",
mcfg->memseg[i].iova, msl_idx, ms_idx,
mcfg->memseg[i].len, ms->iova,
mcfg->memseg[i].addr, ms->len,
mcfg->memseg[i].socket_id, ms->addr,
mcfg->memseg[i].hugepage_sz, ms->socket_id,
mcfg->memseg[i].nchannel, ms->hugepage_sz,
mcfg->memseg[i].nrank); ms->nchannel,
ms->nrank);
return 0; return 0;
} }
@ -289,55 +668,89 @@ rte_mem_lock_page(const void *virt)
} }
int __rte_experimental int __rte_experimental
rte_memseg_walk(rte_memseg_walk_t func, void *arg) rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
{ {
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int i, ret; int i, ms_idx, ret = 0;
for (i = 0; i < RTE_MAX_MEMSEG; i++) { for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
const struct rte_memseg *ms = &mcfg->memseg[i]; struct rte_memseg_list *msl = &mcfg->memsegs[i];
const struct rte_memseg *ms;
struct rte_fbarray *arr;
if (ms->addr == NULL) if (msl->memseg_arr.count == 0)
continue; continue;
ret = func(ms, arg); arr = &msl->memseg_arr;
if (ret < 0)
return -1; ms_idx = rte_fbarray_find_next_used(arr, 0);
if (ret > 0) while (ms_idx >= 0) {
return 1; int n_segs;
size_t len;
ms = rte_fbarray_get(arr, ms_idx);
/* find how many more segments there are, starting with
* this one.
*/
n_segs = rte_fbarray_find_contig_used(arr, ms_idx);
len = n_segs * msl->page_sz;
ret = func(msl, ms, len, arg);
if (ret < 0)
return -1;
else if (ret > 0)
return 1;
ms_idx = rte_fbarray_find_next_used(arr,
ms_idx + n_segs);
}
} }
return 0; return 0;
} }
int __rte_experimental int __rte_experimental
rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg) rte_memseg_walk(rte_memseg_walk_t func, void *arg)
{ {
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int i, j, ret; int i, ms_idx, ret = 0;
for (i = 0; i < RTE_MAX_MEMSEG; i++) { for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
const struct rte_memseg *ms = &mcfg->memseg[i]; struct rte_memseg_list *msl = &mcfg->memsegs[i];
size_t total_len; const struct rte_memseg *ms;
void *end_addr; struct rte_fbarray *arr;
if (ms->addr == NULL) if (msl->memseg_arr.count == 0)
continue; continue;
end_addr = RTE_PTR_ADD(ms->addr, ms->len); arr = &msl->memseg_arr;
/* check how many more segments are contiguous to this one */ ms_idx = rte_fbarray_find_next_used(arr, 0);
for (j = i + 1; j < RTE_MAX_MEMSEG; j++) { while (ms_idx >= 0) {
const struct rte_memseg *next = &mcfg->memseg[j]; ms = rte_fbarray_get(arr, ms_idx);
ret = func(msl, ms, arg);
if (next->addr != end_addr) if (ret < 0)
break; return -1;
else if (ret > 0)
end_addr = RTE_PTR_ADD(next->addr, next->len); return 1;
i++; ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
} }
total_len = RTE_PTR_DIFF(end_addr, ms->addr); }
return 0;
}
ret = func(ms, total_len, arg); int __rte_experimental
rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int i, ret = 0;
for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
struct rte_memseg_list *msl = &mcfg->memsegs[i];
if (msl->base_va == NULL)
continue;
ret = func(msl, arg);
if (ret < 0) if (ret < 0)
return -1; return -1;
if (ret > 0) if (ret > 0)
@ -350,9 +763,25 @@ rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
int int
rte_eal_memory_init(void) rte_eal_memory_init(void)
{ {
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int retval;
RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n"); RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");
const int retval = rte_eal_process_type() == RTE_PROC_PRIMARY ? if (!mcfg)
return -1;
retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
#ifndef RTE_ARCH_64
memseg_primary_init_32() :
#else
memseg_primary_init() :
#endif
memseg_secondary_init();
if (retval < 0)
return -1;
retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
rte_eal_hugepage_init() : rte_eal_hugepage_init() :
rte_eal_hugepage_attach(); rte_eal_hugepage_attach();
if (retval < 0) if (retval < 0)

View File

@ -239,10 +239,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
mz->iova = rte_malloc_virt2iova(mz_addr); mz->iova = rte_malloc_virt2iova(mz_addr);
mz->addr = mz_addr; mz->addr = mz_addr;
mz->len = (requested_len == 0 ? elem->size : requested_len); mz->len = (requested_len == 0 ? elem->size : requested_len);
mz->hugepage_sz = elem->ms->hugepage_sz; mz->hugepage_sz = elem->msl->page_sz;
mz->socket_id = elem->ms->socket_id; mz->socket_id = elem->msl->socket_id;
mz->flags = 0; mz->flags = 0;
mz->memseg_id = elem->ms - rte_eal_get_configuration()->mem_config->memseg;
return mz; return mz;
} }
@ -364,20 +363,50 @@ static void
dump_memzone(const struct rte_memzone *mz, void *arg) dump_memzone(const struct rte_memzone *mz, void *arg)
{ {
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct rte_memseg_list *msl = NULL;
void *cur_addr, *mz_end;
struct rte_memseg *ms;
int mz_idx, ms_idx;
size_t page_sz;
FILE *f = arg; FILE *f = arg;
int mz_idx;
mz_idx = mz - mcfg->memzone; mz_idx = mz - mcfg->memzone;
fprintf(f, "Zone %u: name:<%s>, IO:0x%"PRIx64", len:0x%zx, virt:%p, " fprintf(f, "Zone %u: name:<%s>, len:0x%zx, virt:%p, "
"socket_id:%"PRId32", flags:%"PRIx32"\n", "socket_id:%"PRId32", flags:%"PRIx32"\n",
mz_idx, mz_idx,
mz->name, mz->name,
mz->iova,
mz->len, mz->len,
mz->addr, mz->addr,
mz->socket_id, mz->socket_id,
mz->flags); mz->flags);
/* go through each page occupied by this memzone */
msl = rte_mem_virt2memseg_list(mz->addr);
if (!msl) {
RTE_LOG(DEBUG, EAL, "Skipping bad memzone\n");
return;
}
page_sz = (size_t)mz->hugepage_sz;
cur_addr = RTE_PTR_ALIGN_FLOOR(mz->addr, page_sz);
mz_end = RTE_PTR_ADD(cur_addr, mz->len);
fprintf(f, "physical segments used:\n");
ms_idx = RTE_PTR_DIFF(mz->addr, msl->base_va) / page_sz;
ms = rte_fbarray_get(&msl->memseg_arr, ms_idx);
do {
fprintf(f, " addr: %p iova: 0x%" PRIx64 " "
"len: 0x%zx "
"pagesz: 0x%zx\n",
cur_addr, ms->iova, ms->len, page_sz);
/* advance VA to next page */
cur_addr = RTE_PTR_ADD(cur_addr, page_sz);
/* memzones occupy contiguous segments */
++ms;
} while (cur_addr < mz_end);
} }
/* Dump all reserved memory zones on console */ /* Dump all reserved memory zones on console */
@ -394,7 +423,6 @@ int
rte_eal_memzone_init(void) rte_eal_memzone_init(void)
{ {
struct rte_mem_config *mcfg; struct rte_mem_config *mcfg;
const struct rte_memseg *memseg;
/* get pointer to global configuration */ /* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config; mcfg = rte_eal_get_configuration()->mem_config;
@ -403,12 +431,6 @@ rte_eal_memzone_init(void)
if (rte_eal_process_type() == RTE_PROC_SECONDARY) if (rte_eal_process_type() == RTE_PROC_SECONDARY)
return 0; return 0;
memseg = rte_eal_get_physmem_layout();
if (memseg == NULL) {
RTE_LOG(ERR, EAL, "%s(): Cannot get physical layout\n", __func__);
return -1;
}
rte_rwlock_write_lock(&mcfg->mlock); rte_rwlock_write_lock(&mcfg->mlock);
/* delete all zones */ /* delete all zones */

View File

@ -22,7 +22,6 @@ struct hugepage_file {
size_t size; /**< the page size */ size_t size; /**< the page size */
int socket_id; /**< NUMA socket ID */ int socket_id; /**< NUMA socket ID */
int file_id; /**< the '%d' in HUGEFILE_FMT */ int file_id; /**< the '%d' in HUGEFILE_FMT */
int memseg_id; /**< the memory segment to which page belongs */
char filepath[MAX_HUGEPAGE_PATH]; /**< path to backing file on filesystem */ char filepath[MAX_HUGEPAGE_PATH]; /**< path to backing file on filesystem */
}; };

View File

@ -23,7 +23,7 @@ struct hugepage_info {
uint64_t hugepage_sz; /**< size of a huge page */ uint64_t hugepage_sz; /**< size of a huge page */
const char *hugedir; /**< dir where hugetlbfs is mounted */ const char *hugedir; /**< dir where hugetlbfs is mounted */
uint32_t num_pages[RTE_MAX_NUMA_NODES]; uint32_t num_pages[RTE_MAX_NUMA_NODES];
/**< number of hugepages of that size on each socket */ /**< number of hugepages of that size on each socket */
int lock_descriptor; /**< file descriptor for hugepage dir */ int lock_descriptor; /**< file descriptor for hugepage dir */
}; };

View File

@ -12,11 +12,29 @@
#include <rte_malloc_heap.h> #include <rte_malloc_heap.h>
#include <rte_rwlock.h> #include <rte_rwlock.h>
#include <rte_pause.h> #include <rte_pause.h>
#include <rte_fbarray.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
/**
* memseg list is a special case as we need to store a bunch of other data
* together with the array itself.
*
* A list describes one virtual address area that starts at base_va. Lookup
* code finds the memseg for an address by dividing the address' offset from
* base_va by page_sz and using the result as an index into memseg_arr.
*/
struct rte_memseg_list {
RTE_STD_C11
union {
void *base_va;
/**< Base virtual address for this memseg list. */
uint64_t addr_64;
/**< Makes sure addr is always 64-bits */
};
int socket_id; /**< Socket ID for all memsegs in this list. */
uint64_t page_sz; /**< Page size for all memsegs in this list. */
struct rte_fbarray memseg_arr;
/**< Array holding the memsegs of this list, indexed by page number. */
};
/** /**
* the structure for the memory configuration for the RTE. * the structure for the memory configuration for the RTE.
* Used by the rte_config structure. It is separated out, as for multi-process * Used by the rte_config structure. It is separated out, as for multi-process
@ -43,9 +61,11 @@ struct rte_mem_config {
uint32_t memzone_cnt; /**< Number of allocated memzones */ uint32_t memzone_cnt; /**< Number of allocated memzones */
/* memory segments and zones */ /* memory segments and zones */
struct rte_memseg memseg[RTE_MAX_MEMSEG]; /**< Physmem descriptors. */
struct rte_memzone memzone[RTE_MAX_MEMZONE]; /**< Memzone descriptors. */ struct rte_memzone memzone[RTE_MAX_MEMZONE]; /**< Memzone descriptors. */
struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS];
/**< list of dynamic arrays holding memsegs */
struct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; /**< Tailqs for objects */ struct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; /**< Tailqs for objects */
/* Heaps of Malloc per socket */ /* Heaps of Malloc per socket */

View File

@ -23,6 +23,9 @@ extern "C" {
#include <rte_compat.h> #include <rte_compat.h>
#include <rte_config.h> #include <rte_config.h>
/* forward declaration for pointers */
struct rte_memseg_list;
__extension__ __extension__
enum rte_page_sizes { enum rte_page_sizes {
RTE_PGSIZE_4K = 1ULL << 12, RTE_PGSIZE_4K = 1ULL << 12,
@ -147,11 +150,25 @@ rte_mem_iova2virt(rte_iova_t iova);
* *
* @param virt * @param virt
* The virtual address. * The virtual address.
* @param msl
* The memseg list in which to look up based on ``virt`` address
* (can be NULL).
* @return * @return
* Memseg pointer on success, or NULL on error. * Memseg pointer on success, or NULL on error.
*/ */
__rte_experimental struct rte_memseg * __rte_experimental struct rte_memseg *
rte_mem_virt2memseg(const void *virt); rte_mem_virt2memseg(const void *virt, const struct rte_memseg_list *msl);
/**
* Get memseg list corresponding to virtual memory address.
*
* @param virt
* The virtual address.
* @return
* Memseg list to which this virtual address belongs, or NULL if none does.
*/
__rte_experimental struct rte_memseg_list *
rte_mem_virt2memseg_list(const void *virt);
/** /**
* Memseg walk function prototype. * Memseg walk function prototype.
@ -160,7 +177,8 @@ rte_mem_virt2memseg(const void *virt);
* Returning 1 will stop the walk * Returning 1 will stop the walk
* Returning -1 will stop the walk and report error * Returning -1 will stop the walk and report error
*/ */
typedef int (*rte_memseg_walk_t)(const struct rte_memseg *ms, void *arg); typedef int (*rte_memseg_walk_t)(const struct rte_memseg_list *msl,
const struct rte_memseg *ms, void *arg);
/** /**
* Memseg contig walk function prototype. This will trigger a callback on every * Memseg contig walk function prototype. This will trigger a callback on every
@ -171,8 +189,19 @@ typedef int (*rte_memseg_walk_t)(const struct rte_memseg *ms, void *arg);
* Returning 1 will stop the walk * Returning 1 will stop the walk
* Returning -1 will stop the walk and report error * Returning -1 will stop the walk and report error
*/ */
typedef int (*rte_memseg_contig_walk_t)(const struct rte_memseg *ms, typedef int (*rte_memseg_contig_walk_t)(const struct rte_memseg_list *msl,
size_t len, void *arg); const struct rte_memseg *ms, size_t len, void *arg);
/**
* Memseg list walk function prototype. This will trigger a callback on every
* allocated memseg list.
*
* Returning 0 will continue walk
* Returning 1 will stop the walk
* Returning -1 will stop the walk and report error
*/
typedef int (*rte_memseg_list_walk_t)(const struct rte_memseg_list *msl,
void *arg);
/** /**
* Walk list of all memsegs. * Walk list of all memsegs.
@ -205,21 +234,19 @@ int __rte_experimental
rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg); rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg);
/** /**
* Get the layout of the available physical memory. * Walk each allocated memseg list.
*
* It can be useful for an application to have the full physical
* memory layout to decide the size of a memory zone to reserve. This
* table is stored in rte_config (see rte_eal_get_configuration()).
* *
* @param func
* Iterator function
* @param arg
* Argument passed to iterator
* @return * @return
* - On success, return a pointer to a read-only table of struct * 0 if walked over the entire list
* rte_physmem_desc elements, containing the layout of all * 1 if stopped by the user
* addressable physical memory. The last element of the table * -1 if user function reported error
* contains a NULL address.
* - On error, return NULL. This should not happen since it is a fatal
* error that will probably cause the entire system to panic.
*/ */
const struct rte_memseg *rte_eal_get_physmem_layout(void); int __rte_experimental
rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg);
/** /**
* Dump the physical memory layout to a file. * Dump the physical memory layout to a file.

View File

@ -68,7 +68,6 @@ struct rte_memzone {
int32_t socket_id; /**< NUMA socket ID. */ int32_t socket_id; /**< NUMA socket ID. */
uint32_t flags; /**< Characteristics of this memzone. */ uint32_t flags; /**< Characteristics of this memzone. */
uint32_t memseg_id; /**< Memseg it belongs. */
} __attribute__((__packed__)); } __attribute__((__packed__));
/** /**

View File

@ -27,11 +27,11 @@
* Initialize a general malloc_elem header structure * Initialize a general malloc_elem header structure
*/ */
void void
malloc_elem_init(struct malloc_elem *elem, malloc_elem_init(struct malloc_elem *elem, struct malloc_heap *heap,
struct malloc_heap *heap, const struct rte_memseg *ms, size_t size) struct rte_memseg_list *msl, size_t size)
{ {
elem->heap = heap; elem->heap = heap;
elem->ms = ms; elem->msl = msl;
elem->prev = NULL; elem->prev = NULL;
elem->next = NULL; elem->next = NULL;
memset(&elem->free_list, 0, sizeof(elem->free_list)); memset(&elem->free_list, 0, sizeof(elem->free_list));
@ -100,7 +100,7 @@ malloc_elem_insert(struct malloc_elem *elem)
* so we just check the page addresses. * so we just check the page addresses.
*/ */
static bool static bool
elem_check_phys_contig(const struct rte_memseg *ms __rte_unused, elem_check_phys_contig(const struct rte_memseg_list *msl __rte_unused,
void *start, size_t size) void *start, size_t size)
{ {
rte_iova_t cur, expected; rte_iova_t cur, expected;
@ -191,7 +191,7 @@ elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align,
* couldn't fit all data into one physically contiguous * couldn't fit all data into one physically contiguous
* block, try again with lower addresses. * block, try again with lower addresses.
*/ */
if (!elem_check_phys_contig(elem->ms, if (!elem_check_phys_contig(elem->msl,
(void *)new_data_start, (void *)new_data_start,
new_data_size)) { new_data_size)) {
elem_size -= align; elem_size -= align;
@ -225,7 +225,7 @@ split_elem(struct malloc_elem *elem, struct malloc_elem *split_pt)
const size_t old_elem_size = (uintptr_t)split_pt - (uintptr_t)elem; const size_t old_elem_size = (uintptr_t)split_pt - (uintptr_t)elem;
const size_t new_elem_size = elem->size - old_elem_size; const size_t new_elem_size = elem->size - old_elem_size;
malloc_elem_init(split_pt, elem->heap, elem->ms, new_elem_size); malloc_elem_init(split_pt, elem->heap, elem->msl, new_elem_size);
split_pt->prev = elem; split_pt->prev = elem;
split_pt->next = next_elem; split_pt->next = next_elem;
if (next_elem) if (next_elem)

View File

@ -7,7 +7,7 @@
#include <stdbool.h> #include <stdbool.h>
#include <rte_memory.h> #include <rte_eal_memconfig.h>
/* dummy definition of struct so we can use pointers to it in malloc_elem struct */ /* dummy definition of struct so we can use pointers to it in malloc_elem struct */
struct malloc_heap; struct malloc_heap;
@ -26,7 +26,7 @@ struct malloc_elem {
/**< points to next elem in memseg */ /**< points to next elem in memseg */
LIST_ENTRY(malloc_elem) free_list; LIST_ENTRY(malloc_elem) free_list;
/**< list of free elements in heap */ /**< list of free elements in heap */
const struct rte_memseg *ms; struct rte_memseg_list *msl;
volatile enum elem_state state; volatile enum elem_state state;
uint32_t pad; uint32_t pad;
size_t size; size_t size;
@ -113,7 +113,7 @@ malloc_elem_from_data(const void *data)
void void
malloc_elem_init(struct malloc_elem *elem, malloc_elem_init(struct malloc_elem *elem,
struct malloc_heap *heap, struct malloc_heap *heap,
const struct rte_memseg *ms, struct rte_memseg_list *msl,
size_t size); size_t size);
void void

View File

@ -21,6 +21,7 @@
#include <rte_memcpy.h> #include <rte_memcpy.h>
#include <rte_atomic.h> #include <rte_atomic.h>
#include "eal_internal_cfg.h"
#include "malloc_elem.h" #include "malloc_elem.h"
#include "malloc_heap.h" #include "malloc_heap.h"
@ -62,36 +63,49 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
} }
/* /*
* Expand the heap with a memseg. * Expand the heap with a memory area.
* This reserves the zone and sets a dummy malloc_elem header at the end
* to prevent overflow. The rest of the zone is added to free list as a single
* large free block
*/ */
/*
 * Add the memory area [start, start + len) to a heap.
 *
 * heap:  heap that receives the memory.
 * msl:   memseg list recorded in the new element.
 * start: start of the area; the element header is placed here.
 * len:   size of the area in bytes; also added to heap->total_size.
 *
 * Returns the element that ends up on the free list, i.e. the value handed
 * back by malloc_elem_join_adjacent_free().
 */
static struct malloc_elem *
malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,
void *start, size_t len)
{
/* the element header lives at the very start of the new area */
struct malloc_elem *elem = start;
malloc_elem_init(elem, heap, msl, len);
/* NOTE(review): presumably links the element into the heap's element
 * list - confirm against malloc_elem_insert()
 */
malloc_elem_insert(elem);
/* NOTE(review): presumably merges with neighbouring free elements, so
 * the returned element may differ from the one created above - confirm
 */
elem = malloc_elem_join_adjacent_free(elem);
malloc_elem_free_list_insert(elem);
/* account for the full length of the added area */
heap->total_size += len;
return elem;
}
static int static int
malloc_heap_add_memseg(const struct rte_memseg *ms, void *arg __rte_unused) malloc_add_seg(const struct rte_memseg_list *msl,
const struct rte_memseg *ms, size_t len, void *arg __rte_unused)
{ {
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct malloc_elem *start_elem; struct rte_memseg_list *found_msl;
struct rte_memseg *found_ms;
struct malloc_heap *heap; struct malloc_heap *heap;
size_t elem_size; int msl_idx;
int ms_idx;
heap = &mcfg->malloc_heaps[ms->socket_id]; heap = &mcfg->malloc_heaps[msl->socket_id];
/* ms is const, so find it */ /* msl is const, so find it */
ms_idx = ms - mcfg->memseg; msl_idx = msl - mcfg->memsegs;
found_ms = &mcfg->memseg[ms_idx]; found_msl = &mcfg->memsegs[msl_idx];
start_elem = (struct malloc_elem *)found_ms->addr; if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
elem_size = ms->len - MALLOC_ELEM_OVERHEAD; return -1;
malloc_elem_init(start_elem, heap, found_ms, elem_size); malloc_heap_add_memory(heap, found_msl, ms->addr, len);
malloc_elem_insert(start_elem);
malloc_elem_free_list_insert(start_elem);
heap->total_size += elem_size;
RTE_LOG(DEBUG, EAL, "Added %zuM to heap on socket %i\n", len >> 20,
msl->socket_id);
return 0; return 0;
} }
@ -114,7 +128,8 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
!!elem; elem = LIST_NEXT(elem, free_list)) { !!elem; elem = LIST_NEXT(elem, free_list)) {
if (malloc_elem_can_hold(elem, size, align, bound, if (malloc_elem_can_hold(elem, size, align, bound,
contig)) { contig)) {
if (check_hugepage_sz(flags, elem->ms->hugepage_sz)) if (check_hugepage_sz(flags,
elem->msl->page_sz))
return elem; return elem;
if (alt_elem == NULL) if (alt_elem == NULL)
alt_elem = elem; alt_elem = elem;
@ -263,7 +278,6 @@ rte_eal_malloc_heap_init(void)
if (mcfg == NULL) if (mcfg == NULL)
return -1; return -1;
rte_memseg_walk(malloc_heap_add_memseg, NULL); /* add all IOVA-contiguous areas to the heap */
return rte_memseg_contig_walk(malloc_add_seg, NULL);
return 0;
} }

View File

@ -242,17 +242,21 @@ rte_malloc_set_limit(__rte_unused const char *type,
rte_iova_t rte_iova_t
rte_malloc_virt2iova(const void *addr) rte_malloc_virt2iova(const void *addr)
{ {
rte_iova_t iova; const struct rte_memseg *ms;
const struct malloc_elem *elem = malloc_elem_from_data(addr); struct malloc_elem *elem = malloc_elem_from_data(addr);
if (elem == NULL) if (elem == NULL)
return RTE_BAD_IOVA; return RTE_BAD_IOVA;
if (elem->ms->iova == RTE_BAD_IOVA)
return RTE_BAD_IOVA;
if (rte_eal_iova_mode() == RTE_IOVA_VA) if (rte_eal_iova_mode() == RTE_IOVA_VA)
iova = (uintptr_t)addr; return (uintptr_t) addr;
else
iova = elem->ms->iova + ms = rte_mem_virt2memseg(addr, elem->msl);
RTE_PTR_DIFF(addr, elem->ms->addr); if (ms == NULL)
return iova; return RTE_BAD_IOVA;
if (ms->iova == RTE_BAD_IOVA)
return RTE_BAD_IOVA;
return ms->iova + RTE_PTR_DIFF(addr, ms->addr);
} }

View File

@ -74,8 +74,8 @@ static int mem_cfg_fd = -1;
static struct flock wr_lock = { static struct flock wr_lock = {
.l_type = F_WRLCK, .l_type = F_WRLCK,
.l_whence = SEEK_SET, .l_whence = SEEK_SET,
.l_start = offsetof(struct rte_mem_config, memseg), .l_start = offsetof(struct rte_mem_config, memsegs),
.l_len = sizeof(early_mem_config.memseg), .l_len = sizeof(early_mem_config.memsegs),
}; };
/* Address of global and public configuration */ /* Address of global and public configuration */
@ -640,11 +640,14 @@ eal_parse_args(int argc, char **argv)
} }
static int static int
check_mem(const struct rte_memseg *ms, void *arg) check_socket(const struct rte_memseg_list *msl, void *arg)
{ {
int *socket = arg; int *socket_id = arg;
return ms->socket_id == *socket; if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0)
return 1;
return 0;
} }
static void static void
@ -654,7 +657,7 @@ eal_check_mem_on_local_socket(void)
socket_id = rte_lcore_to_socket_id(rte_config.master_lcore); socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);
if (rte_memseg_walk(check_mem, &socket_id) == 0) if (rte_memseg_list_walk(check_socket, &socket_id) == 0)
RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n"); RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n");
} }

View File

@ -15,6 +15,7 @@
#include <unistd.h> #include <unistd.h>
#include <errno.h> #include <errno.h>
#include <sys/queue.h> #include <sys/queue.h>
#include <sys/stat.h>
#include <rte_memory.h> #include <rte_memory.h>
#include <rte_eal.h> #include <rte_eal.h>
@ -159,6 +160,18 @@ get_hugepage_dir(uint64_t hugepage_sz)
return retval; return retval;
} }
/*
 * Report the size of the file behind an open descriptor, as given by
 * fstat(). A failing fstat() is reported as a size of 0.
 */
static off_t
get_file_size(int fd)
{
	struct stat info;
	const int rc = fstat(fd, &info);

	return rc < 0 ? 0 : info.st_size;
}
/* /*
* Clear the hugepage directory of whatever hugepage files * Clear the hugepage directory of whatever hugepage files
* there are. Checks if the file is locked (i.e. * there are. Checks if the file is locked (i.e.
@ -189,6 +202,8 @@ clear_hugedir(const char * hugedir)
} }
while(dirent != NULL){ while(dirent != NULL){
struct flock lck = {0};
/* skip files that don't match the hugepage pattern */ /* skip files that don't match the hugepage pattern */
if (fnmatch(filter, dirent->d_name, 0) > 0) { if (fnmatch(filter, dirent->d_name, 0) > 0) {
dirent = readdir(dir); dirent = readdir(dir);
@ -205,11 +220,17 @@ clear_hugedir(const char * hugedir)
} }
/* non-blocking lock */ /* non-blocking lock */
lck_result = flock(fd, LOCK_EX | LOCK_NB); lck.l_type = F_RDLCK;
lck.l_whence = SEEK_SET;
lck.l_start = 0;
lck.l_len = get_file_size(fd);
lck_result = fcntl(fd, F_SETLK, &lck);
/* if lock succeeds, unlock and remove the file */ /* if lock succeeds, unlock and remove the file */
if (lck_result != -1) { if (lck_result != -1) {
flock(fd, LOCK_UN); lck.l_type = F_UNLCK;
fcntl(fd, F_SETLK, &lck);
unlinkat(dir_fd, dirent->d_name, 0); unlinkat(dir_fd, dirent->d_name, 0);
} }
close (fd); close (fd);

File diff suppressed because it is too large Load Diff

View File

@ -908,7 +908,8 @@ vfio_get_group_no(const char *sysfs_base,
} }
static int static int
type1_map(const struct rte_memseg *ms, void *arg) type1_map(const struct rte_memseg_list *msl __rte_unused,
const struct rte_memseg *ms, void *arg)
{ {
int *vfio_container_fd = arg; int *vfio_container_fd = arg;
@ -1021,7 +1022,8 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
} }
static int static int
vfio_spapr_map_walk(const struct rte_memseg *ms, void *arg) vfio_spapr_map_walk(const struct rte_memseg_list *msl __rte_unused,
const struct rte_memseg *ms, void *arg)
{ {
int *vfio_container_fd = arg; int *vfio_container_fd = arg;
@ -1034,7 +1036,8 @@ struct spapr_walk_param {
uint64_t hugepage_sz; uint64_t hugepage_sz;
}; };
static int static int
vfio_spapr_window_size_walk(const struct rte_memseg *ms, void *arg) vfio_spapr_window_size_walk(const struct rte_memseg_list *msl __rte_unused,
const struct rte_memseg *ms, void *arg)
{ {
struct spapr_walk_param *param = arg; struct spapr_walk_param *param = arg;
uint64_t max = ms->iova + ms->len; uint64_t max = ms->iova + ms->len;

View File

@ -25,7 +25,6 @@ DPDK_2.0 {
rte_eal_devargs_type_count; rte_eal_devargs_type_count;
rte_eal_get_configuration; rte_eal_get_configuration;
rte_eal_get_lcore_state; rte_eal_get_lcore_state;
rte_eal_get_physmem_layout;
rte_eal_get_physmem_size; rte_eal_get_physmem_size;
rte_eal_has_hugepages; rte_eal_has_hugepages;
rte_eal_hpet_init; rte_eal_hpet_init;
@ -241,7 +240,9 @@ EXPERIMENTAL {
rte_malloc_dump_heaps; rte_malloc_dump_heaps;
rte_mem_iova2virt; rte_mem_iova2virt;
rte_mem_virt2memseg; rte_mem_virt2memseg;
rte_mem_virt2memseg_list;
rte_memseg_contig_walk; rte_memseg_contig_walk;
rte_memseg_list_walk;
rte_memseg_walk; rte_memseg_walk;
rte_mp_action_register; rte_mp_action_register;
rte_mp_action_unregister; rte_mp_action_unregister;

View File

@ -100,12 +100,12 @@ static unsigned optimize_object_size(unsigned obj_size)
} }
static int static int
find_min_pagesz(const struct rte_memseg *ms, void *arg) find_min_pagesz(const struct rte_memseg_list *msl, void *arg)
{ {
size_t *min = arg; size_t *min = arg;
if (ms->hugepage_sz < *min) if (msl->page_sz < *min)
*min = ms->hugepage_sz; *min = msl->page_sz;
return 0; return 0;
} }
@ -115,11 +115,12 @@ get_min_page_size(void)
{ {
size_t min_pagesz = SIZE_MAX; size_t min_pagesz = SIZE_MAX;
rte_memseg_walk(find_min_pagesz, &min_pagesz); rte_memseg_list_walk(find_min_pagesz, &min_pagesz);
return min_pagesz == SIZE_MAX ? (size_t) getpagesize() : min_pagesz; return min_pagesz == SIZE_MAX ? (size_t) getpagesize() : min_pagesz;
} }
static void static void
mempool_add_elem(struct rte_mempool *mp, void *obj, rte_iova_t iova) mempool_add_elem(struct rte_mempool *mp, void *obj, rte_iova_t iova)
{ {

View File

@ -12,6 +12,7 @@
#include <rte_common.h> #include <rte_common.h>
#include <rte_memory.h> #include <rte_memory.h>
#include <rte_eal_memconfig.h>
#include <rte_per_lcore.h> #include <rte_per_lcore.h>
#include <rte_launch.h> #include <rte_launch.h>
#include <rte_eal.h> #include <rte_eal.h>
@ -706,36 +707,20 @@ test_malloc_bad_params(void)
} }
static int static int
check_socket_mem(const struct rte_memseg *ms, void *arg) check_socket_mem(const struct rte_memseg_list *msl, void *arg)
{ {
int32_t *socket = arg; int32_t *socket = arg;
return *socket == ms->socket_id; return *socket == msl->socket_id;
} }
/* Check if memory is available on a specific socket */ /* Check if memory is available on a specific socket */
static int static int
is_mem_on_socket(int32_t socket) is_mem_on_socket(int32_t socket)
{ {
return rte_memseg_walk(check_socket_mem, &socket); return rte_memseg_list_walk(check_socket_mem, &socket);
} }
/* Lookup state: the address to search for and the socket ID found for it. */
struct walk_param {
	void *addr;
	int32_t socket;
};

/*
 * Memseg walk callback: test whether the looked-up address lies inside
 * the half-open range [ms->addr, ms->addr + ms->len).
 *
 * On a hit, the segment's socket_id is stored in the walk_param passed
 * through arg and 1 is returned; otherwise 0 is returned and the
 * walk_param is left untouched.
 */
static int
find_socket(const struct rte_memseg *ms, void *arg)
{
	struct walk_param *lookup = arg;

	/* address is below the start of this segment */
	if (lookup->addr < ms->addr)
		return 0;

	/* address is at or beyond the end of this segment */
	if (lookup->addr >= RTE_PTR_ADD(ms->addr, ms->len))
		return 0;

	lookup->socket = ms->socket_id;
	return 1;
}
/* /*
* Find what socket a memory address is on. Only works for addresses within * Find what socket a memory address is on. Only works for addresses within
@ -744,10 +729,9 @@ find_socket(const struct rte_memseg *ms, void *arg)
static int32_t static int32_t
addr_to_socket(void * addr) addr_to_socket(void * addr)
{ {
struct walk_param param = {.addr = addr, .socket = 0}; const struct rte_memseg *ms = rte_mem_virt2memseg(addr, NULL);
if (rte_memseg_walk(find_socket, &param) > 0) return ms == NULL ? -1 : ms->socket_id;
return param.socket;
return -1;
} }
/* Test using rte_[c|m|zm]alloc_socket() on a specific socket */ /* Test using rte_[c|m|zm]alloc_socket() on a specific socket */

View File

@ -5,8 +5,11 @@
#include <stdio.h> #include <stdio.h>
#include <stdint.h> #include <stdint.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_memory.h> #include <rte_memory.h>
#include <rte_common.h> #include <rte_common.h>
#include <rte_memzone.h>
#include "test.h" #include "test.h"
@ -23,12 +26,13 @@
*/ */
static int static int
check_mem(const struct rte_memseg *ms, void *arg __rte_unused) check_mem(const struct rte_memseg_list *msl __rte_unused,
const struct rte_memseg *ms, void *arg __rte_unused)
{ {
volatile uint8_t *mem = (volatile uint8_t *) ms->addr; volatile uint8_t *mem = (volatile uint8_t *) ms->addr;
size_t i; size_t i, max = ms->len;
for (i = 0; i < ms->len; i++, mem++) for (i = 0; i < max; i++, mem++)
*mem; *mem;
return 0; return 0;
} }

View File

@ -111,17 +111,17 @@ struct walk_arg {
int hugepage_16GB_avail; int hugepage_16GB_avail;
}; };
static int static int
find_available_pagesz(const struct rte_memseg *ms, void *arg) find_available_pagesz(const struct rte_memseg_list *msl, void *arg)
{ {
struct walk_arg *wa = arg; struct walk_arg *wa = arg;
if (ms->hugepage_sz == RTE_PGSIZE_2M) if (msl->page_sz == RTE_PGSIZE_2M)
wa->hugepage_2MB_avail = 1; wa->hugepage_2MB_avail = 1;
if (ms->hugepage_sz == RTE_PGSIZE_1G) if (msl->page_sz == RTE_PGSIZE_1G)
wa->hugepage_1GB_avail = 1; wa->hugepage_1GB_avail = 1;
if (ms->hugepage_sz == RTE_PGSIZE_16M) if (msl->page_sz == RTE_PGSIZE_16M)
wa->hugepage_16MB_avail = 1; wa->hugepage_16MB_avail = 1;
if (ms->hugepage_sz == RTE_PGSIZE_16G) if (msl->page_sz == RTE_PGSIZE_16G)
wa->hugepage_16GB_avail = 1; wa->hugepage_16GB_avail = 1;
return 0; return 0;
@ -138,7 +138,7 @@ test_memzone_reserve_flags(void)
memset(&wa, 0, sizeof(wa)); memset(&wa, 0, sizeof(wa));
rte_memseg_walk(find_available_pagesz, &wa); rte_memseg_list_walk(find_available_pagesz, &wa);
hugepage_2MB_avail = wa.hugepage_2MB_avail; hugepage_2MB_avail = wa.hugepage_2MB_avail;
hugepage_1GB_avail = wa.hugepage_1GB_avail; hugepage_1GB_avail = wa.hugepage_1GB_avail;