FreeBSD contigmem and eal NUMA support
parent b1d36cf828
commit 025a453103
@@ -27,7 +27,7 @@
#define RTE_VER_PREFIX "DPDK"

/****** library defines ********/

#define RTE_LIBRTE_IEEE1588 1

/* EAL defines */
#define RTE_MAX_HEAPS 32
#define RTE_MAX_MEMSEG_LISTS 128
@@ -44,6 +44,7 @@

/* bsd module defines */
#define RTE_CONTIGMEM_MAX_NUM_BUFS 64
#define RTE_CONTIGMEM_DEFAULT_NUM_BUFS 1
#define RTE_CONTIGMEM_DEFAULT_NODE_AFFINITY 0x1
#define RTE_CONTIGMEM_DEFAULT_BUF_SIZE (512*1024*1024)

/* mempool defines */
@@ -20,6 +20,7 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/eventhandler.h>
#include <sys/domainset.h>

#include <machine/bus.h>
@@ -31,6 +32,15 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>

#define CONTIGMEM_DEBUG

#ifdef CONTIGMEM_DEBUG
#define DBGPRINT(fmt, ...) printf(fmt, ##__VA_ARGS__)
#else
#define DBGPRINT(fmt, ...)
#endif

#define NEXT_CPU_NULL -1;

struct contigmem_buffer {
    void *addr;
    int refcnt;
@@ -39,6 +49,7 @@ struct contigmem_buffer {

struct contigmem_vm_handle {
    int buffer_index;
    int node_index;
};

static int contigmem_load(void);
@@ -49,19 +60,46 @@ static d_mmap_single_t contigmem_mmap_single;
static d_open_t contigmem_open;
static d_close_t contigmem_close;

static uint64_t contigmem_node_affinity = RTE_CONTIGMEM_DEFAULT_NODE_AFFINITY;
static int contigmem_num_buffers = RTE_CONTIGMEM_DEFAULT_NUM_BUFS;
static int64_t contigmem_buffer_size = RTE_CONTIGMEM_DEFAULT_BUF_SIZE;

#define CONTIGMEM_MAX_NODECNT RTE_MAX_NUMA_NODES

static eventhandler_tag contigmem_eh_tag;
static struct contigmem_buffer contigmem_buffers[RTE_CONTIGMEM_MAX_NUM_BUFS];
static struct contigmem_buffer contigmem_buffers[CONTIGMEM_MAX_NODECNT][RTE_CONTIGMEM_MAX_NUM_BUFS];
static struct cdev *contigmem_cdev = NULL;
static int contigmem_refcnt;
static int contigmem_nodecnt;

/*
 * offset: 0-11 bits page alignment
 *         12-15: node idx
 *         16-31: buffer idx
 */
static inline int offset_to_node(off_t offset)
{
    return (offset >> 12) & 0xF;
}

static inline int offset_to_buffer(off_t offset)
{
    return (offset >> 16) & 0xFFFF;
}

static inline off_t offset_make(int node, int buffer)
{
    return (((off_t)node & 0xF) << 12) | (((off_t)buffer & 0xFFFF) << 16);
}

TUNABLE_QUAD("hw.contigmem.node_affinity", &contigmem_node_affinity);
TUNABLE_INT("hw.contigmem.num_buffers", &contigmem_num_buffers);
TUNABLE_QUAD("hw.contigmem.buffer_size", &contigmem_buffer_size);

static SYSCTL_NODE(_hw, OID_AUTO, contigmem, CTLFLAG_RD, 0, "contigmem");

SYSCTL_QUAD(_hw_contigmem, OID_AUTO, node_affinity, CTLFLAG_RD,
    &contigmem_node_affinity, 0, "The node/NUMA affinity of buffers.");
SYSCTL_INT(_hw_contigmem, OID_AUTO, num_buffers, CTLFLAG_RD,
    &contigmem_num_buffers, 0, "Number of contigmem buffers allocated");
SYSCTL_QUAD(_hw_contigmem, OID_AUTO, buffer_size, CTLFLAG_RD,
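The offset encoding above is the contract between the kernel module and user space: a (node, buffer) pair is packed into a page-aligned device offset that later names the per-buffer physaddr sysctl and is passed to mmap(). For illustration, a minimal stand-alone round trip; this is not part of the patch and contig_offset_make() is just an illustrative copy of the kernel helper:

#include <stdint.h>
#include <stdio.h>

/* Same bit layout as the kernel helpers: bits 12-15 node, bits 16-31 buffer. */
static uint64_t contig_offset_make(int node, int buffer)
{
    return (((uint64_t)node & 0xF) << 12) | (((uint64_t)buffer & 0xFFFF) << 16);
}

int main(void)
{
    /* buffer 3 on NUMA node 1 -> 0x31000, still 4 KiB aligned */
    uint64_t off = contig_offset_make(1, 3);

    printf("offset=0x%jx node=%d buffer=%d\n", (uintmax_t)off,
        (int)((off >> 12) & 0xF), (int)((off >> 16) & 0xFFFF));
    return 0;
}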
@@ -110,12 +148,34 @@ static struct cdevsw contigmem_ops = {
    .d_close = contigmem_close,
};

static void
contigmem_free()
{
    for (int j = 0; j < contigmem_nodecnt; j++) {
        for (int i = 0; i < contigmem_num_buffers; i++) {
            if (contigmem_buffers[j][i].addr != NULL) {
                DBGPRINT("contigmem_free: freeing contigmem node buffer %d, node %d, virt 0x%p, size 0x%lx\n",
                    i, j, contigmem_buffers[j][i].addr, contigmem_buffer_size);
                contigfree(contigmem_buffers[j][i].addr,
                    contigmem_buffer_size, M_CONTIGMEM);
                contigmem_buffers[j][i].addr = NULL;
            }
            if (mtx_initialized(&contigmem_buffers[j][i].mtx))
                mtx_destroy(&contigmem_buffers[j][i].mtx);
        }
    }
}

static int
contigmem_load()
{
    char index_string[8], description[32];
    int i, error = 0;
    int i, j, error = 0;
    void *addr;
    struct domainset ds;
    struct domainset *rds;
    ds.ds_policy = DOMAINSET_POLICY_ROUNDROBIN;

    if (contigmem_num_buffers > RTE_CONTIGMEM_MAX_NUM_BUFS) {
        printf("%d buffers requested is greater than %d allowed\n",
@@ -132,30 +192,51 @@ contigmem_load()
        goto error;
    }

    for (i = 0; i < contigmem_num_buffers; i++) {
        addr = contigmalloc(contigmem_buffer_size, M_CONTIGMEM, M_ZERO,
            0, BUS_SPACE_MAXADDR, contigmem_buffer_size, 0);
        if (addr == NULL) {
            printf("contigmalloc failed for buffer %d\n", i);
            error = ENOMEM;
            goto error;
    // init cputop
    contigmem_nodecnt = vm_ndomains;

    if (contigmem_nodecnt > CONTIGMEM_MAX_NODECNT) {
        printf("contigmem_load: too many NUMA nodes detected: %d\n", contigmem_nodecnt);
        error = EINVAL;
        goto error;
    }

    DBGPRINT("contigmem_load: detected %d vmdomain\n", contigmem_nodecnt);

    for (j = 0; j < contigmem_nodecnt; j++) {
        if ((contigmem_node_affinity & (1 << j)) == 0) {
            DBGPRINT("contigmem_load: skipping node %d...\n", j);
            continue;
        }
        DOMAINSET_ZERO(&ds.ds_mask);
        DOMAINSET_SET(j, &ds.ds_mask);
        rds = domainset_create(&ds);
        for (i = 0; i < contigmem_num_buffers; i++) {
            DBGPRINT("contigmem_load: allocating 0x%lx bytes memory for buffer %d node %d\n", contigmem_buffer_size, i, j);
            addr = contigmalloc_domainset(contigmem_buffer_size, M_CONTIGMEM, rds,
                M_ZERO, 0, BUS_SPACE_MAXADDR, contigmem_buffer_size, 0);
            if (addr == NULL) {
                printf("contigmalloc failed for buffer %d node %d\n", i, j);
                error = ENOMEM;
                goto error;
            }

            printf("%2u: virt=%p phys=%p\n", i, addr,
                (void *)pmap_kextract((vm_offset_t)addr));
            printf("%2u: virt=%p phys=%p\n", i, addr,
                (void *)pmap_kextract((vm_offset_t)addr));

            mtx_init(&contigmem_buffers[i].mtx, "contigmem", NULL, MTX_DEF);
            contigmem_buffers[i].addr = addr;
            contigmem_buffers[i].refcnt = 0;
            mtx_init(&contigmem_buffers[j][i].mtx, "contigmem", NULL, MTX_DEF);
            contigmem_buffers[j][i].addr = addr;
            contigmem_buffers[j][i].refcnt = 0;

            snprintf(index_string, sizeof(index_string), "%d", i);
            snprintf(description, sizeof(description),
                "phys addr for buffer %d", i);
            SYSCTL_ADD_PROC(NULL,
                &SYSCTL_NODE_CHILDREN(_hw_contigmem, physaddr), OID_AUTO,
                index_string, CTLTYPE_U64 | CTLFLAG_RD,
                (void *)(uintptr_t)i, 0, contigmem_physaddr, "LU",
                description);
            snprintf(index_string, sizeof(index_string), "%ld", offset_make(j, i));
            snprintf(description, sizeof(description),
                "phys addr for node %d buffer %d", j, i);
            SYSCTL_ADD_PROC(NULL,
                &SYSCTL_NODE_CHILDREN(_hw_contigmem, physaddr), OID_AUTO,
                index_string, CTLTYPE_U64 | CTLFLAG_RD,
                (void*)offset_make(j, i), 0, contigmem_physaddr, "LU",
                description);
        }
    }

    contigmem_cdev = make_dev_credf(0, &contigmem_ops, 0, NULL, UID_ROOT,
@@ -164,24 +245,13 @@ contigmem_load()
    return 0;

error:
    for (i = 0; i < contigmem_num_buffers; i++) {
        if (contigmem_buffers[i].addr != NULL) {
            contigfree(contigmem_buffers[i].addr,
                contigmem_buffer_size, M_CONTIGMEM);
            contigmem_buffers[i].addr = NULL;
        }
        if (mtx_initialized(&contigmem_buffers[i].mtx))
            mtx_destroy(&contigmem_buffers[i].mtx);
    }

    contigmem_free();
    return error;
}

static int
contigmem_unload()
{
    int i;

    if (contigmem_refcnt > 0)
        return EBUSY;
@@ -191,14 +261,7 @@ contigmem_unload()
    if (contigmem_eh_tag != NULL)
        EVENTHANDLER_DEREGISTER(process_exit, contigmem_eh_tag);

    for (i = 0; i < RTE_CONTIGMEM_MAX_NUM_BUFS; i++) {
        if (contigmem_buffers[i].addr != NULL)
            contigfree(contigmem_buffers[i].addr,
                contigmem_buffer_size, M_CONTIGMEM);
        if (mtx_initialized(&contigmem_buffers[i].mtx))
            mtx_destroy(&contigmem_buffers[i].mtx);
    }

    contigmem_free();
    return 0;
}
@@ -206,9 +269,12 @@ static int
contigmem_physaddr(SYSCTL_HANDLER_ARGS)
{
    uint64_t physaddr;
    int index = (int)(uintptr_t)arg1;
    off_t offset = (off_t)arg1;
    int index = offset_to_buffer(offset);
    int node = offset_to_node(offset);

    physaddr = (uint64_t)vtophys(contigmem_buffers[index].addr);
    physaddr = (uint64_t)vtophys(contigmem_buffers[node][index].addr);
    DBGPRINT("contigmem_physaddr sysctl: buffer %d node %d paddr 0x%lx\n", index, node, physaddr);
    return sysctl_handle_64(oidp, &physaddr, 0, req);
}
@@ -239,7 +305,8 @@ contigmem_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
    struct contigmem_vm_handle *vmh = handle;
    struct contigmem_buffer *buf;

    buf = &contigmem_buffers[vmh->buffer_index];
    DBGPRINT("contigmem_cdev_pager_ctor: buffer %d node %d\n", vmh->buffer_index, vmh->node_index);
    buf = &contigmem_buffers[vmh->node_index][vmh->buffer_index];

    atomic_add_int(&contigmem_refcnt, 1);
@@ -258,7 +325,8 @@ contigmem_cdev_pager_dtor(void *handle)
    struct contigmem_vm_handle *vmh = handle;
    struct contigmem_buffer *buf;

    buf = &contigmem_buffers[vmh->buffer_index];
    DBGPRINT("contigmem_cdev_pager_dtor: buffer %d node %d\n", vmh->buffer_index, vmh->node_index);
    buf = &contigmem_buffers[vmh->node_index][vmh->buffer_index];

    mtx_lock(&buf->mtx);
    buf->refcnt--;
@@ -335,14 +403,17 @@ contigmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t size,
{
    struct contigmem_vm_handle *vmh;
    uint64_t buffer_index;
    uint64_t node_index;

    /*
     * The buffer index is encoded in the offset. Divide the offset by
     * PAGE_SIZE to get the index of the buffer requested by the user
     * app.
     */
    buffer_index = *offset / PAGE_SIZE;
    if (buffer_index >= contigmem_num_buffers)
    buffer_index = offset_to_buffer(*offset);
    node_index = offset_to_node(*offset);
    DBGPRINT("contigmem_mmap_single: buffer %lu node %lu size 0x%lx\n", buffer_index, node_index, size);
    if (buffer_index >= contigmem_num_buffers || node_index >= contigmem_nodecnt)
        return EINVAL;

    if (size > contigmem_buffer_size)
@@ -352,8 +423,9 @@ contigmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t size,
    if (vmh == NULL)
        return ENOMEM;
    vmh->buffer_index = buffer_index;
    vmh->node_index = node_index;

    *offset = (vm_ooffset_t)vtophys(contigmem_buffers[buffer_index].addr);
    *offset = (vm_ooffset_t)vtophys(contigmem_buffers[node_index][buffer_index].addr);
    *obj = cdev_pager_allocate(vmh, OBJT_DEVICE, &contigmem_cdev_pager_ops,
        size, nprot, *offset, curthread->td_ucred);
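With the new encoding, the mmap offset also selects the NUMA node. A hypothetical user-space check, not from this patch, assuming the default 512 MB buffer size and node 1 enabled in hw.contigmem.node_affinity:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int node = 1, buffer = 0;
    uint64_t off = (((uint64_t)node & 0xF) << 12) | (((uint64_t)buffer & 0xFFFF) << 16);
    uint64_t physaddr;
    size_t len = sizeof(physaddr);
    char name[64];

    /* The driver registers one physaddr sysctl per buffer, named by the
     * decimal value of the encoded offset. */
    snprintf(name, sizeof(name), "hw.contigmem.physaddr.%ju", (uintmax_t)off);
    if (sysctlbyname(name, &physaddr, &len, NULL, 0) != 0)
        return 1;

    int fd = open("/dev/contigmem", O_RDWR);
    if (fd < 0)
        return 1;

    /* contigmem_mmap_single() decodes node and buffer back out of the offset. */
    void *va = mmap(NULL, 512UL * 1024 * 1024, PROT_READ | PROT_WRITE,
        MAP_SHARED, fd, (off_t)off);
    if (va == MAP_FAILED)
        return 1;

    printf("node %d buffer %d: virt %p phys 0x%jx\n", node, buffer, va,
        (uintmax_t)physaddr);
    return 0;
}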
@@ -5,6 +5,7 @@
#include <sys/sysctl.h>
#include <sys/mman.h>
#include <string.h>
#include <immintrin.h>

#include <rte_log.h>
#include <fcntl.h>
@@ -13,6 +14,7 @@
#include "eal_hugepages.h"
#include "eal_internal_cfg.h"
#include "eal_filesystem.h"
#include "rte_build_config.h"

#define CONTIGMEM_DEV "/dev/contigmem"
@@ -48,6 +50,21 @@ create_shared_memory(const char *filename, const size_t mem_size)
    return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT);
}

#define NEXT_CPU_NULL -1;
static inline int
cmask_get_next_cpu(uint64_t *mask)
{
    int ffs = ffsll(*mask);
    *mask &= ~(1ul << (ffs - 1));
    return ffs - 1;
}

static inline int
cmask_get_num_cpus(const uint64_t mask)
{
    return _mm_popcnt_u64(mask);
}

/*
 * No hugepage support on freebsd, but we dummy it, using contigmem driver
 */
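The two helpers above are how the EAL walks the node-affinity mask, lowest set bit first. A small stand-alone sketch of that iteration pattern, not from this patch, using the portable __builtin_popcountll in place of _mm_popcnt_u64:

#include <stdint.h>
#include <stdio.h>
#include <strings.h>    /* ffsll() */

int main(void)
{
    uint64_t mask = 0x5;    /* nodes 0 and 2 enabled */
    int n = __builtin_popcountll(mask);

    /* Pop one set bit per iteration, exactly as cmask_get_next_cpu() does. */
    for (int i = 0; i < n; i++) {
        int node = ffsll((long long)mask) - 1;
        mask &= ~(UINT64_C(1) << node);
        printf("node %d enabled\n", node);
    }
    return 0;
}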
@@ -57,6 +74,8 @@ eal_hugepage_info_init(void)
    size_t sysctl_size;
    int num_buffers, fd, error;
    int64_t buffer_size;
    uint64_t node_affinity = 0;
    int num_nodes;
    struct internal_config *internal_conf =
        eal_get_internal_configuration();
@@ -76,6 +95,14 @@ eal_hugepage_info_init(void)
        return -1;
    }

    sysctl_size = sizeof(node_affinity);
    error = sysctlbyname("hw.contigmem.node_affinity", &node_affinity, &sysctl_size, NULL, 0);
    num_nodes = cmask_get_num_cpus(node_affinity);
    if (error != 0 || num_nodes == 0 || num_nodes > RTE_MAX_NUMA_NODES) {
        RTE_LOG(ERR, EAL, "could not read or invalid sysctl hw.contigmem.node_affinity 0x%lx\n", node_affinity);
        return -1;
    }

    sysctl_size = sizeof(buffer_size);
    error = sysctlbyname("hw.contigmem.buffer_size", &buffer_size,
        &sysctl_size, NULL, 0);
@@ -101,9 +128,18 @@ eal_hugepage_info_init(void)
    RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dKB\n",
        num_buffers, (int)(buffer_size>>10));

    RTE_LOG(INFO, EAL, "Contigmem driver's node affinity is 0x%lx, %d domains in total\n",
        node_affinity, num_nodes);

    strlcpy(hpi->hugedir, CONTIGMEM_DEV, sizeof(hpi->hugedir));
    hpi->hugepage_sz = buffer_size;
    hpi->num_pages[0] = num_buffers;
    for (int i = 0; i < num_nodes; i++) {
        if ((node_affinity & (1 << i)) != 0) {
            hpi->num_pages[i] = num_buffers;
        } else {
            hpi->num_pages[i] = 0;
        }
    }
    hpi->lock_descriptor = fd;

    /* for no shared files mode, do not create shared memory config */
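The num_buffers, buffer_size and node_affinity values read here come from the contigmem loader tunables declared in the kernel module. One possible /boot/loader.conf for a two-node machine with buffers on both nodes; the values are only an example, not prescribed by the patch:

hw.contigmem.num_buffers=2
hw.contigmem.buffer_size=1073741824
hw.contigmem.node_affinity=0x3
contigmem_load="YES"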
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <stdio.h>
#include <unistd.h>
#include <sys/sysctl.h>
@@ -16,9 +16,10 @@

/* No topology information available on FreeBSD including NUMA info */
unsigned
eal_cpu_core_id(__rte_unused unsigned lcore_id)
eal_cpu_core_id(unsigned lcore_id)
{
    return 0;
    /* DPDK uses 1-1 mapping */
    return lcore_id;
}

static int
@@ -36,9 +37,21 @@ eal_get_ncpus(void)
}

unsigned
eal_cpu_socket_id(__rte_unused unsigned cpu_id)
eal_cpu_socket_id(unsigned cpu_id)
{
    return 0;
    int error;
    int domain;
    size_t domain_sz = sizeof(domain);

    char sysctl_str[32];
    snprintf(sysctl_str, sizeof(sysctl_str), "dev.cpu.%d.%%domain", cpu_id);

    error = sysctlbyname(sysctl_str, &domain, &domain_sz, NULL, 0);
    if (error < 0) {
        RTE_LOG(WARNING, EAL, "Failed to get socket id for core %u, returning 0...\n", cpu_id);
        domain = 0;
    }
    return domain;
}

/* Check if a cpu is present by the presence of the
@@ -107,114 +107,117 @@ rte_eal_hugepage_init(void)

        hpi = &internal_conf->hugepage_info[i];
        page_sz = hpi->hugepage_sz;
        max_pages = hpi->num_pages[0];
        mem_needed = RTE_ALIGN_CEIL(internal_conf->memory - total_mem,
            page_sz);
        for (int k = 0; k < RTE_MAX_NUMA_NODES; k++) {
            max_pages = hpi->num_pages[k];

        n_pages = RTE_MIN(mem_needed / page_sz, max_pages);
            mem_needed = RTE_ALIGN_CEIL(internal_conf->memory - total_mem,
                page_sz);

        for (j = 0; j < n_pages; j++) {
            struct rte_memseg_list *msl;
            struct rte_fbarray *arr;
            struct rte_memseg *seg;
            int msl_idx, ms_idx;
            rte_iova_t physaddr;
            int error;
            size_t sysctl_size = sizeof(physaddr);
            char physaddr_str[64];
            bool is_adjacent;
            n_pages = RTE_MIN(mem_needed / page_sz, max_pages);

            /* first, check if this segment is IOVA-adjacent to
             * the previous one.
             */
            snprintf(physaddr_str, sizeof(physaddr_str),
                "hw.contigmem.physaddr.%d", j);
            error = sysctlbyname(physaddr_str, &physaddr,
                &sysctl_size, NULL, 0);
            if (error < 0) {
                RTE_LOG(ERR, EAL, "Failed to get physical addr for buffer %u "
                    "from %s\n", j, hpi->hugedir);
                return -1;
            }
            for (j = 0; j < n_pages; j++) {
                struct rte_memseg_list *msl;
                struct rte_fbarray *arr;
                struct rte_memseg *seg;
                int msl_idx, ms_idx;
                rte_iova_t physaddr;
                int error;
                size_t sysctl_size = sizeof(physaddr);
                char physaddr_str[64];
                bool is_adjacent;

            is_adjacent = prev_end != 0 && physaddr == prev_end;
            prev_end = physaddr + hpi->hugepage_sz;
                /* first, check if this segment is IOVA-adjacent to
                 * the previous one.
                 */
                snprintf(physaddr_str, sizeof(physaddr_str),
                    "hw.contigmem.physaddr.%ld", offset_make(k, j));
                error = sysctlbyname(physaddr_str, &physaddr,
                    &sysctl_size, NULL, 0);
                if (error < 0) {
                    RTE_LOG(ERR, EAL, "Failed to get physical addr for buffer %u "
                        "from %s\n", j, hpi->hugedir);
                    return -1;
                }

            for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
                msl_idx++) {
                bool empty, need_hole;
                msl = &mcfg->memsegs[msl_idx];
                is_adjacent = prev_end != 0 && physaddr == prev_end;
                prev_end = physaddr + hpi->hugepage_sz;

                for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
                    msl_idx++) {
                    bool empty, need_hole;
                    msl = &mcfg->memsegs[msl_idx];
                    arr = &msl->memseg_arr;

                    if (msl->page_sz != page_sz || msl->socket_id != k)
                        continue;

                    empty = arr->count == 0;

                    /* we need a hole if this isn't an empty memseg
                     * list, and if previous segment was not
                     * adjacent to current one.
                     */
                    need_hole = !empty && !is_adjacent;

                    /* we need 1, plus hole if not adjacent */
                    ms_idx = rte_fbarray_find_next_n_free(arr,
                        0, 1 + (need_hole ? 1 : 0));

                    /* memseg list is full? */
                    if (ms_idx < 0)
                        continue;

                    if (need_hole && prev_ms_idx == ms_idx - 1)
                        ms_idx++;
                    prev_ms_idx = ms_idx;

                    break;
                }
                if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
                    RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n",
                        RTE_STR(RTE_MAX_MEMSEG_PER_TYPE),
                        RTE_STR(RTE_MAX_MEM_MB_PER_TYPE));
                    return -1;
                }
                arr = &msl->memseg_arr;
                seg = rte_fbarray_get(arr, ms_idx);

                if (msl->page_sz != page_sz)
                    continue;
                addr = RTE_PTR_ADD(msl->base_va,
                    (size_t)msl->page_sz * ms_idx);

                empty = arr->count == 0;
                /* address is already mapped in memseg list, so using
                 * MAP_FIXED here is safe.
                 */
                addr = mmap(addr, page_sz, PROT_READ|PROT_WRITE,
                    MAP_SHARED | MAP_FIXED,
                    hpi->lock_descriptor,
                    offset_make(k, j));
                if (addr == MAP_FAILED) {
                    RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n",
                        j, hpi->hugedir);
                    return -1;
                }

                /* we need a hole if this isn't an empty memseg
                 * list, and if previous segment was not
                 * adjacent to current one.
                 */
                need_hole = !empty && !is_adjacent;
                seg->addr = addr;
                seg->iova = physaddr;
                seg->hugepage_sz = page_sz;
                seg->len = page_sz;
                seg->nchannel = mcfg->nchannel;
                seg->nrank = mcfg->nrank;
                seg->socket_id = k;

                /* we need 1, plus hole if not adjacent */
                ms_idx = rte_fbarray_find_next_n_free(arr,
                    0, 1 + (need_hole ? 1 : 0));
                rte_fbarray_set_used(arr, ms_idx);

                /* memseg list is full? */
                if (ms_idx < 0)
                    continue;
                RTE_LOG(INFO, EAL, "Mapped memory segment %u @ %p: physaddr:0x%"
                    PRIx64", len %zu, socket %d\n",
                    seg_idx++, addr, physaddr, page_sz, k);

                if (need_hole && prev_ms_idx == ms_idx - 1)
                    ms_idx++;
                prev_ms_idx = ms_idx;

                break;
                total_mem += seg->len;
            }
            if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
                RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n",
                    RTE_STR(RTE_MAX_MEMSEG_PER_TYPE),
                    RTE_STR(RTE_MAX_MEM_MB_PER_TYPE));
                return -1;
            }
            arr = &msl->memseg_arr;
            seg = rte_fbarray_get(arr, ms_idx);

            addr = RTE_PTR_ADD(msl->base_va,
                (size_t)msl->page_sz * ms_idx);

            /* address is already mapped in memseg list, so using
             * MAP_FIXED here is safe.
             */
            addr = mmap(addr, page_sz, PROT_READ|PROT_WRITE,
                MAP_SHARED | MAP_FIXED,
                hpi->lock_descriptor,
                j * EAL_PAGE_SIZE);
            if (addr == MAP_FAILED) {
                RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n",
                    j, hpi->hugedir);
                return -1;
            }

            seg->addr = addr;
            seg->iova = physaddr;
            seg->hugepage_sz = page_sz;
            seg->len = page_sz;
            seg->nchannel = mcfg->nchannel;
            seg->nrank = mcfg->nrank;
            seg->socket_id = 0;

            rte_fbarray_set_used(arr, ms_idx);

            RTE_LOG(INFO, EAL, "Mapped memory segment %u @ %p: physaddr:0x%"
                PRIx64", len %zu\n",
                seg_idx++, addr, physaddr, page_sz);

            total_mem += seg->len;
        }
        if (total_mem >= internal_conf->memory)
            break;
        }
    }
    if (total_mem < internal_conf->memory) {
        RTE_LOG(ERR, EAL, "Couldn't reserve requested memory, "
            "requested: %" PRIu64 "M "
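Because the segments registered above now carry a real socket_id, socket-aware allocations can be honored on FreeBSD. A hedged sketch of what this enables at the application level, not part of the patch, assuming the EAL was initialized with contigmem buffers on two nodes:

#include <rte_eal.h>
#include <rte_lcore.h>
#include <rte_malloc.h>

int main(int argc, char **argv)
{
    if (rte_eal_init(argc, argv) < 0)
        return -1;

    /* Allocate from the caller's node and, for comparison, from node 1. */
    void *local = rte_malloc_socket("local_buf", 4096, 64, rte_socket_id());
    void *remote = rte_malloc_socket("remote_buf", 4096, 64, 1 /* assumed second node */);

    rte_free(local);
    rte_free(remote);
    return rte_eal_cleanup();
}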
@@ -334,7 +337,7 @@ static int
memseg_primary_init(void)
{
    struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
    int hpi_idx, msl_idx = 0;
    int hpi_idx, msl_idx = 0, cur_socket;
    struct rte_memseg_list *msl;
    uint64_t max_mem, total_mem;
    struct internal_config *internal_conf =
@@ -360,78 +363,80 @@ memseg_primary_init(void)
    /* create memseg lists */
    for (hpi_idx = 0; hpi_idx < (int) internal_conf->num_hugepage_sizes;
        hpi_idx++) {
        uint64_t max_type_mem, total_type_mem = 0;
        uint64_t avail_mem;
        int type_msl_idx, max_segs, avail_segs, total_segs = 0;
        struct hugepage_info *hpi;
        uint64_t hugepage_sz;

        hpi = &internal_conf->hugepage_info[hpi_idx];
        hugepage_sz = hpi->hugepage_sz;

        /* no NUMA support on FreeBSD */
        for (cur_socket = 0; cur_socket < (int)rte_socket_count(); cur_socket++) {
            uint64_t avail_mem;
            int type_msl_idx, max_segs, avail_segs, total_segs = 0, total_pages = 0;
            uint64_t max_type_mem, total_type_mem = 0;
            /* check if we've already exceeded total memory amount */
            if (total_mem >= max_mem)
                goto endloop;

        /* check if we've already exceeded total memory amount */
        if (total_mem >= max_mem)
            break;
            /* first, calculate theoretical limits according to config */
            max_type_mem = RTE_MIN(max_mem - total_mem,
                (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20);
            max_segs = RTE_MAX_MEMSEG_PER_TYPE;

        /* first, calculate theoretical limits according to config */
        max_type_mem = RTE_MIN(max_mem - total_mem,
            (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20);
        max_segs = RTE_MAX_MEMSEG_PER_TYPE;
            /* now, limit all of that to whatever will actually be
             * available to us, because without dynamic allocation support,
             * all of that extra memory will be sitting there being useless
             * and slowing down core dumps in case of a crash.
             *
             * we need (N*2)-1 segments because we cannot guarantee that
             * each segment will be IOVA-contiguous with the previous one,
             * so we will allocate more and put spaces between segments
             * that are non-contiguous.
             */
            total_pages = hpi->num_pages[cur_socket];
            avail_segs = (total_pages * 2) - 1;
            avail_mem = avail_segs * hugepage_sz;

        /* now, limit all of that to whatever will actually be
         * available to us, because without dynamic allocation support,
         * all of that extra memory will be sitting there being useless
         * and slowing down core dumps in case of a crash.
         *
         * we need (N*2)-1 segments because we cannot guarantee that
         * each segment will be IOVA-contiguous with the previous one,
         * so we will allocate more and put spaces between segments
         * that are non-contiguous.
         */
        avail_segs = (hpi->num_pages[0] * 2) - 1;
        avail_mem = avail_segs * hugepage_sz;
            max_type_mem = RTE_MIN(avail_mem, max_type_mem);
            max_segs = RTE_MIN(avail_segs, max_segs);

        max_type_mem = RTE_MIN(avail_mem, max_type_mem);
        max_segs = RTE_MIN(avail_segs, max_segs);
            type_msl_idx = 0;
            while (total_type_mem < max_type_mem &&
                total_segs < max_segs) {
                uint64_t cur_max_mem, cur_mem;
                unsigned int n_segs;

        type_msl_idx = 0;
        while (total_type_mem < max_type_mem &&
            total_segs < max_segs) {
            uint64_t cur_max_mem, cur_mem;
            unsigned int n_segs;
                if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
                    RTE_LOG(ERR, EAL,
                        "No more space in memseg lists, please increase %s\n",
                        RTE_STR(RTE_MAX_MEMSEG_LISTS));
                    return -1;
                }

            if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
                RTE_LOG(ERR, EAL,
                    "No more space in memseg lists, please increase %s\n",
                    RTE_STR(RTE_MAX_MEMSEG_LISTS));
                return -1;
            }

            msl = &mcfg->memsegs[msl_idx++];

            cur_max_mem = max_type_mem - total_type_mem;

            cur_mem = get_mem_amount(hugepage_sz,
                cur_max_mem);
            n_segs = cur_mem / hugepage_sz;

            if (eal_memseg_list_init(msl, hugepage_sz, n_segs,
                0, type_msl_idx, false))
                return -1;

            total_segs += msl->memseg_arr.len;
            total_type_mem = total_segs * hugepage_sz;
            type_msl_idx++;

            if (memseg_list_alloc(msl)) {
                RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
                return -1;
                msl = &mcfg->memsegs[msl_idx++];

                cur_max_mem = max_type_mem - total_type_mem;

                cur_mem = get_mem_amount(hugepage_sz,
                    cur_max_mem);
                n_segs = cur_mem / hugepage_sz;

                if (eal_memseg_list_init(msl, hugepage_sz, n_segs,
                    cur_socket, type_msl_idx, false))
                    return -1;

                total_segs += msl->memseg_arr.len;
                total_type_mem = total_segs * hugepage_sz;
                type_msl_idx++;

                if (memseg_list_alloc(msl)) {
                    RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
                    return -1;
                }
            }
            total_mem += total_type_mem;
        }
        total_mem += total_type_mem;
    }
endloop:
    return 0;
}


@@ -12,6 +12,7 @@
 */

#include <pthread_np.h>
#include <stdint.h>

typedef cpuset_t rte_cpuset_t;
#define RTE_CPU_AND(dst, src1, src2) do \
@@ -49,4 +50,19 @@ typedef cpuset_t rte_cpuset_t;
} while (0)
#endif

static inline int offset_to_node(uint64_t offset)
{
    return (offset >> 12) & 0xF;
}

static inline int offset_to_buffer(uint64_t offset)
{
    return (offset >> 16) & 0xFFFF;
}

static inline uint64_t offset_make(int node, int buffer)
{
    return (((uint64_t)node & 0xF) << 12) | (((uint64_t)buffer & 0xFFFF) << 16);
}

#endif /* _RTE_OS_H_ */