memory: add --socket-mem option

On NUMA systems, --socket-mem makes it possible to select the nodes where
allocations will go, by reserving a specified amount of memory on each socket.
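
For example (an illustrative invocation; the application name, coremask and
channel count are placeholders), to reserve 1024 MB on socket 0 and 2048 MB
on socket 1:

    ./app -c 0xf -n 4 --socket-mem=1024,2048

Amounts are given in megabytes, one comma-separated value per socket.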

Signed-off-by: Intel
Intel 2012-12-20 00:00:00 +01:00 committed by Thomas Monjalon
parent ea5dd2744b
commit b6a468ad41
4 changed files with 453 additions and 80 deletions


@@ -80,11 +80,14 @@
 #define OPT_NO_PCI "no-pci"
 #define OPT_NO_HUGE "no-huge"
 #define OPT_FILE_PREFIX "file-prefix"
+#define OPT_SOCKET_MEM "socket-mem"
 #define RTE_EAL_BLACKLIST_SIZE 0x100
 #define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL)
+#define SOCKET_MEM_STRLEN (RTE_MAX_NUMA_NODES * 10)
 #define GET_BLACKLIST_FIELD(in, fd, lim, dlm) \
 { \
     unsigned long val; \
@@ -293,6 +296,8 @@ eal_usage(const char *prgname)
            " (multiple -b options are allowed)\n"
            " -m MB : memory to allocate (see also --"OPT_SOCKET_MEM")\n"
            " -r NUM : force number of memory ranks (don't detect)\n"
+           " --"OPT_SOCKET_MEM" : memory to allocate on specific \n"
+           "   sockets (use comma separated values)\n"
            " --"OPT_HUGE_DIR" : directory where hugetlbfs is mounted\n"
            " --"OPT_PROC_TYPE" : type of this process\n"
            " --"OPT_FILE_PREFIX": prefix for hugepage filenames\n"
@@ -339,16 +344,69 @@ eal_parse_coremask(const char *coremask)
     return 0;
 }

+static int
+eal_parse_socket_mem(char *socket_mem)
+{
+    char * arg[RTE_MAX_NUMA_NODES];
+    char *end;
+    int arg_num, i, len;
+    uint64_t total_mem = 0;
+
+    len = strnlen(socket_mem, SOCKET_MEM_STRLEN);
+    if (len == SOCKET_MEM_STRLEN) {
+        RTE_LOG(ERR, EAL, "--socket-mem is too long\n");
+        return -1;
+    }
+
+    /* all other error cases will be caught later */
+    if (!isdigit(socket_mem[len-1]))
+        return -1;
+
+    /* split the optarg into separate socket values */
+    arg_num = rte_strsplit(socket_mem, len,
+            arg, RTE_MAX_NUMA_NODES, ',');
+
+    /* if split failed, or 0 arguments */
+    if (arg_num <= 0)
+        return -1;
+
+    internal_config.force_sockets = 1;
+
+    /* parse each defined socket option */
+    errno = 0;
+    for (i = 0; i < arg_num; i++) {
+        end = NULL;
+        internal_config.socket_mem[i] = strtoull(arg[i], &end, 10);
+
+        /* check for invalid input */
+        if ((errno != 0) ||
+                (arg[i][0] == '\0') || (end == NULL) || (*end != '\0'))
+            return -1;
+        internal_config.socket_mem[i] *= 1024ULL;
+        internal_config.socket_mem[i] *= 1024ULL;
+        total_mem += internal_config.socket_mem[i];
+    }
+
+    /* check if we have a positive amount of total memory */
+    if (total_mem == 0)
+        return -1;
+
+    return 0;
+}
+
 static inline uint64_t
 eal_get_hugepage_mem_size(void)
 {
     uint64_t size = 0;
-    unsigned i;
+    unsigned i, j;

-    for (i = 0; i < internal_config.num_hugepage_sizes; i++){
+    for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
         struct hugepage_info *hpi = &internal_config.hugepage_info[i];
-        if (hpi->hugedir != NULL)
-            size += hpi->hugepage_sz * hpi->num_pages;
+        if (hpi->hugedir != NULL) {
+            for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+                size += hpi->hugepage_sz * hpi->num_pages[j];
+            }
+        }
     }

     return (size);
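
As a sketch of what eal_parse_socket_mem() above produces (hypothetical
input; each comma-separated value is megabytes for one socket and is
converted to bytes; sockets not listed keep their zeroed default):

    --socket-mem=1024,0,512
        socket_mem[0] = 1024 MB, socket_mem[1] = 0, socket_mem[2] = 512 MB
        force_sockets = 1, total requested memory = 1536 MB
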
@@ -401,7 +459,7 @@ eal_parse_blacklist_opt(const char *optarg, size_t idx)
 static int
 eal_parse_args(int argc, char **argv)
 {
-    int opt, ret;
+    int opt, ret, i;
     char **argvopt;
     int option_index;
     int coremask_ok = 0;
@@ -415,6 +473,7 @@ eal_parse_args(int argc, char **argv)
         {OPT_NO_SHCONF, 0, 0, 0},
         {OPT_PROC_TYPE, 1, 0, 0},
         {OPT_FILE_PREFIX, 1, 0, 0},
+        {OPT_SOCKET_MEM, 1, 0, 0},
         {0, 0, 0, 0}
     };
@@ -425,11 +484,15 @@ eal_parse_args(int argc, char **argv)
     internal_config.force_nchannel = 0;
     internal_config.hugefile_prefix = HUGEFILE_PREFIX_DEFAULT;
     internal_config.hugepage_dir = NULL;
+    internal_config.force_sockets = 0;
 #ifdef RTE_LIBEAL_USE_HPET
     internal_config.no_hpet = 0;
 #else
     internal_config.no_hpet = 1;
 #endif
+    /* zero out the NUMA config */
+    for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
+        internal_config.socket_mem[i] = 0;

     while ((opt = getopt_long(argc, argvopt, "b:c:m:n:r:v",
                 lgopts, &option_index)) != EOF) {
@@ -508,6 +571,14 @@ eal_parse_args(int argc, char **argv)
             else if (!strcmp(lgopts[option_index].name, OPT_FILE_PREFIX)) {
                 internal_config.hugefile_prefix = optarg;
             }
+            else if (!strcmp(lgopts[option_index].name, OPT_SOCKET_MEM)) {
+                if (eal_parse_socket_mem(optarg) < 0) {
+                    RTE_LOG(ERR, EAL, "invalid parameters for --"
+                            OPT_SOCKET_MEM "\n");
+                    eal_usage(prgname);
+                    return -1;
+                }
+            }
             break;

         default:
@@ -541,6 +612,21 @@ eal_parse_args(int argc, char **argv)
         eal_usage(prgname);
         return -1;
     }
+    if (internal_config.memory > 0 && internal_config.force_sockets == 1) {
+        RTE_LOG(ERR, EAL, "Options -m and --socket-mem cannot be specified "
+                "at the same time\n");
+        eal_usage(prgname);
+        return -1;
+    }
+
+    /* --no-huge doesn't make sense with either -m or --socket-mem */
+    if (internal_config.no_hugetlbfs &&
+            (internal_config.memory > 0 ||
+             internal_config.force_sockets == 1)) {
+        RTE_LOG(ERR, EAL, "Options -m or --socket-mem cannot be specified "
+                "together with --no-huge!\n");
+        eal_usage(prgname);
+        return -1;
+    }

     if (blacklist_index > 0)
         rte_eal_pci_set_blacklist(eal_dev_blacklist, blacklist_index);
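
With these checks in place, combinations such as the following (hypothetical
command lines) are rejected during argument parsing:

    ./app -m 1024 --socket-mem=512,512    (-m and --socket-mem are exclusive)
    ./app --no-huge --socket-mem=1024     (--socket-mem requires hugepages)
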
@@ -548,11 +634,35 @@ eal_parse_args(int argc, char **argv)
     if (optind >= 0)
         argv[optind-1] = prgname;

+    /* if no memory amounts were requested, this will result in 0 and
+     * will be overridden later, right after eal_hugepage_info_init() */
+    for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
+        internal_config.memory += internal_config.socket_mem[i];
+
     ret = optind-1;
     optind = 0; /* reset getopt lib */
     return ret;
 }

+static void
+eal_check_mem_on_local_socket(void)
+{
+    const struct rte_memseg *ms;
+    int i, socket_id;
+
+    socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);
+
+    ms = rte_eal_get_physmem_layout();
+
+    for (i = 0; i < RTE_MAX_MEMSEG; i++)
+        if (ms[i].socket_id == socket_id &&
+                ms[i].len > 0)
+            return;
+
+    RTE_LOG(WARNING, EAL, "WARNING: Master core has no "
+            "memory on local socket!\n");
+}
+
 /* Launch threads, called at application init(). */
 int
 rte_eal_init(int argc, char **argv)
@@ -572,7 +682,7 @@ rte_eal_init(int argc, char **argv)
     if (eal_hugepage_info_init() < 0)
         rte_panic("Cannot get hugepage information\n");

-    if (internal_config.memory == 0) {
+    if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
         if (internal_config.no_hugetlbfs)
             internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
         else
@@ -612,6 +722,8 @@ rte_eal_init(int argc, char **argv)
     RTE_LOG(DEBUG, EAL, "Master core %u is ready (tid=%x)\n",
             rte_config.master_lcore, (int)thread_id);

+    eal_check_mem_on_local_socket();
+
     RTE_LCORE_FOREACH_SLAVE(i) {

         /*


@@ -174,6 +174,11 @@ swap_hpi(struct hugepage_info *a, struct hugepage_info *b)
     memcpy(b, buf, sizeof(*a));
 }

+/*
+ * when we initialize the hugepage info, everything goes
+ * to socket 0 by default. it will later get sorted by memory
+ * initialization procedure.
+ */
 int
 eal_hugepage_info_init(void)
 {
@@ -192,16 +197,27 @@ eal_hugepage_info_init(void)
         struct hugepage_info *hpi = \
             &internal_config.hugepage_info[num_sizes];
         hpi->hugepage_sz = rte_str_to_size(&dirent->d_name[dirent_start_len]);
-        hpi->num_pages = get_num_hugepages(dirent->d_name);
         hpi->hugedir = get_hugepage_dir(hpi->hugepage_sz);
+
+        /* first, check if we have a mountpoint */
         if (hpi->hugedir == NULL){
             RTE_LOG(INFO, EAL, "%u hugepages of size %llu reserved, "\
                     "but no mounted hugetlbfs found for that size\n",
-                    hpi->num_pages,
+                    (unsigned) get_num_hugepages(dirent->d_name),
                     (unsigned long long)hpi->hugepage_sz);
-            hpi->num_pages = 0;
-        } else
+        } else {
+            /* for now, put all pages into socket 0,
+             * later they will be sorted */
+            hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+
+#ifndef RTE_ARCH_X86_64
+            /* for 32-bit systems, limit number of hugepages to 1GB per page size */
+            hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
+                    RTE_PGSIZE_1G / hpi->hugepage_sz);
+#endif
             num_sizes++;
+        }
     }
     dirent = readdir(dir);
 }
@@ -221,8 +237,9 @@ eal_hugepage_info_init(void)
     /* now we have all info, check we have at least one valid size */
     for (i = 0; i < num_sizes; i++)
         if (internal_config.hugepage_info[i].hugedir != NULL &&
-                internal_config.hugepage_info[i].num_pages > 0)
+                internal_config.hugepage_info[i].num_pages[0] > 0)
             return 0;
+
     /* no valid hugepage mounts available, return error */
     return -1;
 }
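
To illustrate the 32-bit cap above with sample numbers: for 2 MB hugepages,
RTE_PGSIZE_1G / hugepage_sz = 512, so num_pages[0] is clamped to 512 pages,
i.e. at most 1 GB per hugepage size on non-x86_64 builds.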


@@ -176,7 +176,7 @@ map_all_hugepages(struct hugepage *hugepg_tbl,
     void *vma_addr = NULL;
     uint64_t vma_len = 0;

-    for (i = 0; i < hpi->num_pages; i++) {
+    for (i = 0; i < hpi->num_pages[0]; i++) {
         uint64_t hugepage_sz = hpi->hugepage_sz;

         if (orig) {
@@ -203,7 +203,7 @@ map_all_hugepages(struct hugepage *hugepg_tbl,
             /* reserve a virtual area for next contiguous
              * physical block: count the number of
              * contiguous physical pages. */
-            for (j = i+1; j < hpi->num_pages ; j++) {
+            for (j = i+1; j < hpi->num_pages[0] ; j++) {
                 if (hugepg_tbl[j].physaddr !=
                         hugepg_tbl[j-1].physaddr + hugepage_sz)
                     break;
@@ -255,7 +255,7 @@ static int
 unmap_all_hugepages_orig(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
 {
     unsigned i;
-    for (i = 0; i < hpi->num_pages; i++) {
+    for (i = 0; i < hpi->num_pages[0]; i++) {
         if (hugepg_tbl[i].orig_va) {
             munmap(hugepg_tbl[i].orig_va, hpi->hugepage_sz);
             hugepg_tbl[i].orig_va = NULL;
@@ -287,7 +287,7 @@ find_physaddr(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
         return -1;
     }

-    for (i = 0; i < hpi->num_pages; i++) {
+    for (i = 0; i < hpi->num_pages[0]; i++) {
         off_t offset;
         virt_pfn = (unsigned long)hugepg_tbl[i].orig_va /
             page_size;
@@ -377,7 +377,7 @@ find_numasocket(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
         }

         /* if we find this page in our mappings, set socket_id */
-        for (i = 0; i < hpi->num_pages; i++) {
+        for (i = 0; i < hpi->num_pages[0]; i++) {
             void *va = (void *)(unsigned long)virt_addr;
             if (hugepg_tbl[i].orig_va == va) {
                 hugepg_tbl[i].socket_id = socket_id;
@@ -385,8 +385,10 @@ find_numasocket(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
             }
         }
     }
-    if (hp_count < hpi->num_pages)
+
+    if (hp_count < hpi->num_pages[0])
         goto error;

     fclose(f);
     return 0;
@@ -408,7 +410,7 @@ sort_by_physaddr(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
     uint64_t smallest_addr;
     struct hugepage tmp;

-    for (i = 0; i < hpi->num_pages; i++) {
+    for (i = 0; i < hpi->num_pages[0]; i++) {
         smallest_addr = 0;
         smallest_idx = -1;
@@ -416,7 +418,7 @@ sort_by_physaddr(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
         /*
          * browse all entries starting at 'i', and find the
          * entry with the smallest addr
          */
-        for (j=i; j<hpi->num_pages; j++) {
+        for (j=i; j< hpi->num_pages[0]; j++) {

             if (smallest_addr == 0 ||
                     hugepg_tbl[j].physaddr < smallest_addr) {
@@ -461,53 +463,179 @@ create_shared_memory(const char *filename, const size_t mem_size)
 }

 /*
- * This function takes in the list of hugepage sizes and the
+ * this copies *active* hugepages from one hugepage table to another.
+ * destination is typically the shared memory.
+ */
+static int
+copy_hugepages_to_shared_mem(struct hugepage * dst, int dest_size,
+        const struct hugepage * src, int src_size)
+{
+    int src_pos, dst_pos = 0;
+
+    for (src_pos = 0; src_pos < src_size; src_pos++) {
+        if (src[src_pos].final_va != NULL) {
+            /* error on overflow attempt */
+            if (dst_pos == dest_size)
+                return -1;
+            memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage));
+            dst_pos++;
+        }
+    }
+    return 0;
+}
+
+/*
+ * unmaps hugepages that are not going to be used. since we originally allocate
+ * ALL hugepages (not just those we need), additional unmapping needs to be done.
+ */
+static int
+unmap_unneeded_hugepages(struct hugepage *hugepg_tbl,
+        struct hugepage_info *hpi,
+        unsigned num_hp_info)
+{
+    unsigned socket, size;
+    int page, nrpages = 0;
+    int fd;
+
+    /* get total number of hugepages */
+    for (size = 0; size < num_hp_info; size++)
+        for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
+            nrpages += internal_config.hugepage_info[size].num_pages[socket];
+
+    for (size = 0; size < num_hp_info; size++) {
+        for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
+            unsigned pages_found = 0;
+            /* traverse until we have unmapped all the unused pages */
+            for (page = 0; page < nrpages; page++) {
+                struct hugepage *hp = &hugepg_tbl[page];
+
+                /* find a page that matches the criteria */
+                if ((hp->size == hpi[size].hugepage_sz) &&
+                        (hp->socket_id == (int) socket)) {
+                    /* if we skipped enough pages, unmap the rest */
+                    if (pages_found == hpi[size].num_pages[socket]) {
+                        munmap(hp->final_va, hp->size);
+                        hp->final_va = NULL;
+                    }
+                    else {
+                        pages_found++;
+                    }
+                } /* match page */
+            } /* foreach page */
+        } /* foreach socket */
+    } /* foreach pagesize */
+    return 0;
+}
+
+static inline uint64_t
+get_socket_mem_size(int socket)
+{
+    uint64_t size = 0;
+    unsigned i;
+
+    for (i = 0; i < internal_config.num_hugepage_sizes; i++){
+        struct hugepage_info *hpi = &internal_config.hugepage_info[i];
+        if (hpi->hugedir != NULL)
+            size += hpi->hugepage_sz * hpi->num_pages[socket];
+    }
+
+    return (size);
+}
+
+/*
+ * This function is a NUMA-aware equivalent of calc_num_pages.
+ * It takes in the list of hugepage sizes and the
  * number of pages thereof, and calculates the best number of
  * pages of each size to fulfill the request for <memory> ram
  */
 static int
-calc_num_pages(uint64_t memory,
+calc_num_pages_per_socket(uint64_t * memory,
         struct hugepage_info *hp_info,
         struct hugepage_info *hp_used,
         unsigned num_hp_info)
 {
-    unsigned i = 0;
+    unsigned socket, j, i = 0;
+    unsigned requested, available;
     int total_num_pages = 0;
+    uint64_t remaining_mem, cur_mem;
+    uint64_t total_mem = internal_config.memory;

     if (num_hp_info == 0)
         return -1;

-    for (i = 0; i < num_hp_info; i++){
-        hp_used[i].hugepage_sz = hp_info[i].hugepage_sz;
-        hp_used[i].hugedir = hp_info[i].hugedir;
-        hp_used[i].num_pages = RTE_MIN(memory / hp_info[i].hugepage_sz,
-                hp_info[i].num_pages);
-
-        memory -= hp_used[i].num_pages * hp_used[i].hugepage_sz;
-        total_num_pages += hp_used[i].num_pages;
-
-        /* check if we have met all memory requests */
-        if (memory == 0)
-            break;
-
-        /* check if we have any more pages left at this size, if so
-         * move on to next size */
-        if (hp_used[i].num_pages == hp_info[i].num_pages)
-            continue;
-
-        /* At this point we know that there are more pages available that are
-         * bigger than the memory we want, so let's see if we can get enough
-         * from other page sizes.
-         */
-        unsigned j;
-        uint64_t remaining_mem = 0;
-        for (j = i+1; j < num_hp_info; j++)
-            remaining_mem += hp_info[j].hugepage_sz * hp_info[j].num_pages;
-
-        /* is there enough other memory, if not allocate another page and quit */
-        if (remaining_mem < memory){
-            memory -= hp_info[i].hugepage_sz;
-            hp_used[i].num_pages++;
-            total_num_pages++;
-            break; /* we are done */
+    for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
+        /* if specific memory amounts per socket weren't requested */
+        if (internal_config.force_sockets == 0) {
+            /* take whatever is available */
+            memory[socket] = RTE_MIN(get_socket_mem_size(socket),
+                    total_mem);
         }
+
+        /* skips if the memory on specific socket wasn't requested */
+        for (i = 0; i < num_hp_info && memory[socket] != 0; i++){
+            hp_used[i].hugedir = hp_info[i].hugedir;
+            hp_used[i].num_pages[socket] = RTE_MIN(
+                    memory[socket] / hp_info[i].hugepage_sz,
+                    hp_info[i].num_pages[socket]);
+
+            cur_mem = hp_used[i].num_pages[socket] *
+                    hp_used[i].hugepage_sz;
+
+            memory[socket] -= cur_mem;
+            total_mem -= cur_mem;
+
+            total_num_pages += hp_used[i].num_pages[socket];
+
+            /* check if we have met all memory requests */
+            if (memory[socket] == 0)
+                break;
+
+            /* check if we have any more pages left at this size, if so
+             * move on to next size */
+            if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket])
+                continue;
+
+            /* At this point we know that there are more pages available that are
+             * bigger than the memory we want, so let's see if we can get enough
+             * from other page sizes.
+             */
+            remaining_mem = 0;
+            for (j = i+1; j < num_hp_info; j++)
+                remaining_mem += hp_info[j].hugepage_sz *
+                        hp_info[j].num_pages[socket];
+
+            /* is there enough other memory, if not allocate another page and quit */
+            if (remaining_mem < memory[socket]){
+                cur_mem = RTE_MIN(memory[socket],
+                        hp_info[i].hugepage_sz);
+                memory[socket] -= cur_mem;
+                total_mem -= cur_mem;
+                hp_used[i].num_pages[socket]++;
+                total_num_pages++;
+                break; /* we are done with this socket */
+            }
+        }
+
+        /* if we didn't satisfy all memory requirements per socket */
+        if (memory[socket] > 0) {
+            /* to prevent icc errors */
+            requested = (unsigned) (internal_config.socket_mem[socket] /
+                    0x100000);
+            available = requested -
+                    ((unsigned) (memory[socket] / 0x100000));
+            RTE_LOG(INFO, EAL, "Not enough memory available on socket %u! "
+                    "Requested: %uMB, available: %uMB\n", socket,
+                    requested, available);
+            return -1;
+        }
+    }
+
+    /* if we didn't satisfy total memory requirements */
+    if (total_mem > 0) {
+        requested = (unsigned) (internal_config.memory / 0x100000);
+        available = requested - (unsigned) (total_mem / 0x100000);
+        RTE_LOG(INFO, EAL, "Not enough memory available! Requested: %uMB,"
+                " available: %uMB\n", requested, available);
+        return -1;
     }

     return total_num_pages;
 }
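
A worked example of the per-socket fitting above, with illustrative numbers
and assuming the 1 GB size is tried first: requesting 1100 MB on a socket
that has 4 free 1 GB pages and 512 free 2 MB pages, the 1 GB pass takes
RTE_MIN(1100 MB / 1024 MB, 4) = 1 page, leaving 76 MB; the remaining 2 MB
pages hold 1024 MB >= 76 MB, so the next pass takes 38 x 2 MB pages and the
request is met exactly. Had the smaller sizes held less than 76 MB, one
extra 1 GB page would have been taken instead and the loop would stop for
that socket.
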
@@ -527,10 +655,14 @@ static int
 rte_eal_hugepage_init(void)
 {
     struct rte_mem_config *mcfg;
-    struct hugepage *hugepage;
+    struct hugepage *hugepage, *tmp_hp = NULL;
     struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
+    uint64_t memory[RTE_MAX_NUMA_NODES];
+    unsigned hp_offset;
     int i, j, new_memseg;
-    int nrpages;
+    int nrpages, total_pages = 0;
     void *addr;

     memset(used_hp, 0, sizeof(used_hp));
@@ -541,66 +673,169 @@ rte_eal_hugepage_init(void)
     /* for debug purposes, hugetlbfs can be disabled */
     if (internal_config.no_hugetlbfs) {
         addr = malloc(internal_config.memory);
-        mcfg->memseg[0].phys_addr = (unsigned long)addr;
+        mcfg->memseg[0].phys_addr = (phys_addr_t)(uintptr_t)addr;
         mcfg->memseg[0].addr = addr;
         mcfg->memseg[0].len = internal_config.memory;
         mcfg->memseg[0].socket_id = 0;
         return 0;
     }

-    nrpages = calc_num_pages(internal_config.memory,
-            &internal_config.hugepage_info[0], &used_hp[0],
-            internal_config.num_hugepage_sizes);
-    for (i = 0; i < (int)internal_config.num_hugepage_sizes; i++)
-        RTE_LOG(INFO, EAL, "Requesting %u pages of size %"PRIu64"\n",
-                used_hp[i].num_pages, used_hp[i].hugepage_sz);
-
-    hugepage = create_shared_memory(eal_hugepage_info_path(),
-            nrpages * sizeof(struct hugepage));
-    if (hugepage == NULL)
-        return -1;
-    memset(hugepage, 0, nrpages * sizeof(struct hugepage));
-
-    unsigned hp_offset = 0; /* where we start the current page size entries */
+    /* calculate total number of hugepages available. at this point we haven't
+     * yet started sorting them so they all are on socket 0 */
+    for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
+        /* meanwhile, also initialize used_hp hugepage sizes in used_hp */
+        used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz;
+
+        total_pages += internal_config.hugepage_info[i].num_pages[0];
+    }
+
+    /*
+     * allocate a memory area for hugepage table.
+     * this isn't shared memory yet. due to the fact that we need some
+     * processing done on these pages, shared memory will be created
+     * at a later stage.
+     */
+    tmp_hp = malloc(total_pages * sizeof(struct hugepage));
+    if (tmp_hp == NULL)
+        goto fail;
+
+    memset(tmp_hp, 0, total_pages * sizeof(struct hugepage));
+
+    hp_offset = 0; /* where we start the current page size entries */
+
+    /* map all hugepages and sort them */
     for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
-        struct hugepage_info *hpi = &used_hp[i];
+        struct hugepage_info *hpi;
+
+        /*
+         * we don't yet mark hugepages as used at this stage, so
+         * we just map all hugepages available to the system
+         * all hugepages are still located on socket 0
+         */
+        hpi = &internal_config.hugepage_info[i];
+
         if (hpi->num_pages == 0)
             continue;

-        if (map_all_hugepages(&hugepage[hp_offset], hpi, 1) < 0){
+        /* map all hugepages available */
+        if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
             RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
                     (unsigned)(hpi->hugepage_sz / 0x100000));
             goto fail;
         }

-        if (find_physaddr(&hugepage[hp_offset], hpi) < 0){
+        /* find physical addresses and sockets for each hugepage */
+        if (find_physaddr(&tmp_hp[hp_offset], hpi) < 0){
             RTE_LOG(DEBUG, EAL, "Failed to find phys addr for %u MB pages\n",
                     (unsigned)(hpi->hugepage_sz / 0x100000));
             goto fail;
         }

-        if (find_numasocket(&hugepage[hp_offset], hpi) < 0){
+        if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
             RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
                     (unsigned)(hpi->hugepage_sz / 0x100000));
             goto fail;
         }

-        if (sort_by_physaddr(&hugepage[hp_offset], hpi) < 0)
+        if (sort_by_physaddr(&tmp_hp[hp_offset], hpi) < 0)
             goto fail;

-        if (map_all_hugepages(&hugepage[hp_offset], hpi, 0) < 0){
+        /* remap all hugepages */
+        if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
             RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
                     (unsigned)(hpi->hugepage_sz / 0x100000));
             goto fail;
         }

-        if (unmap_all_hugepages_orig(&hugepage[hp_offset], hpi) < 0)
+        /* unmap original mappings */
+        if (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0)
             goto fail;

         /* we have processed a num of hugepages of this size, so inc offset */
-        hp_offset += hpi->num_pages;
+        hp_offset += hpi->num_pages[0];
     }

+    /* clean out the numbers of pages */
+    for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++)
+        for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
+            internal_config.hugepage_info[i].num_pages[j] = 0;
+
+    /* get hugepages for each socket */
+    for (i = 0; i < total_pages; i++) {
+        int socket = tmp_hp[i].socket_id;
+
+        /* find a hugepage info with right size and increment num_pages */
+        for (j = 0; j < (int) internal_config.num_hugepage_sizes; j++) {
+            if (tmp_hp[i].size ==
+                    internal_config.hugepage_info[j].hugepage_sz) {
+                internal_config.hugepage_info[j].num_pages[socket]++;
+            }
+        }
+    }
+
+    /* make a copy of socket_mem, needed for number of pages calculation */
+    for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
+        memory[i] = internal_config.socket_mem[i];
+
+    /* calculate final number of pages */
+    nrpages = calc_num_pages_per_socket(memory,
+            internal_config.hugepage_info, used_hp,
+            internal_config.num_hugepage_sizes);
+
+    /* error if not enough memory available */
+    if (nrpages < 0)
+        goto fail;
+
+    /* reporting in! */
+    for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
+        for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+            if (used_hp[i].num_pages[j] > 0) {
+                RTE_LOG(INFO, EAL,
+                        "Requesting %u pages of size %uMB"
+                        " from socket %i\n",
+                        used_hp[i].num_pages[j],
+                        (unsigned)
+                        (used_hp[i].hugepage_sz / 0x100000),
+                        j);
+            }
+        }
+    }
+
+    /* create shared memory */
+    hugepage = create_shared_memory(eal_hugepage_info_path(),
+            nrpages * sizeof(struct hugepage));
+
+    if (hugepage == NULL) {
+        RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
+        goto fail;
+    }
+
+    /*
+     * unmap pages that we won't need (looks at used_hp).
+     * also, sets final_va to NULL on pages that were unmapped.
+     */
+    if (unmap_unneeded_hugepages(tmp_hp, used_hp,
+            internal_config.num_hugepage_sizes) < 0) {
+        RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n");
+        goto fail;
+    }
+
+    /*
+     * copy stuff from malloc'd hugepage* to the actual shared memory.
+     * this procedure only copies those hugepages that have final_va
+     * not NULL. has overflow protection.
+     */
+    if (copy_hugepages_to_shared_mem(hugepage, nrpages,
+            tmp_hp, total_pages) < 0) {
+        RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n");
+        goto fail;
+    }
+
+    /* free the temporary hugepage table */
+    free(tmp_hp);
+    tmp_hp = NULL;
+
     memset(mcfg->memseg, 0, sizeof(mcfg->memseg));
     j = -1;
     for (i = 0; i < nrpages; i++) {
@@ -614,10 +849,10 @@ rte_eal_hugepage_init(void)
         else if (hugepage[i].size != hugepage[i-1].size)
             new_memseg = 1;
         else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=
                 hugepage[i].size)
             new_memseg = 1;
         else if (((unsigned long)hugepage[i].final_va -
                 (unsigned long)hugepage[i-1].final_va) != hugepage[i].size)
             new_memseg = 1;

         if (new_memseg) {
@@ -641,7 +876,9 @@ rte_eal_hugepage_init(void)
     return 0;

 fail:
+    if (tmp_hp)
+        free(tmp_hp);
     return -1;
 }
@@ -783,6 +1020,7 @@ rte_eal_memdevice_init(void)
 int
 rte_eal_memory_init(void)
 {
+    RTE_LOG(INFO, EAL, "Setting up hugepage memory...\n");
     const int retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
             rte_eal_hugepage_init() :
             rte_eal_hugepage_attach();


@@ -40,6 +40,8 @@
 #ifndef _EAL_LINUXAPP_INTERNAL_CFG
 #define _EAL_LINUXAPP_INTERNAL_CFG

+#include <rte_eal.h>
+
 #define MAX_HUGEPAGE_SIZES 3 /**< support up to 3 page sizes */

 /*
@@ -49,7 +51,8 @@
 struct hugepage_info {
     uint64_t hugepage_sz;   /**< size of a huge page */
     const char *hugedir;    /**< dir where hugetlbfs is mounted */
-    uint32_t num_pages;     /**< number of hugepages of that size */
+    uint32_t num_pages[RTE_MAX_NUMA_NODES];
+    /**< number of hugepages of that size on each socket */
 };

 /**
@@ -64,6 +67,9 @@ struct internal_config {
     volatile unsigned no_hpet;    /**< true to disable HPET */
     volatile unsigned no_shconf;  /**< true if there is no shared config */
     volatile enum rte_proc_type_t process_type; /* multi-process proc type */
+    /* true to try allocating memory on specific sockets */
+    volatile unsigned force_sockets;
+    volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; /**< amount of memory per socket */
     const char *hugefile_prefix;  /**< the base filename of hugetlbfs files */
     const char *hugepage_dir;     /**< specific hugetlbfs directory to use */