1087 lines
25 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2018 Intel Corporation.
* Copyright(c) 2014 6WIND S.A.
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdarg.h>
#include <unistd.h>
#include <pthread.h>
#include <syslog.h>
#include <getopt.h>
#include <sys/file.h>
#include <stddef.h>
#include <errno.h>
#include <limits.h>
#include <sys/mman.h>
#include <sys/queue.h>
#include <sys/stat.h>
#include <rte_compat.h>
#include <rte_common.h>
#include <rte_debug.h>
#include <rte_memory.h>
#include <rte_launch.h>
#include <rte_eal.h>
#include <rte_errno.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_service_component.h>
#include <rte_log.h>
#include <rte_random.h>
#include <rte_cycles.h>
#include <rte_string_fns.h>
#include <rte_cpuflags.h>
#include <rte_interrupts.h>
#include <rte_bus.h>
#include <rte_dev.h>
#include <rte_devargs.h>
#include <rte_version.h>
#include <rte_vfio.h>
#include <rte_option.h>
#include <rte_atomic.h>
#include <malloc_heap.h>
#include "eal_private.h"
#include "eal_thread.h"
#include "eal_internal_cfg.h"
#include "eal_filesystem.h"
#include "eal_hugepages.h"
#include "eal_options.h"
#include "eal_memcfg.h"
#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL)
/* Allow the application to print its usage message too if set */
static rte_usage_hook_t rte_application_usage_hook = NULL;
/* early configuration structure, when memory config is not mmapped */
static struct rte_mem_config early_mem_config;
/* define fd variable here, because file needs to be kept open for the
* duration of the program, as we hold a write lock on it in the primary proc */
static int mem_cfg_fd = -1;
static struct flock wr_lock = {
.l_type = F_WRLCK,
.l_whence = SEEK_SET,
mem: replace memseg with memseg lists Before, we were aggregating multiple pages into one memseg, so the number of memsegs was small. Now, each page gets its own memseg, so the list of memsegs is huge. To accommodate the new memseg list size and to keep the under-the-hood workings sane, the memseg list is now not just a single list, but multiple lists. To be precise, each hugepage size available on the system gets one or more memseg lists, per socket. In order to support dynamic memory allocation, we reserve all memory in advance (unless we're in 32-bit legacy mode, in which case we do not preallocate memory). As in, we do an anonymous mmap() of the entire maximum size of memory per hugepage size, per socket (which is limited to either RTE_MAX_MEMSEG_PER_TYPE pages or RTE_MAX_MEM_MB_PER_TYPE megabytes worth of memory, whichever is the smaller one), split over multiple lists (which are limited to either RTE_MAX_MEMSEG_PER_LIST memsegs or RTE_MAX_MEM_MB_PER_LIST megabytes per list, whichever is the smaller one). There is also a global limit of CONFIG_RTE_MAX_MEM_MB megabytes, which is mainly used for 32-bit targets to limit amounts of preallocated memory, but can be used to place an upper limit on total amount of VA memory that can be allocated by DPDK application. So, for each hugepage size, we get (by default) up to 128G worth of memory, per socket, split into chunks of up to 32G in size. The address space is claimed at the start, in eal_common_memory.c. The actual page allocation code is in eal_memalloc.c (Linux-only), and largely consists of copied EAL memory init code. Pages in the list are also indexed by address. That is, in order to figure out where the page belongs, one can simply look at base address for a memseg list. Similarly, figuring out IOVA address of a memzone is a matter of finding the right memseg list, getting offset and dividing by page size to get the appropriate memseg. This commit also removes rte_eal_dump_physmem_layout() call, according to deprecation notice [1], and removes that deprecation notice as well. On 32-bit targets due to limited VA space, DPDK will no longer spread memory to different sockets like before. Instead, it will (by default) allocate all of the memory on socket where master lcore is. To override this behavior, --socket-mem must be used. The rest of the changes are really ripple effects from the memseg change - heap changes, compile fixes, and rewrites to support fbarray-backed memseg lists. Due to earlier switch to _walk() functions, most of the changes are simple fixes, however some of the _walk() calls were switched to memseg list walk, where it made sense to do so. Additionally, we are also switching locks from flock() to fcntl(). Down the line, we will be introducing single-file segments option, and we cannot use flock() locks to lock parts of the file. Therefore, we will use fcntl() locks for legacy mem as well, in case someone is unfortunate enough to accidentally start legacy mem primary process alongside an already working non-legacy mem-based primary process. [1] http://dpdk.org/dev/patchwork/patch/34002/ Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com> Tested-by: Santosh Shukla <santosh.shukla@caviumnetworks.com> Tested-by: Hemant Agrawal <hemant.agrawal@nxp.com> Tested-by: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
2018-04-11 13:30:24 +01:00
.l_start = offsetof(struct rte_mem_config, memsegs),
.l_len = sizeof(early_mem_config.memsegs),
};
/* Address of global and public configuration */
static struct rte_config rte_config = {
.mem_config = &early_mem_config,
};
/* internal configuration (per-core) */
struct lcore_config lcore_config[RTE_MAX_LCORE];
/* internal configuration */
struct internal_config internal_config;
/* used by rte_rdtsc() */
int rte_cycles_vmware_tsc_map;
/* platform-specific runtime dir */
static char runtime_dir[PATH_MAX];
static const char *default_runtime_dir = "/var/run";
int
eal_create_runtime_dir(void)
{
const char *directory = default_runtime_dir;
const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR");
const char *fallback = "/tmp";
char tmp[PATH_MAX];
int ret;
if (getuid() != 0) {
/* try XDG path first, fall back to /tmp */
if (xdg_runtime_dir != NULL)
directory = xdg_runtime_dir;
else
directory = fallback;
}
/* create DPDK subdirectory under runtime dir */
ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory);
if (ret < 0 || ret == sizeof(tmp)) {
RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n");
return -1;
}
/* create prefix-specific subdirectory under DPDK runtime dir */
ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s",
tmp, eal_get_hugefile_prefix());
if (ret < 0 || ret == sizeof(runtime_dir)) {
RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n");
return -1;
}
/* create the path if it doesn't exist. no "mkdir -p" here, so do it
* step by step.
*/
ret = mkdir(tmp, 0700);
if (ret < 0 && errno != EEXIST) {
RTE_LOG(ERR, EAL, "Error creating '%s': %s\n",
tmp, strerror(errno));
return -1;
}
ret = mkdir(runtime_dir, 0700);
if (ret < 0 && errno != EEXIST) {
RTE_LOG(ERR, EAL, "Error creating '%s': %s\n",
runtime_dir, strerror(errno));
return -1;
}
return 0;
}
int
eal_clean_runtime_dir(void)
{
/* FreeBSD doesn't need this implemented for now, because, unlike Linux,
* FreeBSD doesn't create per-process files, so no need to clean up.
*/
return 0;
}
const char *
rte_eal_get_runtime_dir(void)
{
return runtime_dir;
}
/* Return user provided mbuf pool ops name */
const char *
rte_eal_mbuf_user_pool_ops(void)
{
return internal_config.user_mbuf_pool_ops_name;
}
/* Return a pointer to the configuration structure */
struct rte_config *
rte_eal_get_configuration(void)
{
return &rte_config;
}
enum rte_iova_mode
rte_eal_iova_mode(void)
{
return rte_eal_get_configuration()->iova_mode;
}
/* parse a sysfs (or other) file containing one integer value */
int
eal_parse_sysfs_value(const char *filename, unsigned long *val)
{
FILE *f;
char buf[BUFSIZ];
char *end = NULL;
if ((f = fopen(filename, "r")) == NULL) {
RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n",
__func__, filename);
return -1;
}
if (fgets(buf, sizeof(buf), f) == NULL) {
RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n",
__func__, filename);
fclose(f);
return -1;
}
*val = strtoul(buf, &end, 0);
if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) {
RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n",
__func__, filename);
fclose(f);
return -1;
}
fclose(f);
return 0;
}
/* create memory configuration in shared/mmap memory. Take out
* a write lock on the memsegs, so we can auto-detect primary/secondary.
* This means we never close the file while running (auto-close on exit).
* We also don't lock the whole file, so that in future we can use read-locks
* on other parts, e.g. memzones, to detect if there are running secondary
* processes. */
static int
rte_eal_config_create(void)
{
void *rte_mem_cfg_addr;
int retval;
const char *pathname = eal_runtime_config_path();
if (internal_config.no_shconf)
return 0;
if (mem_cfg_fd < 0){
mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0600);
if (mem_cfg_fd < 0) {
RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n",
pathname);
return -1;
}
}
retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
if (retval < 0){
close(mem_cfg_fd);
mem_cfg_fd = -1;
RTE_LOG(ERR, EAL, "Cannot resize '%s' for rte_mem_config\n",
pathname);
return -1;
}
retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
if (retval < 0){
close(mem_cfg_fd);
mem_cfg_fd = -1;
RTE_LOG(ERR, EAL, "Cannot create lock on '%s'. Is another primary "
"process running?\n", pathname);
return -1;
}
rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
if (rte_mem_cfg_addr == MAP_FAILED){
RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config\n");
close(mem_cfg_fd);
mem_cfg_fd = -1;
return -1;
}
memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
rte_config.mem_config = rte_mem_cfg_addr;
/* store address of the config in the config itself so that secondary
* processes could later map the config into this exact location
*/
rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
return 0;
}
/* attach to an existing shared memory config */
static int
rte_eal_config_attach(void)
{
void *rte_mem_cfg_addr;
const char *pathname = eal_runtime_config_path();
if (internal_config.no_shconf)
return 0;
if (mem_cfg_fd < 0){
mem_cfg_fd = open(pathname, O_RDWR);
if (mem_cfg_fd < 0) {
RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n",
pathname);
return -1;
}
}
rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
PROT_READ, MAP_SHARED, mem_cfg_fd, 0);
/* don't close the fd here, it will be closed on reattach */
if (rte_mem_cfg_addr == MAP_FAILED) {
close(mem_cfg_fd);
mem_cfg_fd = -1;
RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n",
errno, strerror(errno));
return -1;
}
rte_config.mem_config = rte_mem_cfg_addr;
return 0;
}
/* reattach the shared config at exact memory location primary process has it */
static int
rte_eal_config_reattach(void)
{
struct rte_mem_config *mem_config;
void *rte_mem_cfg_addr;
if (internal_config.no_shconf)
return 0;
/* save the address primary process has mapped shared config to */
rte_mem_cfg_addr =
(void *)(uintptr_t)rte_config.mem_config->mem_cfg_addr;
/* unmap original config */
munmap(rte_config.mem_config, sizeof(struct rte_mem_config));
/* remap the config at proper address */
mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr,
sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED,
mem_cfg_fd, 0);
close(mem_cfg_fd);
mem_cfg_fd = -1;
if (mem_config == MAP_FAILED) {
RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n",
errno, strerror(errno));
return -1;
} else if (mem_config != rte_mem_cfg_addr) {
/* errno is stale, don't use */
RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config at [%p], got [%p]\n",
rte_mem_cfg_addr, mem_config);
munmap(mem_config, sizeof(struct rte_mem_config));
return -1;
}
rte_config.mem_config = mem_config;
return 0;
}
/* Detect if we are a primary or a secondary process */
enum rte_proc_type_t
eal_proc_type_detect(void)
{
enum rte_proc_type_t ptype = RTE_PROC_PRIMARY;
const char *pathname = eal_runtime_config_path();
/* if there no shared config, there can be no secondary processes */
if (!internal_config.no_shconf) {
/* if we can open the file but not get a write-lock we are a
* secondary process. NOTE: if we get a file handle back, we
* keep that open and don't close it to prevent a race condition
* between multiple opens.
*/
if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) &&
(fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0))
ptype = RTE_PROC_SECONDARY;
}
RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n",
ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY");
return ptype;
}
/* Sets up rte_config structure with the pointer to shared memory config.*/
static int
rte_config_init(void)
{
rte_config.process_type = internal_config.process_type;
switch (rte_config.process_type){
case RTE_PROC_PRIMARY:
if (rte_eal_config_create() < 0)
return -1;
eal_mcfg_update_from_internal();
break;
case RTE_PROC_SECONDARY:
if (rte_eal_config_attach() < 0)
return -1;
eal_mcfg_wait_complete();
if (eal_mcfg_check_version() < 0) {
RTE_LOG(ERR, EAL, "Primary and secondary process DPDK version mismatch\n");
return -1;
}
if (rte_eal_config_reattach() < 0)
return -1;
eal_mcfg_update_internal();
break;
case RTE_PROC_AUTO:
case RTE_PROC_INVALID:
RTE_LOG(ERR, EAL, "Invalid process type %d\n",
rte_config.process_type);
return -1;
}
return 0;
}
/* display usage */
static void
eal_usage(const char *prgname)
{
printf("\nUsage: %s ", prgname);
eal_common_usage();
/* Allow the application to print its usage message too if hook is set */
if ( rte_application_usage_hook ) {
printf("===== Application Usage =====\n\n");
rte_application_usage_hook(prgname);
}
}
/* Set a per-application usage message */
rte_usage_hook_t
rte_set_application_usage_hook( rte_usage_hook_t usage_func )
{
rte_usage_hook_t old_func;
/* Will be NULL on the first call to denote the last usage routine. */
old_func = rte_application_usage_hook;
rte_application_usage_hook = usage_func;
return old_func;
}
static inline size_t
eal_get_hugepage_mem_size(void)
{
uint64_t size = 0;
unsigned i, j;
for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
struct hugepage_info *hpi = &internal_config.hugepage_info[i];
if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) {
for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
size += hpi->hugepage_sz * hpi->num_pages[j];
}
}
}
return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
}
/* Parse the arguments for --log-level only */
static void
eal_log_level_parse(int argc, char **argv)
{
int opt;
char **argvopt;
int option_index;
const int old_optind = optind;
const int old_optopt = optopt;
const int old_optreset = optreset;
char * const old_optarg = optarg;
argvopt = argv;
optind = 1;
optreset = 1;
while ((opt = getopt_long(argc, argvopt, eal_short_options,
eal_long_options, &option_index)) != EOF) {
int ret;
/* getopt is not happy, stop right now */
if (opt == '?')
break;
ret = (opt == OPT_LOG_LEVEL_NUM) ?
eal_parse_common_option(opt, optarg, &internal_config) : 0;
/* common parser is not happy */
if (ret < 0)
break;
}
/* restore getopt lib */
optind = old_optind;
optopt = old_optopt;
optreset = old_optreset;
optarg = old_optarg;
}
/* Parse the argument given in the command line of the application */
static int
eal_parse_args(int argc, char **argv)
{
int opt, ret;
char **argvopt;
int option_index;
char *prgname = argv[0];
const int old_optind = optind;
const int old_optopt = optopt;
const int old_optreset = optreset;
char * const old_optarg = optarg;
argvopt = argv;
optind = 1;
optreset = 1;
opterr = 0;
while ((opt = getopt_long(argc, argvopt, eal_short_options,
eal_long_options, &option_index)) != EOF) {
/*
* getopt didn't recognise the option, lets parse the
* registered options to see if the flag is valid
*/
if (opt == '?') {
ret = rte_option_parse(argv[optind-1]);
if (ret == 0)
continue;
eal_usage(prgname);
ret = -1;
goto out;
}
ret = eal_parse_common_option(opt, optarg, &internal_config);
/* common parser is not happy */
if (ret < 0) {
eal_usage(prgname);
ret = -1;
goto out;
}
/* common parser handled this option */
if (ret == 0)
continue;
switch (opt) {
case OPT_MBUF_POOL_OPS_NAME_NUM:
{
char *ops_name = strdup(optarg);
if (ops_name == NULL)
RTE_LOG(ERR, EAL, "Could not store mbuf pool ops name\n");
else {
/* free old ops name */
if (internal_config.user_mbuf_pool_ops_name !=
NULL)
free(internal_config.user_mbuf_pool_ops_name);
internal_config.user_mbuf_pool_ops_name =
ops_name;
}
break;
}
case 'h':
eal_usage(prgname);
exit(EXIT_SUCCESS);
default:
if (opt < OPT_LONG_MIN_NUM && isprint(opt)) {
RTE_LOG(ERR, EAL, "Option %c is not supported "
"on FreeBSD\n", opt);
} else if (opt >= OPT_LONG_MIN_NUM &&
opt < OPT_LONG_MAX_NUM) {
RTE_LOG(ERR, EAL, "Option %s is not supported "
"on FreeBSD\n",
eal_long_options[option_index].name);
} else {
RTE_LOG(ERR, EAL, "Option %d is not supported "
"on FreeBSD\n", opt);
}
eal_usage(prgname);
ret = -1;
goto out;
}
}
/* create runtime data directory */
if (internal_config.no_shconf == 0 &&
eal_create_runtime_dir() < 0) {
RTE_LOG(ERR, EAL, "Cannot create runtime directory\n");
ret = -1;
goto out;
}
if (eal_adjust_config(&internal_config) != 0) {
ret = -1;
goto out;
}
/* sanity checks */
if (eal_check_common_options(&internal_config) != 0) {
eal_usage(prgname);
ret = -1;
goto out;
}
if (optind >= 0)
argv[optind-1] = prgname;
ret = optind-1;
out:
/* restore getopt lib */
optind = old_optind;
optopt = old_optopt;
optreset = old_optreset;
optarg = old_optarg;
return ret;
}
static int
mem: replace memseg with memseg lists Before, we were aggregating multiple pages into one memseg, so the number of memsegs was small. Now, each page gets its own memseg, so the list of memsegs is huge. To accommodate the new memseg list size and to keep the under-the-hood workings sane, the memseg list is now not just a single list, but multiple lists. To be precise, each hugepage size available on the system gets one or more memseg lists, per socket. In order to support dynamic memory allocation, we reserve all memory in advance (unless we're in 32-bit legacy mode, in which case we do not preallocate memory). As in, we do an anonymous mmap() of the entire maximum size of memory per hugepage size, per socket (which is limited to either RTE_MAX_MEMSEG_PER_TYPE pages or RTE_MAX_MEM_MB_PER_TYPE megabytes worth of memory, whichever is the smaller one), split over multiple lists (which are limited to either RTE_MAX_MEMSEG_PER_LIST memsegs or RTE_MAX_MEM_MB_PER_LIST megabytes per list, whichever is the smaller one). There is also a global limit of CONFIG_RTE_MAX_MEM_MB megabytes, which is mainly used for 32-bit targets to limit amounts of preallocated memory, but can be used to place an upper limit on total amount of VA memory that can be allocated by DPDK application. So, for each hugepage size, we get (by default) up to 128G worth of memory, per socket, split into chunks of up to 32G in size. The address space is claimed at the start, in eal_common_memory.c. The actual page allocation code is in eal_memalloc.c (Linux-only), and largely consists of copied EAL memory init code. Pages in the list are also indexed by address. That is, in order to figure out where the page belongs, one can simply look at base address for a memseg list. Similarly, figuring out IOVA address of a memzone is a matter of finding the right memseg list, getting offset and dividing by page size to get the appropriate memseg. This commit also removes rte_eal_dump_physmem_layout() call, according to deprecation notice [1], and removes that deprecation notice as well. On 32-bit targets due to limited VA space, DPDK will no longer spread memory to different sockets like before. Instead, it will (by default) allocate all of the memory on socket where master lcore is. To override this behavior, --socket-mem must be used. The rest of the changes are really ripple effects from the memseg change - heap changes, compile fixes, and rewrites to support fbarray-backed memseg lists. Due to earlier switch to _walk() functions, most of the changes are simple fixes, however some of the _walk() calls were switched to memseg list walk, where it made sense to do so. Additionally, we are also switching locks from flock() to fcntl(). Down the line, we will be introducing single-file segments option, and we cannot use flock() locks to lock parts of the file. Therefore, we will use fcntl() locks for legacy mem as well, in case someone is unfortunate enough to accidentally start legacy mem primary process alongside an already working non-legacy mem-based primary process. [1] http://dpdk.org/dev/patchwork/patch/34002/ Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com> Tested-by: Santosh Shukla <santosh.shukla@caviumnetworks.com> Tested-by: Hemant Agrawal <hemant.agrawal@nxp.com> Tested-by: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
2018-04-11 13:30:24 +01:00
check_socket(const struct rte_memseg_list *msl, void *arg)
{
int *socket_id = arg;
if (msl->external)
return 0;
mem: replace memseg with memseg lists Before, we were aggregating multiple pages into one memseg, so the number of memsegs was small. Now, each page gets its own memseg, so the list of memsegs is huge. To accommodate the new memseg list size and to keep the under-the-hood workings sane, the memseg list is now not just a single list, but multiple lists. To be precise, each hugepage size available on the system gets one or more memseg lists, per socket. In order to support dynamic memory allocation, we reserve all memory in advance (unless we're in 32-bit legacy mode, in which case we do not preallocate memory). As in, we do an anonymous mmap() of the entire maximum size of memory per hugepage size, per socket (which is limited to either RTE_MAX_MEMSEG_PER_TYPE pages or RTE_MAX_MEM_MB_PER_TYPE megabytes worth of memory, whichever is the smaller one), split over multiple lists (which are limited to either RTE_MAX_MEMSEG_PER_LIST memsegs or RTE_MAX_MEM_MB_PER_LIST megabytes per list, whichever is the smaller one). There is also a global limit of CONFIG_RTE_MAX_MEM_MB megabytes, which is mainly used for 32-bit targets to limit amounts of preallocated memory, but can be used to place an upper limit on total amount of VA memory that can be allocated by DPDK application. So, for each hugepage size, we get (by default) up to 128G worth of memory, per socket, split into chunks of up to 32G in size. The address space is claimed at the start, in eal_common_memory.c. The actual page allocation code is in eal_memalloc.c (Linux-only), and largely consists of copied EAL memory init code. Pages in the list are also indexed by address. That is, in order to figure out where the page belongs, one can simply look at base address for a memseg list. Similarly, figuring out IOVA address of a memzone is a matter of finding the right memseg list, getting offset and dividing by page size to get the appropriate memseg. This commit also removes rte_eal_dump_physmem_layout() call, according to deprecation notice [1], and removes that deprecation notice as well. On 32-bit targets due to limited VA space, DPDK will no longer spread memory to different sockets like before. Instead, it will (by default) allocate all of the memory on socket where master lcore is. To override this behavior, --socket-mem must be used. The rest of the changes are really ripple effects from the memseg change - heap changes, compile fixes, and rewrites to support fbarray-backed memseg lists. Due to earlier switch to _walk() functions, most of the changes are simple fixes, however some of the _walk() calls were switched to memseg list walk, where it made sense to do so. Additionally, we are also switching locks from flock() to fcntl(). Down the line, we will be introducing single-file segments option, and we cannot use flock() locks to lock parts of the file. Therefore, we will use fcntl() locks for legacy mem as well, in case someone is unfortunate enough to accidentally start legacy mem primary process alongside an already working non-legacy mem-based primary process. [1] http://dpdk.org/dev/patchwork/patch/34002/ Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com> Tested-by: Santosh Shukla <santosh.shukla@caviumnetworks.com> Tested-by: Hemant Agrawal <hemant.agrawal@nxp.com> Tested-by: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
2018-04-11 13:30:24 +01:00
if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0)
return 1;
return 0;
}
static void
eal_check_mem_on_local_socket(void)
{
int socket_id;
socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);
mem: replace memseg with memseg lists Before, we were aggregating multiple pages into one memseg, so the number of memsegs was small. Now, each page gets its own memseg, so the list of memsegs is huge. To accommodate the new memseg list size and to keep the under-the-hood workings sane, the memseg list is now not just a single list, but multiple lists. To be precise, each hugepage size available on the system gets one or more memseg lists, per socket. In order to support dynamic memory allocation, we reserve all memory in advance (unless we're in 32-bit legacy mode, in which case we do not preallocate memory). As in, we do an anonymous mmap() of the entire maximum size of memory per hugepage size, per socket (which is limited to either RTE_MAX_MEMSEG_PER_TYPE pages or RTE_MAX_MEM_MB_PER_TYPE megabytes worth of memory, whichever is the smaller one), split over multiple lists (which are limited to either RTE_MAX_MEMSEG_PER_LIST memsegs or RTE_MAX_MEM_MB_PER_LIST megabytes per list, whichever is the smaller one). There is also a global limit of CONFIG_RTE_MAX_MEM_MB megabytes, which is mainly used for 32-bit targets to limit amounts of preallocated memory, but can be used to place an upper limit on total amount of VA memory that can be allocated by DPDK application. So, for each hugepage size, we get (by default) up to 128G worth of memory, per socket, split into chunks of up to 32G in size. The address space is claimed at the start, in eal_common_memory.c. The actual page allocation code is in eal_memalloc.c (Linux-only), and largely consists of copied EAL memory init code. Pages in the list are also indexed by address. That is, in order to figure out where the page belongs, one can simply look at base address for a memseg list. Similarly, figuring out IOVA address of a memzone is a matter of finding the right memseg list, getting offset and dividing by page size to get the appropriate memseg. This commit also removes rte_eal_dump_physmem_layout() call, according to deprecation notice [1], and removes that deprecation notice as well. On 32-bit targets due to limited VA space, DPDK will no longer spread memory to different sockets like before. Instead, it will (by default) allocate all of the memory on socket where master lcore is. To override this behavior, --socket-mem must be used. The rest of the changes are really ripple effects from the memseg change - heap changes, compile fixes, and rewrites to support fbarray-backed memseg lists. Due to earlier switch to _walk() functions, most of the changes are simple fixes, however some of the _walk() calls were switched to memseg list walk, where it made sense to do so. Additionally, we are also switching locks from flock() to fcntl(). Down the line, we will be introducing single-file segments option, and we cannot use flock() locks to lock parts of the file. Therefore, we will use fcntl() locks for legacy mem as well, in case someone is unfortunate enough to accidentally start legacy mem primary process alongside an already working non-legacy mem-based primary process. [1] http://dpdk.org/dev/patchwork/patch/34002/ Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com> Tested-by: Santosh Shukla <santosh.shukla@caviumnetworks.com> Tested-by: Hemant Agrawal <hemant.agrawal@nxp.com> Tested-by: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
2018-04-11 13:30:24 +01:00
if (rte_memseg_list_walk(check_socket, &socket_id) == 0)
RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n");
}
mem: replace memseg with memseg lists Before, we were aggregating multiple pages into one memseg, so the number of memsegs was small. Now, each page gets its own memseg, so the list of memsegs is huge. To accommodate the new memseg list size and to keep the under-the-hood workings sane, the memseg list is now not just a single list, but multiple lists. To be precise, each hugepage size available on the system gets one or more memseg lists, per socket. In order to support dynamic memory allocation, we reserve all memory in advance (unless we're in 32-bit legacy mode, in which case we do not preallocate memory). As in, we do an anonymous mmap() of the entire maximum size of memory per hugepage size, per socket (which is limited to either RTE_MAX_MEMSEG_PER_TYPE pages or RTE_MAX_MEM_MB_PER_TYPE megabytes worth of memory, whichever is the smaller one), split over multiple lists (which are limited to either RTE_MAX_MEMSEG_PER_LIST memsegs or RTE_MAX_MEM_MB_PER_LIST megabytes per list, whichever is the smaller one). There is also a global limit of CONFIG_RTE_MAX_MEM_MB megabytes, which is mainly used for 32-bit targets to limit amounts of preallocated memory, but can be used to place an upper limit on total amount of VA memory that can be allocated by DPDK application. So, for each hugepage size, we get (by default) up to 128G worth of memory, per socket, split into chunks of up to 32G in size. The address space is claimed at the start, in eal_common_memory.c. The actual page allocation code is in eal_memalloc.c (Linux-only), and largely consists of copied EAL memory init code. Pages in the list are also indexed by address. That is, in order to figure out where the page belongs, one can simply look at base address for a memseg list. Similarly, figuring out IOVA address of a memzone is a matter of finding the right memseg list, getting offset and dividing by page size to get the appropriate memseg. This commit also removes rte_eal_dump_physmem_layout() call, according to deprecation notice [1], and removes that deprecation notice as well. On 32-bit targets due to limited VA space, DPDK will no longer spread memory to different sockets like before. Instead, it will (by default) allocate all of the memory on socket where master lcore is. To override this behavior, --socket-mem must be used. The rest of the changes are really ripple effects from the memseg change - heap changes, compile fixes, and rewrites to support fbarray-backed memseg lists. Due to earlier switch to _walk() functions, most of the changes are simple fixes, however some of the _walk() calls were switched to memseg list walk, where it made sense to do so. Additionally, we are also switching locks from flock() to fcntl(). Down the line, we will be introducing single-file segments option, and we cannot use flock() locks to lock parts of the file. Therefore, we will use fcntl() locks for legacy mem as well, in case someone is unfortunate enough to accidentally start legacy mem primary process alongside an already working non-legacy mem-based primary process. [1] http://dpdk.org/dev/patchwork/patch/34002/ Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com> Tested-by: Santosh Shukla <santosh.shukla@caviumnetworks.com> Tested-by: Hemant Agrawal <hemant.agrawal@nxp.com> Tested-by: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
2018-04-11 13:30:24 +01:00
static int
sync_func(__attribute__((unused)) void *arg)
{
return 0;
}
/* return non-zero if hugepages are enabled. */
int rte_eal_has_hugepages(void)
{
return !internal_config.no_hugetlbfs;
}
/* Abstraction for port I/0 privilege */
int
rte_eal_iopl_init(void)
{
static int fd = -1;
if (fd < 0)
fd = open("/dev/io", O_RDWR);
if (fd < 0)
return -1;
eal/bsd: fix virtio on FreeBSD Closing /dev/io fd causes SIGBUS in inb/outb instructions as the process loses the IOPL privileges once the fd is closed: (gdb) bt 0 0x0000000000492f2c in outb (port=49170, data=0 '\000') at /usr/include/machine/cpufunc.h:244 1 0x0000000000492f7a in outb_p (data=0 '\000', port=49170) at /dpdk/dpdk-2.0.0/lib/librte_pmd_virtio/virtio_pci.h:211 2 0x000000000049328d in vtpci_set_status (hw=0x80331f380, status=0 '\000') at /dpdk/dpdk-2.0.0/lib/librte_pmd_virtio/virtio_pci.c:130 3 0x00000000004931fe in vtpci_reset (hw=0x80331f380) at /dpdk/dpdk-2.0.0/lib/librte_pmd_virtio/virtio_pci.c:108 4 0x00000000004a175e in eth_virtio_dev_init (eth_dev=0x831b80 <rte_eth_devices>) at /dpdk/dpdk-2.0.0/lib/librte_pmd_virtio/virtio_ethdev.c:1150 5 0x0000000000462c09 in rte_eth_dev_init (pci_drv=0x79d1a0 <rte_virtio_pmd>, pci_dev=0x802417560) at /dpdk/dpdk-2.0.0/lib/librte_ether/rte_ethdev.c:326 6 0x000000000046f03f in rte_eal_pci_probe_one_driver (dr=0x79d1a0 <rte_virtio_pmd>, dev=0x802417560) at /dpdk/dpdk-2.0.0/lib/librte_eal/bsdapp/eal/eal_pci.c:487 7 0x0000000000475b06 in pci_probe_all_drivers (dev=0x802417560) at /dpdk/dpdk-2.0.0/lib/librte_eal/common/eal_common_pci.c:116 8 0x0000000000475bb9 in rte_eal_pci_probe () at /dpdk/dpdk-2.0.0/lib/librte_eal/common/eal_common_pci.c:246 9 0x000000000046cd63 in rte_eal_init (argc=5, argv=0x7fffffffeaf0) at /dpdk/dpdk-2.0.0/lib/librte_eal/bsdapp/eal/eal.c:554 10 0x0000000000404544 in main () Fixes: 8a312224bcde ("eal/bsd: fix fd leak") Signed-off-by: Raz Amir <razamir22@gmail.com> Reviewed-by: Bruce Richardson <bruce.richardson@intel.com>
2015-04-16 14:52:07 +03:00
/* keep fd open for iopl */
return 0;
}
static void rte_eal_init_alert(const char *msg)
{
fprintf(stderr, "EAL: FATAL: %s\n", msg);
RTE_LOG(ERR, EAL, "%s\n", msg);
}
/* Launch threads, called at application init(). */
int
rte_eal_init(int argc, char **argv)
{
int i, fctret, ret;
pthread_t thread_id;
static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0);
char cpuset[RTE_CPU_AFFINITY_STR_LEN];
char thread_name[RTE_MAX_THREAD_NAME_LEN];
/* checks if the machine is adequate */
if (!rte_cpu_is_supported()) {
rte_eal_init_alert("unsupported cpu type.");
rte_errno = ENOTSUP;
return -1;
}
if (!rte_atomic32_test_and_set(&run_once)) {
rte_eal_init_alert("already called initialization.");
rte_errno = EALREADY;
return -1;
}
thread_id = pthread_self();
eal_reset_internal_config(&internal_config);
/* set log level as early as possible */
eal_log_level_parse(argc, argv);
if (rte_eal_cpu_init() < 0) {
rte_eal_init_alert("Cannot detect lcores.");
rte_errno = ENOTSUP;
return -1;
}
fctret = eal_parse_args(argc, argv);
if (fctret < 0) {
rte_eal_init_alert("Invalid 'command line' arguments.");
rte_errno = EINVAL;
rte_atomic32_clear(&run_once);
return -1;
}
/* FreeBSD always uses legacy memory model */
internal_config.legacy_mem = true;
if (eal_plugins_init() < 0) {
rte_eal_init_alert("Cannot init plugins");
rte_errno = EINVAL;
rte_atomic32_clear(&run_once);
return -1;
}
if (eal_option_device_parse()) {
rte_errno = ENODEV;
rte_atomic32_clear(&run_once);
return -1;
}
if (rte_config_init() < 0) {
rte_eal_init_alert("Cannot init config");
return -1;
}
if (rte_eal_intr_init() < 0) {
rte_eal_init_alert("Cannot init interrupt-handling thread");
return -1;
}
if (rte_eal_alarm_init() < 0) {
rte_eal_init_alert("Cannot init alarm");
/* rte_eal_alarm_init sets rte_errno on failure. */
return -1;
}
/* Put mp channel init before bus scan so that we can init the vdev
* bus through mp channel in the secondary process before the bus scan.
*/
if (rte_mp_channel_init() < 0 && rte_errno != ENOTSUP) {
rte_eal_init_alert("failed to init mp channel");
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
rte_errno = EFAULT;
return -1;
}
}
if (rte_bus_scan()) {
rte_eal_init_alert("Cannot scan the buses for devices");
rte_errno = ENODEV;
rte_atomic32_clear(&run_once);
return -1;
}
/* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */
if (internal_config.iova_mode == RTE_IOVA_DC) {
/* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
enum rte_iova_mode iova_mode = rte_bus_get_iommu_class();
if (iova_mode == RTE_IOVA_DC)
iova_mode = RTE_IOVA_PA;
rte_eal_get_configuration()->iova_mode = iova_mode;
} else {
rte_eal_get_configuration()->iova_mode =
internal_config.iova_mode;
}
RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n",
rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA");
if (internal_config.no_hugetlbfs == 0) {
/* rte_config isn't initialized yet */
ret = internal_config.process_type == RTE_PROC_PRIMARY ?
eal_hugepage_info_init() :
eal_hugepage_info_read();
if (ret < 0) {
rte_eal_init_alert("Cannot get hugepage information.");
rte_errno = EACCES;
rte_atomic32_clear(&run_once);
return -1;
}
}
if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
if (internal_config.no_hugetlbfs)
internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
else
internal_config.memory = eal_get_hugepage_mem_size();
}
if (internal_config.vmware_tsc_map == 1) {
#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT
rte_cycles_vmware_tsc_map = 1;
RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, "
"you must have monitor_control.pseudo_perfctr = TRUE\n");
#else
RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because "
"RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n");
#endif
}
/* in secondary processes, memory init may allocate additional fbarrays
* not present in primary processes, so to avoid any potential issues,
* initialize memzones first.
*/
if (rte_eal_memzone_init() < 0) {
rte_eal_init_alert("Cannot init memzone");
rte_errno = ENODEV;
return -1;
}
if (rte_eal_memory_init() < 0) {
rte_eal_init_alert("Cannot init memory");
rte_errno = ENOMEM;
return -1;
}
if (rte_eal_malloc_heap_init() < 0) {
rte_eal_init_alert("Cannot init malloc heap");
rte_errno = ENODEV;
return -1;
}
if (rte_eal_tailqs_init() < 0) {
rte_eal_init_alert("Cannot init tail queues for objects");
rte_errno = EFAULT;
return -1;
}
if (rte_eal_timer_init() < 0) {
rte_eal_init_alert("Cannot init HPET or TSC timers");
rte_errno = ENOTSUP;
return -1;
}
eal_check_mem_on_local_socket();
eal_thread_init_master(rte_config.master_lcore);
ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%p;cpuset=[%s%s])\n",
rte_config.master_lcore, thread_id, cpuset,
ret == 0 ? "" : "...");
RTE_LCORE_FOREACH_SLAVE(i) {
/*
* create communication pipes between master thread
* and children
*/
if (pipe(lcore_config[i].pipe_master2slave) < 0)
rte_panic("Cannot create pipe\n");
if (pipe(lcore_config[i].pipe_slave2master) < 0)
rte_panic("Cannot create pipe\n");
lcore_config[i].state = WAIT;
/* create a thread for each lcore */
ret = pthread_create(&lcore_config[i].thread_id, NULL,
eal_thread_loop, NULL);
if (ret != 0)
rte_panic("Cannot create thread\n");
/* Set thread_name for aid in debugging. */
snprintf(thread_name, sizeof(thread_name),
"lcore-slave-%d", i);
rte_thread_setname(lcore_config[i].thread_id, thread_name);
}
/*
* Launch a dummy function on all slave lcores, so that master lcore
* knows they are all ready when this function returns.
*/
rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
rte_eal_mp_wait_lcore();
/* initialize services so vdevs register service during bus_probe. */
ret = rte_service_init();
if (ret) {
rte_eal_init_alert("rte_service_init() failed");
rte_errno = ENOEXEC;
return -1;
}
/* Probe all the buses and devices/drivers on them */
if (rte_bus_probe()) {
rte_eal_init_alert("Cannot probe devices");
rte_errno = ENOTSUP;
return -1;
}
/* initialize default service/lcore mappings and start running. Ignore
* -ENOTSUP, as it indicates no service coremask passed to EAL.
*/
ret = rte_service_start_with_defaults();
if (ret < 0 && ret != -ENOTSUP) {
rte_errno = ENOEXEC;
return -1;
}
/*
* Clean up unused files in runtime directory. We do this at the end of
* init and not at the beginning because we want to clean stuff up
* whether we are primary or secondary process, but we cannot remove
* primary process' files because secondary should be able to run even
* if primary process is dead.
*
* In no_shconf mode, no runtime directory is created in the first
* place, so no cleanup needed.
*/
if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) {
rte_eal_init_alert("Cannot clear runtime directory\n");
return -1;
}
eal_mcfg_complete();
/* Call each registered callback, if enabled */
rte_option_init();
return fctret;
}
int
rte_eal_cleanup(void)
{
rte_service_finalize();
rte_mp_channel_cleanup();
eal_cleanup_config(&internal_config);
return 0;
}
/* get core role */
enum rte_lcore_role_t
rte_eal_lcore_role(unsigned lcore_id)
{
return rte_config.lcore_role[lcore_id];
}
enum rte_proc_type_t
rte_eal_process_type(void)
{
return rte_config.process_type;
}
int rte_eal_has_pci(void)
{
return !internal_config.no_pci;
}
int rte_eal_create_uio_dev(void)
{
return internal_config.create_uio_dev;
}
enum rte_intr_mode
rte_eal_vfio_intr_mode(void)
{
return RTE_INTR_MODE_NONE;
}
int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
__rte_unused const char *dev_addr,
__rte_unused int *vfio_dev_fd,
__rte_unused struct vfio_device_info *device_info)
{
return -1;
}
int rte_vfio_release_device(__rte_unused const char *sysfs_base,
__rte_unused const char *dev_addr,
__rte_unused int fd)
{
return -1;
}
int rte_vfio_enable(__rte_unused const char *modname)
{
return -1;
}
int rte_vfio_is_enabled(__rte_unused const char *modname)
{
return 0;
}
int rte_vfio_noiommu_is_enabled(void)
{
return 0;
}
int rte_vfio_clear_group(__rte_unused int vfio_group_fd)
{
return 0;
}
int
rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova,
__rte_unused uint64_t len)
{
return -1;
}
int
rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
__rte_unused uint64_t len)
{
return -1;
}
int
rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
__rte_unused const char *dev_addr,
__rte_unused int *iommu_group_num)
{
return -1;
}
int
rte_vfio_get_container_fd(void)
{
return -1;
}
int
rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
{
return -1;
}
int
rte_vfio_container_create(void)
{
return -1;
}
int
rte_vfio_container_destroy(__rte_unused int container_fd)
{
return -1;
}
int
rte_vfio_container_group_bind(__rte_unused int container_fd,
__rte_unused int iommu_group_num)
{
return -1;
}
int
rte_vfio_container_group_unbind(__rte_unused int container_fd,
__rte_unused int iommu_group_num)
{
return -1;
}
int
rte_vfio_container_dma_map(__rte_unused int container_fd,
__rte_unused uint64_t vaddr,
__rte_unused uint64_t iova,
__rte_unused uint64_t len)
{
return -1;
}
int
rte_vfio_container_dma_unmap(__rte_unused int container_fd,
__rte_unused uint64_t vaddr,
__rte_unused uint64_t iova,
__rte_unused uint64_t len)
{
return -1;
}