2018-01-29 14:11:25 +01:00
|
|
|
/* SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
* Copyright(c) 2010-2018 Intel Corporation.
|
|
|
|
* Copyright(c) 2014 6WIND S.A.
|
2014-02-10 11:49:10 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <stdint.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <stdarg.h>
|
|
|
|
#include <unistd.h>
|
|
|
|
#include <pthread.h>
|
|
|
|
#include <syslog.h>
|
|
|
|
#include <getopt.h>
|
|
|
|
#include <sys/file.h>
|
|
|
|
#include <stddef.h>
|
|
|
|
#include <errno.h>
|
|
|
|
#include <limits.h>
|
|
|
|
#include <sys/mman.h>
|
|
|
|
#include <sys/queue.h>
|
|
|
|
|
2018-01-21 20:48:06 -05:00
|
|
|
#include <rte_compat.h>
|
2014-02-10 11:49:10 +00:00
|
|
|
#include <rte_common.h>
|
|
|
|
#include <rte_debug.h>
|
|
|
|
#include <rte_memory.h>
|
|
|
|
#include <rte_launch.h>
|
|
|
|
#include <rte_eal.h>
|
|
|
|
#include <rte_eal_memconfig.h>
|
2017-03-22 16:19:27 -04:00
|
|
|
#include <rte_errno.h>
|
2014-02-10 11:49:10 +00:00
|
|
|
#include <rte_per_lcore.h>
|
|
|
|
#include <rte_lcore.h>
|
2017-07-11 15:19:28 +01:00
|
|
|
#include <rte_service_component.h>
|
2014-02-10 11:49:10 +00:00
|
|
|
#include <rte_log.h>
|
|
|
|
#include <rte_random.h>
|
|
|
|
#include <rte_cycles.h>
|
|
|
|
#include <rte_string_fns.h>
|
|
|
|
#include <rte_cpuflags.h>
|
|
|
|
#include <rte_interrupts.h>
|
2017-01-19 14:21:35 +00:00
|
|
|
#include <rte_bus.h>
|
2014-07-02 11:14:03 +02:00
|
|
|
#include <rte_dev.h>
|
2014-04-25 13:59:41 +02:00
|
|
|
#include <rte_devargs.h>
|
2014-02-10 11:49:10 +00:00
|
|
|
#include <rte_version.h>
|
|
|
|
#include <rte_atomic.h>
|
|
|
|
#include <malloc_heap.h>
|
|
|
|
|
|
|
|
#include "eal_private.h"
|
|
|
|
#include "eal_thread.h"
|
|
|
|
#include "eal_internal_cfg.h"
|
|
|
|
#include "eal_filesystem.h"
|
|
|
|
#include "eal_hugepages.h"
|
2014-09-22 10:37:59 +02:00
|
|
|
#include "eal_options.h"
|
2014-02-10 11:49:10 +00:00
|
|
|
|
|
|
|
/* Amount of memory to reserve when hugepages are disabled (--no-huge):
 * 64 MB. */
#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL)

/* Allow the application to print its usage message too if set */
static rte_usage_hook_t rte_application_usage_hook = NULL;

/* early configuration structure, when memory config is not mmapped */
static struct rte_mem_config early_mem_config;

/* define fd variable here, because file needs to be kept open for the
 * duration of the program, as we hold a write lock on it in the primary proc */
static int mem_cfg_fd = -1;

/* fcntl() write-lock descriptor covering the memseg region of the shared
 * mem config; holding this lock is what marks a process as primary (see
 * eal_proc_type_detect() and rte_eal_config_create()). */
static struct flock wr_lock = {
	.l_type = F_WRLCK,
	.l_whence = SEEK_SET,
	.l_start = offsetof(struct rte_mem_config, memsegs),
	.l_len = sizeof(early_mem_config.memsegs),
};

/* Address of global and public configuration */
static struct rte_config rte_config = {
	.mem_config = &early_mem_config,
};

/* internal configuration (per-core) */
struct lcore_config lcore_config[RTE_MAX_LCORE];

/* internal configuration */
struct internal_config internal_config;

/* used by rte_rdtsc() */
int rte_cycles_vmware_tsc_map;
|
|
|
|
|
2018-02-02 13:33:01 +05:30
|
|
|
/* Return user provided mbuf pool ops name */
|
|
|
|
const char * __rte_experimental
|
|
|
|
rte_eal_mbuf_user_pool_ops(void)
|
|
|
|
{
|
|
|
|
return internal_config.user_mbuf_pool_ops_name;
|
|
|
|
}
|
|
|
|
|
2017-10-06 13:15:29 +05:30
|
|
|
/* Return mbuf pool ops name */
|
|
|
|
const char *
|
|
|
|
rte_eal_mbuf_default_mempool_ops(void)
|
|
|
|
{
|
2018-01-30 21:27:47 +05:30
|
|
|
if (internal_config.user_mbuf_pool_ops_name == NULL)
|
|
|
|
return RTE_MBUF_DEFAULT_MEMPOOL_OPS;
|
|
|
|
|
2018-01-29 13:40:43 +05:30
|
|
|
return internal_config.user_mbuf_pool_ops_name;
|
2017-10-06 13:15:29 +05:30
|
|
|
}
|
|
|
|
|
2014-02-10 11:49:10 +00:00
|
|
|
/* Return a pointer to the configuration structure */
struct rte_config *
rte_eal_get_configuration(void)
{
	/* NOTE: mem_config points at the static early_mem_config until
	 * rte_config_init() remaps it into shared memory */
	return &rte_config;
}
|
|
|
|
|
2017-10-06 16:33:42 +05:30
|
|
|
enum rte_iova_mode
|
|
|
|
rte_eal_iova_mode(void)
|
|
|
|
{
|
|
|
|
return rte_eal_get_configuration()->iova_mode;
|
|
|
|
}
|
|
|
|
|
2014-02-10 11:49:10 +00:00
|
|
|
/* parse a sysfs (or other) file containing one integer value */
|
|
|
|
int
|
|
|
|
eal_parse_sysfs_value(const char *filename, unsigned long *val)
|
|
|
|
{
|
|
|
|
FILE *f;
|
|
|
|
char buf[BUFSIZ];
|
|
|
|
char *end = NULL;
|
|
|
|
|
|
|
|
if ((f = fopen(filename, "r")) == NULL) {
|
|
|
|
RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n",
|
|
|
|
__func__, filename);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (fgets(buf, sizeof(buf), f) == NULL) {
|
|
|
|
RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n",
|
|
|
|
__func__, filename);
|
|
|
|
fclose(f);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
*val = strtoul(buf, &end, 0);
|
|
|
|
if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) {
|
|
|
|
RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n",
|
|
|
|
__func__, filename);
|
|
|
|
fclose(f);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
fclose(f);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* create memory configuration in shared/mmap memory. Take out
|
|
|
|
* a write lock on the memsegs, so we can auto-detect primary/secondary.
|
|
|
|
* This means we never close the file while running (auto-close on exit).
|
|
|
|
* We also don't lock the whole file, so that in future we can use read-locks
|
|
|
|
* on other parts, e.g. memzones, to detect if there are running secondary
|
|
|
|
* processes. */
|
|
|
|
static void
|
|
|
|
rte_eal_config_create(void)
|
|
|
|
{
|
|
|
|
void *rte_mem_cfg_addr;
|
|
|
|
int retval;
|
|
|
|
|
|
|
|
const char *pathname = eal_runtime_config_path();
|
|
|
|
|
|
|
|
if (internal_config.no_shconf)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (mem_cfg_fd < 0){
|
|
|
|
mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660);
|
|
|
|
if (mem_cfg_fd < 0)
|
|
|
|
rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
|
|
|
|
}
|
|
|
|
|
|
|
|
retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
|
|
|
|
if (retval < 0){
|
|
|
|
close(mem_cfg_fd);
|
|
|
|
rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname);
|
|
|
|
}
|
|
|
|
|
|
|
|
retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
|
|
|
|
if (retval < 0){
|
|
|
|
close(mem_cfg_fd);
|
|
|
|
rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary "
|
|
|
|
"process running?\n", pathname);
|
|
|
|
}
|
|
|
|
|
|
|
|
rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
|
|
|
|
PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
|
|
|
|
|
|
|
|
if (rte_mem_cfg_addr == MAP_FAILED){
|
|
|
|
rte_panic("Cannot mmap memory for rte_config\n");
|
|
|
|
}
|
|
|
|
memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
|
2017-04-07 13:44:47 -04:00
|
|
|
rte_config.mem_config = rte_mem_cfg_addr;
|
2014-02-10 11:49:10 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* attach to an existing shared memory config */
|
|
|
|
static void
|
|
|
|
rte_eal_config_attach(void)
|
|
|
|
{
|
|
|
|
void *rte_mem_cfg_addr;
|
|
|
|
const char *pathname = eal_runtime_config_path();
|
|
|
|
|
|
|
|
if (internal_config.no_shconf)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (mem_cfg_fd < 0){
|
|
|
|
mem_cfg_fd = open(pathname, O_RDWR);
|
|
|
|
if (mem_cfg_fd < 0)
|
|
|
|
rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
|
|
|
|
}
|
|
|
|
|
2014-06-04 00:42:50 +01:00
|
|
|
rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
|
2014-02-10 11:49:10 +00:00
|
|
|
PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
|
|
|
|
close(mem_cfg_fd);
|
|
|
|
if (rte_mem_cfg_addr == MAP_FAILED)
|
|
|
|
rte_panic("Cannot mmap memory for rte_config\n");
|
|
|
|
|
2017-04-07 13:44:47 -04:00
|
|
|
rte_config.mem_config = rte_mem_cfg_addr;
|
2014-02-10 11:49:10 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Detect if we are a primary or a secondary process */
|
2014-11-20 22:57:22 +01:00
|
|
|
enum rte_proc_type_t
|
2014-02-10 11:49:10 +00:00
|
|
|
eal_proc_type_detect(void)
|
|
|
|
{
|
|
|
|
enum rte_proc_type_t ptype = RTE_PROC_PRIMARY;
|
|
|
|
const char *pathname = eal_runtime_config_path();
|
|
|
|
|
|
|
|
/* if we can open the file but not get a write-lock we are a secondary
|
|
|
|
* process. NOTE: if we get a file handle back, we keep that open
|
|
|
|
* and don't close it to prevent a race condition between multiple opens */
|
|
|
|
if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) &&
|
|
|
|
(fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0))
|
|
|
|
ptype = RTE_PROC_SECONDARY;
|
|
|
|
|
|
|
|
RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n",
|
|
|
|
ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY");
|
|
|
|
|
|
|
|
return ptype;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Sets up rte_config structure with the pointer to shared memory config.*/
|
|
|
|
static void
|
|
|
|
rte_config_init(void)
|
|
|
|
{
|
2014-11-20 22:57:22 +01:00
|
|
|
rte_config.process_type = internal_config.process_type;
|
2014-02-10 11:49:10 +00:00
|
|
|
|
|
|
|
switch (rte_config.process_type){
|
|
|
|
case RTE_PROC_PRIMARY:
|
|
|
|
rte_eal_config_create();
|
|
|
|
break;
|
|
|
|
case RTE_PROC_SECONDARY:
|
|
|
|
rte_eal_config_attach();
|
|
|
|
rte_eal_mcfg_wait_complete(rte_config.mem_config);
|
|
|
|
break;
|
|
|
|
case RTE_PROC_AUTO:
|
|
|
|
case RTE_PROC_INVALID:
|
|
|
|
rte_panic("Invalid process type\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* display usage */
|
|
|
|
static void
|
|
|
|
eal_usage(const char *prgname)
|
|
|
|
{
|
2014-09-22 10:37:59 +02:00
|
|
|
printf("\nUsage: %s ", prgname);
|
|
|
|
eal_common_usage();
|
2014-02-10 11:49:10 +00:00
|
|
|
/* Allow the application to print its usage message too if hook is set */
|
|
|
|
if ( rte_application_usage_hook ) {
|
|
|
|
printf("===== Application Usage =====\n\n");
|
|
|
|
rte_application_usage_hook(prgname);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Set a per-application usage message */
|
|
|
|
rte_usage_hook_t
|
|
|
|
rte_set_application_usage_hook( rte_usage_hook_t usage_func )
|
|
|
|
{
|
|
|
|
rte_usage_hook_t old_func;
|
|
|
|
|
|
|
|
/* Will be NULL on the first call to denote the last usage routine. */
|
|
|
|
old_func = rte_application_usage_hook;
|
|
|
|
rte_application_usage_hook = usage_func;
|
|
|
|
|
|
|
|
return old_func;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline size_t
|
|
|
|
eal_get_hugepage_mem_size(void)
|
|
|
|
{
|
|
|
|
uint64_t size = 0;
|
|
|
|
unsigned i, j;
|
|
|
|
|
|
|
|
for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
|
|
|
|
struct hugepage_info *hpi = &internal_config.hugepage_info[i];
|
|
|
|
if (hpi->hugedir != NULL) {
|
|
|
|
for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
|
|
|
|
size += hpi->hugepage_sz * hpi->num_pages[j];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
|
|
|
|
}
|
|
|
|
|
2015-06-08 16:55:52 -05:00
|
|
|
/* Parse the arguments for --log-level only */
|
|
|
|
static void
|
|
|
|
eal_log_level_parse(int argc, char **argv)
|
|
|
|
{
|
|
|
|
int opt;
|
|
|
|
char **argvopt;
|
|
|
|
int option_index;
|
2015-10-19 21:13:10 +08:00
|
|
|
const int old_optind = optind;
|
|
|
|
const int old_optopt = optopt;
|
|
|
|
const int old_optreset = optreset;
|
|
|
|
char * const old_optarg = optarg;
|
2015-06-08 16:55:52 -05:00
|
|
|
|
|
|
|
argvopt = argv;
|
2015-10-19 21:13:10 +08:00
|
|
|
optind = 1;
|
|
|
|
optreset = 1;
|
2015-06-08 16:55:52 -05:00
|
|
|
|
|
|
|
while ((opt = getopt_long(argc, argvopt, eal_short_options,
|
|
|
|
eal_long_options, &option_index)) != EOF) {
|
|
|
|
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/* getopt is not happy, stop right now */
|
|
|
|
if (opt == '?')
|
|
|
|
break;
|
|
|
|
|
|
|
|
ret = (opt == OPT_LOG_LEVEL_NUM) ?
|
|
|
|
eal_parse_common_option(opt, optarg, &internal_config) : 0;
|
|
|
|
|
|
|
|
/* common parser is not happy */
|
|
|
|
if (ret < 0)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2015-10-19 21:13:10 +08:00
|
|
|
/* restore getopt lib */
|
|
|
|
optind = old_optind;
|
|
|
|
optopt = old_optopt;
|
|
|
|
optreset = old_optreset;
|
|
|
|
optarg = old_optarg;
|
2015-06-08 16:55:52 -05:00
|
|
|
}
|
|
|
|
|
2014-02-10 11:49:10 +00:00
|
|
|
/* Parse the argument given in the command line of the application.
 *
 * Returns the index of the first non-option argument minus one (so callers
 * can advance argv past the EAL options), or -1 on a parse error. Saves
 * and restores all getopt library state so the caller's own getopt use is
 * unaffected. */
static int
eal_parse_args(int argc, char **argv)
{
	int opt, ret;
	char **argvopt;
	int option_index;
	char *prgname = argv[0];
	/* save getopt state so it can be restored on every exit path */
	const int old_optind = optind;
	const int old_optopt = optopt;
	const int old_optreset = optreset;
	char * const old_optarg = optarg;

	argvopt = argv;
	/* rewind getopt (optreset is the BSD way to restart a scan) */
	optind = 1;
	optreset = 1;

	while ((opt = getopt_long(argc, argvopt, eal_short_options,
				  eal_long_options, &option_index)) != EOF) {

		/* getopt is not happy, stop right now */
		if (opt == '?') {
			eal_usage(prgname);
			ret = -1;
			goto out;
		}

		ret = eal_parse_common_option(opt, optarg, &internal_config);
		/* common parser is not happy */
		if (ret < 0) {
			eal_usage(prgname);
			ret = -1;
			goto out;
		}

		/* common parser handled this option */
		if (ret == 0)
			continue;

		/* remaining options are FreeBSD-specific (or unsupported) */
		switch (opt) {
		case OPT_MBUF_POOL_OPS_NAME_NUM:
			internal_config.user_mbuf_pool_ops_name = optarg;
			break;
		case 'h':
			eal_usage(prgname);
			exit(EXIT_SUCCESS);
		default:
			/* distinguish short, long, and unprintable options
			 * in the error message */
			if (opt < OPT_LONG_MIN_NUM && isprint(opt)) {
				RTE_LOG(ERR, EAL, "Option %c is not supported "
					"on FreeBSD\n", opt);
			} else if (opt >= OPT_LONG_MIN_NUM &&
				   opt < OPT_LONG_MAX_NUM) {
				RTE_LOG(ERR, EAL, "Option %s is not supported "
					"on FreeBSD\n",
					eal_long_options[option_index].name);
			} else {
				RTE_LOG(ERR, EAL, "Option %d is not supported "
					"on FreeBSD\n", opt);
			}
			eal_usage(prgname);
			ret = -1;
			goto out;
		}
	}

	if (eal_adjust_config(&internal_config) != 0) {
		ret = -1;
		goto out;
	}

	/* sanity checks */
	if (eal_check_common_options(&internal_config) != 0) {
		eal_usage(prgname);
		ret = -1;
		goto out;
	}

	/* put the program name back where the last consumed option was, so
	 * the application sees a conventional argv after the EAL options */
	if (optind >= 0)
		argv[optind-1] = prgname;
	ret = optind-1;

out:
	/* restore getopt lib */
	optind = old_optind;
	optopt = old_optopt;
	optreset = old_optreset;
	optarg = old_optarg;

	return ret;
}
|
|
|
|
|
2018-04-11 13:30:04 +01:00
|
|
|
static int
|
mem: replace memseg with memseg lists
Before, we were aggregating multiple pages into one memseg, so the
number of memsegs was small. Now, each page gets its own memseg,
so the list of memsegs is huge. To accommodate the new memseg list
size and to keep the under-the-hood workings sane, the memseg list
is now not just a single list, but multiple lists. To be precise,
each hugepage size available on the system gets one or more memseg
lists, per socket.
In order to support dynamic memory allocation, we reserve all
memory in advance (unless we're in 32-bit legacy mode, in which
case we do not preallocate memory). As in, we do an anonymous
mmap() of the entire maximum size of memory per hugepage size, per
socket (which is limited to either RTE_MAX_MEMSEG_PER_TYPE pages or
RTE_MAX_MEM_MB_PER_TYPE megabytes worth of memory, whichever is the
smaller one), split over multiple lists (which are limited to
either RTE_MAX_MEMSEG_PER_LIST memsegs or RTE_MAX_MEM_MB_PER_LIST
megabytes per list, whichever is the smaller one). There is also
a global limit of CONFIG_RTE_MAX_MEM_MB megabytes, which is mainly
used for 32-bit targets to limit amounts of preallocated memory,
but can be used to place an upper limit on total amount of VA
memory that can be allocated by DPDK application.
So, for each hugepage size, we get (by default) up to 128G worth
of memory, per socket, split into chunks of up to 32G in size.
The address space is claimed at the start, in eal_common_memory.c.
The actual page allocation code is in eal_memalloc.c (Linux-only),
and largely consists of copied EAL memory init code.
Pages in the list are also indexed by address. That is, in order
to figure out where the page belongs, one can simply look at base
address for a memseg list. Similarly, figuring out IOVA address
of a memzone is a matter of finding the right memseg list, getting
offset and dividing by page size to get the appropriate memseg.
This commit also removes rte_eal_dump_physmem_layout() call,
according to deprecation notice [1], and removes that deprecation
notice as well.
On 32-bit targets due to limited VA space, DPDK will no longer
spread memory to different sockets like before. Instead, it will
(by default) allocate all of the memory on socket where master
lcore is. To override this behavior, --socket-mem must be used.
The rest of the changes are really ripple effects from the memseg
change - heap changes, compile fixes, and rewrites to support
fbarray-backed memseg lists. Due to earlier switch to _walk()
functions, most of the changes are simple fixes, however some
of the _walk() calls were switched to memseg list walk, where
it made sense to do so.
Additionally, we are also switching locks from flock() to fcntl().
Down the line, we will be introducing single-file segments option,
and we cannot use flock() locks to lock parts of the file. Therefore,
we will use fcntl() locks for legacy mem as well, in case someone is
unfortunate enough to accidentally start legacy mem primary process
alongside an already working non-legacy mem-based primary process.
[1] http://dpdk.org/dev/patchwork/patch/34002/
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Tested-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Tested-by: Hemant Agrawal <hemant.agrawal@nxp.com>
Tested-by: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
2018-04-11 13:30:24 +01:00
|
|
|
check_socket(const struct rte_memseg_list *msl, void *arg)
|
2018-04-11 13:30:04 +01:00
|
|
|
{
|
|
|
|
int *socket_id = arg;
|
|
|
|
|
mem: replace memseg with memseg lists
Before, we were aggregating multiple pages into one memseg, so the
number of memsegs was small. Now, each page gets its own memseg,
so the list of memsegs is huge. To accommodate the new memseg list
size and to keep the under-the-hood workings sane, the memseg list
is now not just a single list, but multiple lists. To be precise,
each hugepage size available on the system gets one or more memseg
lists, per socket.
In order to support dynamic memory allocation, we reserve all
memory in advance (unless we're in 32-bit legacy mode, in which
case we do not preallocate memory). As in, we do an anonymous
mmap() of the entire maximum size of memory per hugepage size, per
socket (which is limited to either RTE_MAX_MEMSEG_PER_TYPE pages or
RTE_MAX_MEM_MB_PER_TYPE megabytes worth of memory, whichever is the
smaller one), split over multiple lists (which are limited to
either RTE_MAX_MEMSEG_PER_LIST memsegs or RTE_MAX_MEM_MB_PER_LIST
megabytes per list, whichever is the smaller one). There is also
a global limit of CONFIG_RTE_MAX_MEM_MB megabytes, which is mainly
used for 32-bit targets to limit amounts of preallocated memory,
but can be used to place an upper limit on total amount of VA
memory that can be allocated by DPDK application.
So, for each hugepage size, we get (by default) up to 128G worth
of memory, per socket, split into chunks of up to 32G in size.
The address space is claimed at the start, in eal_common_memory.c.
The actual page allocation code is in eal_memalloc.c (Linux-only),
and largely consists of copied EAL memory init code.
Pages in the list are also indexed by address. That is, in order
to figure out where the page belongs, one can simply look at base
address for a memseg list. Similarly, figuring out IOVA address
of a memzone is a matter of finding the right memseg list, getting
offset and dividing by page size to get the appropriate memseg.
This commit also removes rte_eal_dump_physmem_layout() call,
according to deprecation notice [1], and removes that deprecation
notice as well.
On 32-bit targets due to limited VA space, DPDK will no longer
spread memory to different sockets like before. Instead, it will
(by default) allocate all of the memory on socket where master
lcore is. To override this behavior, --socket-mem must be used.
The rest of the changes are really ripple effects from the memseg
change - heap changes, compile fixes, and rewrites to support
fbarray-backed memseg lists. Due to earlier switch to _walk()
functions, most of the changes are simple fixes, however some
of the _walk() calls were switched to memseg list walk, where
it made sense to do so.
Additionally, we are also switching locks from flock() to fcntl().
Down the line, we will be introducing single-file segments option,
and we cannot use flock() locks to lock parts of the file. Therefore,
we will use fcntl() locks for legacy mem as well, in case someone is
unfortunate enough to accidentally start legacy mem primary process
alongside an already working non-legacy mem-based primary process.
[1] http://dpdk.org/dev/patchwork/patch/34002/
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Tested-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Tested-by: Hemant Agrawal <hemant.agrawal@nxp.com>
Tested-by: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
2018-04-11 13:30:24 +01:00
|
|
|
if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0)
|
2018-04-11 13:30:04 +01:00
|
|
|
return 1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-02-10 11:49:10 +00:00
|
|
|
static void
|
|
|
|
eal_check_mem_on_local_socket(void)
|
|
|
|
{
|
2018-04-11 13:30:04 +01:00
|
|
|
int socket_id;
|
2014-02-10 11:49:10 +00:00
|
|
|
|
|
|
|
socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);
|
|
|
|
|
mem: replace memseg with memseg lists
Before, we were aggregating multiple pages into one memseg, so the
number of memsegs was small. Now, each page gets its own memseg,
so the list of memsegs is huge. To accommodate the new memseg list
size and to keep the under-the-hood workings sane, the memseg list
is now not just a single list, but multiple lists. To be precise,
each hugepage size available on the system gets one or more memseg
lists, per socket.
In order to support dynamic memory allocation, we reserve all
memory in advance (unless we're in 32-bit legacy mode, in which
case we do not preallocate memory). As in, we do an anonymous
mmap() of the entire maximum size of memory per hugepage size, per
socket (which is limited to either RTE_MAX_MEMSEG_PER_TYPE pages or
RTE_MAX_MEM_MB_PER_TYPE megabytes worth of memory, whichever is the
smaller one), split over multiple lists (which are limited to
either RTE_MAX_MEMSEG_PER_LIST memsegs or RTE_MAX_MEM_MB_PER_LIST
megabytes per list, whichever is the smaller one). There is also
a global limit of CONFIG_RTE_MAX_MEM_MB megabytes, which is mainly
used for 32-bit targets to limit amounts of preallocated memory,
but can be used to place an upper limit on total amount of VA
memory that can be allocated by DPDK application.
So, for each hugepage size, we get (by default) up to 128G worth
of memory, per socket, split into chunks of up to 32G in size.
The address space is claimed at the start, in eal_common_memory.c.
The actual page allocation code is in eal_memalloc.c (Linux-only),
and largely consists of copied EAL memory init code.
Pages in the list are also indexed by address. That is, in order
to figure out where the page belongs, one can simply look at base
address for a memseg list. Similarly, figuring out IOVA address
of a memzone is a matter of finding the right memseg list, getting
offset and dividing by page size to get the appropriate memseg.
This commit also removes rte_eal_dump_physmem_layout() call,
according to deprecation notice [1], and removes that deprecation
notice as well.
On 32-bit targets due to limited VA space, DPDK will no longer
spread memory to different sockets like before. Instead, it will
(by default) allocate all of the memory on socket where master
lcore is. To override this behavior, --socket-mem must be used.
The rest of the changes are really ripple effects from the memseg
change - heap changes, compile fixes, and rewrites to support
fbarray-backed memseg lists. Due to earlier switch to _walk()
functions, most of the changes are simple fixes, however some
of the _walk() calls were switched to memseg list walk, where
it made sense to do so.
Additionally, we are also switching locks from flock() to fcntl().
Down the line, we will be introducing single-file segments option,
and we cannot use flock() locks to lock parts of the file. Therefore,
we will use fcntl() locks for legacy mem as well, in case someone is
unfortunate enough to accidentally start legacy mem primary process
alongside an already working non-legacy mem-based primary process.
[1] http://dpdk.org/dev/patchwork/patch/34002/
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Tested-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Tested-by: Hemant Agrawal <hemant.agrawal@nxp.com>
Tested-by: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
2018-04-11 13:30:24 +01:00
|
|
|
if (rte_memseg_list_walk(check_socket, &socket_id) == 0)
|
2018-04-11 13:30:04 +01:00
|
|
|
RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n");
|
2014-02-10 11:49:10 +00:00
|
|
|
}
|
|
|
|
|
mem: replace memseg with memseg lists
Before, we were aggregating multiple pages into one memseg, so the
number of memsegs was small. Now, each page gets its own memseg,
so the list of memsegs is huge. To accommodate the new memseg list
size and to keep the under-the-hood workings sane, the memseg list
is now not just a single list, but multiple lists. To be precise,
each hugepage size available on the system gets one or more memseg
lists, per socket.
In order to support dynamic memory allocation, we reserve all
memory in advance (unless we're in 32-bit legacy mode, in which
case we do not preallocate memory). As in, we do an anonymous
mmap() of the entire maximum size of memory per hugepage size, per
socket (which is limited to either RTE_MAX_MEMSEG_PER_TYPE pages or
RTE_MAX_MEM_MB_PER_TYPE megabytes worth of memory, whichever is the
smaller one), split over multiple lists (which are limited to
either RTE_MAX_MEMSEG_PER_LIST memsegs or RTE_MAX_MEM_MB_PER_LIST
megabytes per list, whichever is the smaller one). There is also
a global limit of CONFIG_RTE_MAX_MEM_MB megabytes, which is mainly
used for 32-bit targets to limit amounts of preallocated memory,
but can be used to place an upper limit on total amount of VA
memory that can be allocated by DPDK application.
So, for each hugepage size, we get (by default) up to 128G worth
of memory, per socket, split into chunks of up to 32G in size.
The address space is claimed at the start, in eal_common_memory.c.
The actual page allocation code is in eal_memalloc.c (Linux-only),
and largely consists of copied EAL memory init code.
Pages in the list are also indexed by address. That is, in order
to figure out where the page belongs, one can simply look at base
address for a memseg list. Similarly, figuring out IOVA address
of a memzone is a matter of finding the right memseg list, getting
offset and dividing by page size to get the appropriate memseg.
This commit also removes rte_eal_dump_physmem_layout() call,
according to deprecation notice [1], and removes that deprecation
notice as well.
On 32-bit targets due to limited VA space, DPDK will no longer
spread memory to different sockets like before. Instead, it will
(by default) allocate all of the memory on socket where master
lcore is. To override this behavior, --socket-mem must be used.
The rest of the changes are really ripple effects from the memseg
change - heap changes, compile fixes, and rewrites to support
fbarray-backed memseg lists. Due to earlier switch to _walk()
functions, most of the changes are simple fixes, however some
of the _walk() calls were switched to memseg list walk, where
it made sense to do so.
Additionally, we are also switching locks from flock() to fcntl().
Down the line, we will be introducing single-file segments option,
and we cannot use flock() locks to lock parts of the file. Therefore,
we will use fcntl() locks for legacy mem as well, in case someone is
unfortunate enough to accidentally start legacy mem primary process
alongside an already working non-legacy mem-based primary process.
[1] http://dpdk.org/dev/patchwork/patch/34002/
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Tested-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Tested-by: Hemant Agrawal <hemant.agrawal@nxp.com>
Tested-by: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
2018-04-11 13:30:24 +01:00
|
|
|
|
2014-02-10 11:49:10 +00:00
|
|
|
/*
 * Trivial payload launched on every slave lcore during initialization;
 * the master waits for its completion to learn that all slave threads
 * are up and running.  Always succeeds.
 */
static int
sync_func(__attribute__((unused)) void *arg)
{
	return 0;
}
|
|
|
|
|
2014-06-04 00:42:50 +01:00
|
|
|
inline static void
|
2014-02-10 11:49:10 +00:00
|
|
|
rte_eal_mcfg_complete(void)
|
|
|
|
{
|
|
|
|
/* ALL shared mem_config related INIT DONE */
|
|
|
|
if (rte_config.process_type == RTE_PROC_PRIMARY)
|
|
|
|
rte_config.mem_config->magic = RTE_MAGIC;
|
|
|
|
}
|
|
|
|
|
2014-04-25 13:59:47 +02:00
|
|
|
/* return non-zero if hugepages are enabled. */
|
|
|
|
int rte_eal_has_hugepages(void)
|
|
|
|
{
|
|
|
|
return !internal_config.no_hugetlbfs;
|
|
|
|
}
|
|
|
|
|
2014-05-02 16:42:52 -07:00
|
|
|
/* Abstraction for port I/O privilege */
/*
 * On FreeBSD, keeping a descriptor to /dev/io open grants the process
 * I/O port access; the descriptor is deliberately never closed.
 *
 * Fix: the previous code unconditionally re-opened /dev/io on every
 * call, leaking the descriptor from any earlier call.  Initialize the
 * static fd to -1 and reuse it once opened.
 *
 * Returns 0 on success, -1 if /dev/io cannot be opened.
 */
int
rte_eal_iopl_init(void)
{
	static int fd = -1;

	if (fd < 0)
		fd = open("/dev/io", O_RDWR);
	if (fd < 0)
		return -1;
	/* keep fd open for iopl */
	return 0;
}
|
|
|
|
|
2017-03-22 16:19:27 -04:00
|
|
|
/*
 * Report a fatal initialization error both on stderr and through the
 * EAL logger, so the message is visible regardless of log routing.
 */
static void rte_eal_init_alert(const char *msg)
{
	fprintf(stderr, "EAL: FATAL: %s\n", msg);
	RTE_LOG(ERR, EAL, "%s\n", msg);
}
|
|
|
|
|
2014-02-10 11:49:10 +00:00
|
|
|
/* Launch threads, called at application init(). */
/*
 * Initialize the FreeBSD EAL: validate the CPU, parse EAL command-line
 * arguments, set up the shared config, hugepage/memzone/heap/tailq
 * state, alarms, interrupts and timers, spawn one worker thread per
 * slave lcore, then probe all buses and start services.
 *
 * Returns the value of eal_parse_args() (number of argv entries
 * consumed by EAL) on success, or -1 with rte_errno set on failure.
 *
 * NOTE(review): only the early failure paths clear run_once; failures
 * after rte_config_init() leave run_once set, so a failed init cannot
 * be retried in the same process -- confirm this is intentional.
 */
int
rte_eal_init(int argc, char **argv)
{
	int i, fctret, ret;
	pthread_t thread_id;
	/* guards against repeated/concurrent initialization */
	static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0);
	char cpuset[RTE_CPU_AFFINITY_STR_LEN];
	char thread_name[RTE_MAX_THREAD_NAME_LEN];

	/* checks if the machine is adequate */
	if (!rte_cpu_is_supported()) {
		rte_eal_init_alert("unsupported cpu type.");
		rte_errno = ENOTSUP;
		return -1;
	}

	if (!rte_atomic32_test_and_set(&run_once)) {
		rte_eal_init_alert("already called initialization.");
		rte_errno = EALREADY;
		return -1;
	}

	thread_id = pthread_self();

	eal_reset_internal_config(&internal_config);

	/* set log level as early as possible */
	eal_log_level_parse(argc, argv);

	if (rte_eal_cpu_init() < 0) {
		rte_eal_init_alert("Cannot detect lcores.");
		rte_errno = ENOTSUP;
		return -1;
	}

	/* fctret is the number of consumed args; it is also this
	 * function's eventual return value on success
	 */
	fctret = eal_parse_args(argc, argv);
	if (fctret < 0) {
		rte_eal_init_alert("Invalid 'command line' arguments.");
		rte_errno = EINVAL;
		rte_atomic32_clear(&run_once);
		return -1;
	}

	/* FreeBSD always uses legacy memory model */
	internal_config.legacy_mem = true;

	if (eal_plugins_init() < 0) {
		rte_eal_init_alert("Cannot init plugins\n");
		rte_errno = EINVAL;
		rte_atomic32_clear(&run_once);
		return -1;
	}

	if (eal_option_device_parse()) {
		rte_errno = ENODEV;
		rte_atomic32_clear(&run_once);
		return -1;
	}

	if (rte_bus_scan()) {
		rte_eal_init_alert("Cannot scan the buses for devices\n");
		rte_errno = ENODEV;
		rte_atomic32_clear(&run_once);
		return -1;
	}

	/* autodetect the iova mapping mode (default is iova_pa) */
	rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class();

	if (internal_config.no_hugetlbfs == 0 &&
			eal_hugepage_info_init() < 0) {
		rte_eal_init_alert("Cannot get hugepage information.");
		rte_errno = EACCES;
		rte_atomic32_clear(&run_once);
		return -1;
	}

	/* no explicit memory amount requested: use a fixed fallback size
	 * without hugepages, otherwise everything hugepages provide
	 */
	if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
		if (internal_config.no_hugetlbfs)
			internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
		else
			internal_config.memory = eal_get_hugepage_mem_size();
	}

	if (internal_config.vmware_tsc_map == 1) {
#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT
		rte_cycles_vmware_tsc_map = 1;
		RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, "
				"you must have monitor_control.pseudo_perfctr = TRUE\n");
#else
		RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because "
				"RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n");
#endif
	}

	rte_srand(rte_rdtsc());

	rte_config_init();

	/* mp channel failure is fatal only for the primary process; a
	 * secondary continues without it
	 */
	if (rte_mp_channel_init() < 0) {
		rte_eal_init_alert("failed to init mp channel\n");
		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
			rte_errno = EFAULT;
			return -1;
		}
	}

	/* in secondary processes, memory init may allocate additional fbarrays
	 * not present in primary processes, so to avoid any potential issues,
	 * initialize memzones first.
	 */
	if (rte_eal_memzone_init() < 0) {
		rte_eal_init_alert("Cannot init memzone\n");
		rte_errno = ENODEV;
		return -1;
	}

	if (rte_eal_memory_init() < 0) {
		rte_eal_init_alert("Cannot init memory\n");
		rte_errno = ENOMEM;
		return -1;
	}

	if (rte_eal_malloc_heap_init() < 0) {
		rte_eal_init_alert("Cannot init malloc heap\n");
		rte_errno = ENODEV;
		return -1;
	}

	if (rte_eal_tailqs_init() < 0) {
		rte_eal_init_alert("Cannot init tail queues for objects\n");
		rte_errno = EFAULT;
		return -1;
	}

	if (rte_eal_alarm_init() < 0) {
		rte_eal_init_alert("Cannot init interrupt-handling thread\n");
		/* rte_eal_alarm_init sets rte_errno on failure. */
		return -1;
	}

	if (rte_eal_intr_init() < 0) {
		rte_eal_init_alert("Cannot init interrupt-handling thread\n");
		return -1;
	}

	if (rte_eal_timer_init() < 0) {
		rte_eal_init_alert("Cannot init HPET or TSC timers\n");
		rte_errno = ENOTSUP;
		return -1;
	}

	eal_check_mem_on_local_socket();

	eal_thread_init_master(rte_config.master_lcore);

	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);

	/* NOTE(review): printing pthread_t via %p assumes it is
	 * pointer-sized (true on FreeBSD) -- confirm if ported
	 */
	RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%p;cpuset=[%s%s])\n",
		rte_config.master_lcore, thread_id, cpuset,
		ret == 0 ? "" : "...");

	RTE_LCORE_FOREACH_SLAVE(i) {

		/*
		 * create communication pipes between master thread
		 * and children
		 */
		if (pipe(lcore_config[i].pipe_master2slave) < 0)
			rte_panic("Cannot create pipe\n");
		if (pipe(lcore_config[i].pipe_slave2master) < 0)
			rte_panic("Cannot create pipe\n");

		lcore_config[i].state = WAIT;

		/* create a thread for each lcore */
		ret = pthread_create(&lcore_config[i].thread_id, NULL,
				     eal_thread_loop, NULL);
		if (ret != 0)
			rte_panic("Cannot create thread\n");

		/* Set thread_name for aid in debugging. */
		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
				"lcore-slave-%d", i);
		rte_thread_setname(lcore_config[i].thread_id, thread_name);
	}

	/*
	 * Launch a dummy function on all slave lcores, so that master lcore
	 * knows they are all ready when this function returns.
	 */
	rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
	rte_eal_mp_wait_lcore();

	/* initialize services so vdevs register service during bus_probe. */
	ret = rte_service_init();
	if (ret) {
		rte_eal_init_alert("rte_service_init() failed\n");
		rte_errno = ENOEXEC;
		return -1;
	}

	/* Probe all the buses and devices/drivers on them */
	if (rte_bus_probe()) {
		rte_eal_init_alert("Cannot probe devices\n");
		rte_errno = ENOTSUP;
		return -1;
	}

	/* initialize default service/lcore mappings and start running. Ignore
	 * -ENOTSUP, as it indicates no service coremask passed to EAL.
	 */
	ret = rte_service_start_with_defaults();
	if (ret < 0 && ret != -ENOTSUP) {
		rte_errno = ENOEXEC;
		return -1;
	}

	/* write RTE_MAGIC into the shared config to signal init is done */
	rte_eal_mcfg_complete();

	return fctret;
}
|
|
|
|
|
2018-01-21 20:48:06 -05:00
|
|
|
/*
 * Release resources acquired by rte_eal_init().  Currently this only
 * finalizes the service subsystem; always returns 0.
 */
int __rte_experimental
rte_eal_cleanup(void)
{
	rte_service_finalize();
	return 0;
}
|
|
|
|
|
2014-02-10 11:49:10 +00:00
|
|
|
/* get core role */
/*
 * Return the role stored in rte_config for @lcore_id.
 * NOTE(review): lcore_id is not range-checked here -- callers are
 * presumably expected to pass a valid id; confirm.
 */
enum rte_lcore_role_t
rte_eal_lcore_role(unsigned lcore_id)
{
	return rte_config.lcore_role[lcore_id];
}
|
|
|
|
|
|
|
|
/* Return the process type (e.g. primary/secondary) from rte_config. */
enum rte_proc_type_t
rte_eal_process_type(void)
{
	return rte_config.process_type;
}
|
2017-10-26 12:05:51 +02:00
|
|
|
|
|
|
|
int rte_eal_has_pci(void)
|
|
|
|
{
|
|
|
|
return !internal_config.no_pci;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Return the create_uio_dev flag from the internal configuration. */
int rte_eal_create_uio_dev(void)
{
	return internal_config.create_uio_dev;
}
|
|
|
|
|
|
|
|
/* VFIO is not available on this platform: report no interrupt mode. */
enum rte_intr_mode
rte_eal_vfio_intr_mode(void)
{
	return RTE_INTR_MODE_NONE;
}
|
2017-10-26 12:05:52 +02:00
|
|
|
|
|
|
|
/*
 * VFIO is Linux-only.  The forward declaration and prototypes below,
 * together with the stub definitions that follow, keep the public
 * rte_vfio_* symbols present so common code and applications link;
 * every stub either fails (-1) or reports the feature as disabled (0).
 */

/* dummy forward declaration. */
struct vfio_device_info;

/* dummy prototypes. */
int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
		int *vfio_dev_fd, struct vfio_device_info *device_info);
int rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, int fd);
int rte_vfio_enable(const char *modname);
int rte_vfio_is_enabled(const char *modname);
int rte_vfio_noiommu_is_enabled(void);
int rte_vfio_clear_group(int vfio_group_fd);
int rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
int rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);
|
2017-10-26 12:05:52 +02:00
|
|
|
|
2017-11-06 17:08:58 +01:00
|
|
|
/* Stub: VFIO device setup is unsupported here; always fails. */
int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
		      __rte_unused const char *dev_addr,
		      __rte_unused int *vfio_dev_fd,
		      __rte_unused struct vfio_device_info *device_info)
{
	return -1;
}
|
|
|
|
|
2017-11-06 17:08:58 +01:00
|
|
|
/* Stub: VFIO device release is unsupported here; always fails. */
int rte_vfio_release_device(__rte_unused const char *sysfs_base,
			__rte_unused const char *dev_addr,
			__rte_unused int fd)
{
	return -1;
}
|
|
|
|
|
2017-11-06 17:08:58 +01:00
|
|
|
/* Stub: VFIO cannot be enabled here; always fails. */
int rte_vfio_enable(__rte_unused const char *modname)
{
	return -1;
}
|
|
|
|
|
2017-11-06 17:08:58 +01:00
|
|
|
/* Stub: VFIO is never enabled here; always reports 0 (disabled). */
int rte_vfio_is_enabled(__rte_unused const char *modname)
{
	return 0;
}
|
|
|
|
|
2017-11-06 17:08:58 +01:00
|
|
|
/* Stub: no-IOMMU VFIO mode never exists here; always reports 0. */
int rte_vfio_noiommu_is_enabled(void)
{
	return 0;
}
|
2018-01-15 10:41:25 +05:30
|
|
|
|
2018-01-17 19:24:15 +02:00
|
|
|
/* Stub: there are no VFIO groups here; reports success (0). */
int rte_vfio_clear_group(__rte_unused int vfio_group_fd)
{
	return 0;
}
|
2018-04-11 13:30:21 +01:00
|
|
|
|
|
|
|
/* Stub: VFIO DMA mapping is unsupported here; always fails. */
int __rte_experimental
rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova,
		  __rte_unused uint64_t len)
{
	return -1;
}
|
|
|
|
|
|
|
|
/* Stub: VFIO DMA unmapping is unsupported here; always fails. */
int __rte_experimental
rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
		   __rte_unused uint64_t len)
{
	return -1;
}
|