latest dpdk & refactoring

This commit is contained in:
quackerd 2022-06-22 23:40:48 +08:00
parent a716583b19
commit 565dbca278
27 changed files with 1534 additions and 1765 deletions

View File

@ -52,7 +52,11 @@ ForEachMacros:
- ARB_FOREACH_REVERSE
- ARB_FOREACH_REVERSE_FROM
- ARB_FOREACH_REVERSE_SAFE
- BIT_FOREACH_ISCLR
- BIT_FOREACH_ISSET
- CPU_FOREACH
- CPU_FOREACH_ISCLR
- CPU_FOREACH_ISSET
- FOREACH_THREAD_IN_PROC
- FOREACH_PROC_IN_SYSTEM
- FOREACH_PRISON_CHILD

View File

@ -4,68 +4,79 @@ project(khat)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}")
find_package(PkgConfig REQUIRED)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY bin)
pkg_check_modules(HWLOC hwloc REQUIRED)
pkg_check_modules(DPDK libdpdk)
pkg_check_modules(SPDK spdk_event_bdev spdk_env_dpdk)
pkg_check_modules(SPDK_SYS spdk_syslibs)
pkg_check_modules(UUID uuid)
# get_filename_component(ISAL_LIB_PATH ${SPDK_INCLUDE_DIRS} DIRECTORY)
# get_filename_component(ISAL_LIB_PATH ${ISAL_LIB_PATH} DIRECTORY)
# set(ISAL_LIB_PATH ${ISAL_LIB_PATH}/isa-l/.libs)
pkg_check_modules(TOPO bsdtopo)
set(CC_FLAGS -O2 -g -Wall -Wextra -Werror -std=c++11
-Wno-deprecated-declarations
-Wno-address-of-packed-member
-Wno-zero-length-array
-Wno-gnu-zero-variadic-macro-arguments
-msse4
-mavx)
-march=native)
set(C_FLAGS -O2 -g -Wall -Wextra -Werror -std=c17
-Wno-deprecated-declarations
-Wno-address-of-packed-member
-Wno-zero-length-array
-Wno-gnu-zero-variadic-macro-arguments
-march=native)
include_directories(${CMAKE_SOURCE_DIR}/inc)
include_directories()
set(LIBNM_CC_FLAGS -O2 -g -Wall -Wextra -Werror -std=c++11 -mavx -msse4)
set(LIBNTR_C_FLAGS -O2 -g -Wall -Wextra -Werror -std=c11)
set(LIBGEN_CC_FLAGS -O2 -g -Wall -Wextra -Werror -std=c++11)
add_library(ntr STATIC libntr/ntr.c)
add_library(ntr SHARED libntr/ntr.c)
target_compile_options(ntr PRIVATE ${LIBNTR_C_FLAGS})
add_library(gen STATIC libgen/generator.cc)
target_compile_options(gen PRIVATE ${LIBGEN_CC_FLAGS})
add_library(gen SHARED libgen/generator.cc libgen/loadgen.cc)
target_link_libraries(gen PRIVATE pthread ntr ${TOPO_LINK_LIBRARIES} nms)
target_compile_options(gen PRIVATE ${LIBGEN_CC_FLAGS} ${TOPO_CFLAGS})
add_library(nm STATIC libnm/nm.cc libnm/alloc.cc libnm/loadgen.cc libnm/topo.cc)
target_include_directories(nm PRIVATE ${HWLOC_INCLUDE_DIRS})
target_link_libraries(nm PRIVATE gen ${HWLOC_LINK_LIBRARIES})
target_compile_options(nm PRIVATE ${LIBNM_CC_FLAGS} ${HWLOC_CFLAGS})
add_library(netsup SHARED net/libnetsup/dpdk.cc net/libnetsup/portconf.cc)
target_link_libraries(netsup PRIVATE ntr ${DPDK_LINK_LIBRARIES})
target_compile_options(netsup PRIVATE ${LIBGEN_CC_FLAGS} ${DPDK_CFLAGS})
add_library(nms SHARED libnms/alloc.c)
target_link_libraries(nms PRIVATE ${TOPO_LINK_LIBRARIES})
target_compile_options(nms PRIVATE ${TOPO_CFLAGS})
add_executable(khat EXCLUDE_FROM_ALL net/khat.cc)
target_link_libraries(khat PRIVATE pthread nm ntr ${DPDK_LINK_LIBRARIES})
target_compile_options(khat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS})
target_link_libraries(khat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES})
target_compile_options(khat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS})
add_executable(cat EXCLUDE_FROM_ALL net/cat.cc)
target_link_libraries(cat PRIVATE pthread nm ntr gen ${DPDK_LINK_LIBRARIES})
target_compile_options(cat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS})
target_link_libraries(cat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES})
target_compile_options(cat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS})
add_executable(rat EXCLUDE_FROM_ALL net/rat.cc)
target_link_libraries(rat PRIVATE pthread nm ntr gen ${DPDK_LINK_LIBRARIES})
target_compile_options(rat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS})
target_link_libraries(rat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES})
target_compile_options(rat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS})
add_executable(birb EXCLUDE_FROM_ALL storage/birb.cc storage/io_gen.cc storage/drivers/bdev.cc storage/drivers/bdev_thread.cc storage/drivers/nvme.cc storage/drivers/nvme_thread.cc)
target_include_directories(birb PRIVATE ${SPDK_INCLUDE_DIRS} ${DPDK_INCLUDE_DIRS} ${UUID_INCLUDE_DIRS})
target_compile_options(birb PRIVATE ${CC_FLAGS} ${SPDK_CFLAGS} ${UUID_CFLAGS})
target_link_directories(birb PRIVATE ${SPDK_LIBRARY_DIRS} ${SPDK_SYS_STATIC_LIBRARY_DIRS} ${UUID_LIBRARY_DIRS})
target_link_libraries(birb PRIVATE pthread nm ntr gen -Wl,--whole-archive ${SPDK_LIBRARIES} -Wl,--no-whole-archive ${SPDK_SYS_STATIC_LIBRARIES})
target_link_libraries(birb PRIVATE pthread ntr gen -Wl,--whole-archive ${SPDK_LIBRARIES} -Wl,--no-whole-archive ${SPDK_SYS_STATIC_LIBRARIES})
add_executable(birb_posix EXCLUDE_FROM_ALL storage/birb_posix.cc storage/io_gen.cc)
target_compile_options(birb_posix PRIVATE ${CC_FLAGS})
target_link_libraries(birb_posix PRIVATE pthread nm ntr gen)
target_link_libraries(birb_posix PRIVATE pthread ntr gen)
add_executable(memloadgen util/memloadgen.cc)
target_link_libraries(memloadgen PRIVATE pthread nm ntr)
target_compile_options(memloadgen PRIVATE ${CC_FLAGS})
target_link_libraries(memloadgen PRIVATE pthread gen ntr ${TOPO_LINK_LIBRARIES})
target_compile_options(memloadgen PRIVATE ${CC_FLAGS} ${TOPO_CFLAGS})
add_executable(test_ts test/ts.cc)
set_target_properties(test_ts PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test)
target_link_libraries(test_ts PRIVATE nm)
target_compile_options(test_ts PRIVATE ${CC_FLAGS})
add_executable(nms_test tests/nms_test.c)
set_target_properties(nms_test PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tests)
target_link_libraries(nms_test PRIVATE nms)
target_compile_options(nms_test PRIVATE ${C_FLAGS})

View File

@ -1,13 +1,12 @@
#pragma once
#include <sys/cpuset.h>
#include <cstdint>
#include <cstring>
#include <immintrin.h>
#include <ctime>
#include <cstdio>
#include <sys/types.h>
#include <sys/cpuset.h>
#include <sys/_cpuset.h>
#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \
TypeName(const TypeName &) = delete; \

View File

@ -7,10 +7,6 @@
#pragma once
#include <sys/param.h>
#include <netinet/in.h>
#include <assert.h>
#include <inttypes.h>
#include <limits.h>
@ -23,6 +19,10 @@
#include <utility>
#include <vector>
#include <sys/param.h>
#include "defs.hh"
#define D(fmt, ...)
#define DIE(fmt, ...) (void)0;
@ -292,3 +292,44 @@ Generator *createGenerator(std::string str);
Generator *createFacebookKey();
Generator *createFacebookValue();
Generator *createFacebookIA();
// memload generator
class memload_generator {
public:
struct memload_generator_options {
constexpr static size_t ITERATION_MAX = -1;
size_t chunk_size {512 * 1024 * 1024};
size_t iteration {ITERATION_MAX};
int verbose {0};
bool shared_buffer {true};
};
private:
DISALLOW_EVIL_CONSTRUCTORS(memload_generator);
struct thread_info {
pthread_t pthr;
void *from_buffer;
void *to_buffer;
std::atomic<int> *stop;
int tid;
struct memload_generator_options * opts;
// stat keeping
std::atomic<uint32_t> num_trans;
uint64_t begin_ts;
uint64_t end_ts;
};
std::vector<struct thread_info *> thr_infos;
std::atomic<int> end;
struct memload_generator_options opts;
static void *worker_thrd(void *_tinfo);
public:
memload_generator(cpuset_t * threads, cpuset_t * target_domain, struct memload_generator_options * opt, bool *success);
uint64_t get_bps();
bool check_done();
~memload_generator();
};

View File

@ -1,22 +1,63 @@
#pragma once
#include <rte_ethdev.h>
#include <rte_ip.h>
#include <cstdint>
#include "rte_ethdev.h"
#include "rte_ether.h"
#define MAX_NUMA_NODES (64)
struct device_conf {
int portid;
uint16_t tx_ring_sz;
uint16_t rx_ring_sz;
int num_threads;
int mtu;
uint64_t rx_offloads;
uint64_t tx_offloads;
uint64_t rss_hf;
rte_tx_callback_fn tx_fn;
void * tx_user;
rte_rx_callback_fn rx_fn;
void * rx_user;
bool timesync;
};
struct mem_conf {
int num_elements;
int cache_size;
int data_room_size;
int priv_size;
unsigned int max_pools;
};
constexpr static uint16_t MIN_RANDOM_PORT = 1000;
constexpr static uint16_t DEFAULT_RAT_PORT = 1234;
constexpr static unsigned int INIT_DELAY = 1;
constexpr static unsigned int INIT_DELAY = 3;
constexpr static unsigned int MAX_NODES = 64;
static inline void
tx_burst_all(int portid, int txqid, struct rte_mbuf ** tx_bufs, int sz)
{
int remaining = sz;
while(remaining > 0) {
remaining -= rte_eth_tx_burst(
portid, txqid, &tx_bufs[sz - remaining],
remaining);
}
}
void
dpdk_init(struct device_conf *dconf, struct mem_conf *mconf);
void
dpdk_cleanup(struct device_conf *dconf);
struct rte_mempool *
mempool_get(int nodeid);
struct port_conf {
const char * driver_name;
uint64_t rxoffload;
uint64_t txoffload;
uint64_t rss_hf;
bool timesync;
};
int
portconf_get(int portid, struct port_conf * out);
// constexpr static int LATENCY_MEASURE_TIMES = 10000;

View File

@ -10,8 +10,7 @@
#include <rte_udp.h>
#include <unistd.h>
#include "nm.hh"
#include "util.hh"
#include "defs.hh"
#include <random>
@ -24,11 +23,23 @@
constexpr static uint32_t MAX_JUMBO_MTU = 9000;
constexpr static uint32_t MAX_STANDARD_MTU = 1500;
static inline int mtu_to_pkt_size(int mtu)
static inline int
mtu_to_pkt_size(int mtu)
{
return mtu + RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN;
}
static inline void
tx_burst_all(int portid, int txqid, struct rte_mbuf ** tx_bufs, int sz)
{
int remaining = sz;
while(remaining > 0) {
remaining -= rte_eth_tx_burst(
portid, txqid, &tx_bufs[sz - remaining],
remaining);
}
}
constexpr static uint32_t ETHER_FRAME_MAGIC = 0xDCDCE5E5;
const static struct rte_ether_addr POU_MAC {
0x01, 0x00, 0x5e, 0x00, 0x01, 0x81
@ -87,7 +98,7 @@ pkt_hdr_to_netspec(struct pkt_hdr *pkt, struct net_spec *src,
uint16_t *src_port, struct net_spec *dst, uint16_t *dst_port)
{
if (src != nullptr) {
rte_ether_addr_copy(&pkt->eth_hdr.s_addr, &src->mac_addr);
rte_ether_addr_copy(&pkt->eth_hdr.src_addr, &src->mac_addr);
src->ip = rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr);
}
@ -96,7 +107,7 @@ pkt_hdr_to_netspec(struct pkt_hdr *pkt, struct net_spec *src,
}
if (dst != nullptr) {
rte_ether_addr_copy(&pkt->eth_hdr.d_addr, &dst->mac_addr);
rte_ether_addr_copy(&pkt->eth_hdr.dst_addr, &dst->mac_addr);
dst->ip = rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr);
}
@ -203,7 +214,7 @@ class rdport_generator {
, cur(0)
, dist(0, MAX_PORT - min_port)
{
gen.seed(nm_get_uptime_ns());
gen.seed(get_uptime());
cur = dist(gen);
}
uint16_t next()
@ -224,23 +235,23 @@ class rdport_generator {
(rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 8) & 0xff, \
(rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 0) & 0xff, \
rte_be_to_cpu_16(pkt->udp_hdr.src_port), \
pkt->eth_hdr.s_addr.addr_bytes[0], \
pkt->eth_hdr.s_addr.addr_bytes[1], \
pkt->eth_hdr.s_addr.addr_bytes[2], \
pkt->eth_hdr.s_addr.addr_bytes[3], \
pkt->eth_hdr.s_addr.addr_bytes[4], \
pkt->eth_hdr.s_addr.addr_bytes[5], \
pkt->eth_hdr.src_addr.addr_bytes[0], \
pkt->eth_hdr.src_addr.addr_bytes[1], \
pkt->eth_hdr.src_addr.addr_bytes[2], \
pkt->eth_hdr.src_addr.addr_bytes[3], \
pkt->eth_hdr.src_addr.addr_bytes[4], \
pkt->eth_hdr.src_addr.addr_bytes[5], \
(rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 24) & 0xff, \
(rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 16) & 0xff, \
(rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 8) & 0xff, \
(rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 0) & 0xff, \
rte_be_to_cpu_16(pkt->udp_hdr.dst_port), \
pkt->eth_hdr.d_addr.addr_bytes[0], \
pkt->eth_hdr.d_addr.addr_bytes[1], \
pkt->eth_hdr.d_addr.addr_bytes[2], \
pkt->eth_hdr.d_addr.addr_bytes[3], \
pkt->eth_hdr.d_addr.addr_bytes[4], \
pkt->eth_hdr.d_addr.addr_bytes[5], rte_be_to_cpu_16(pkt->type))
pkt->eth_hdr.dst_addr.addr_bytes[0], \
pkt->eth_hdr.dst_addr.addr_bytes[1], \
pkt->eth_hdr.dst_addr.addr_bytes[2], \
pkt->eth_hdr.dst_addr.addr_bytes[3], \
pkt->eth_hdr.dst_addr.addr_bytes[4], \
pkt->eth_hdr.dst_addr.addr_bytes[5], rte_be_to_cpu_16(pkt->type))
static inline void
print_mac(struct rte_ether_addr *mac)
@ -253,7 +264,7 @@ print_mac(struct rte_ether_addr *mac)
static inline void
print_ipv4(uint32_t ip)
{
printf("%d-%d-%d-%d", (ip >> 24) & 0xff, (ip >> 16) & 0xff,
printf("%d.%d.%d.%d", (ip >> 24) & 0xff, (ip >> 16) & 0xff,
(ip >> 8) & 0xff, (ip >> 0) & 0xff);
}
@ -276,10 +287,10 @@ dump_pkt(struct rte_mbuf *pkt)
"Packet %p: Length 0x%x\n", (void *)pkt, rte_pktmbuf_data_len(pkt));
printf(" Ethernet header:\n");
printf(" Src:");
print_mac(&eth_hdr->s_addr);
print_mac(&eth_hdr->src_addr);
printf("\n");
printf(" Dst:");
print_mac(&eth_hdr->d_addr);
print_mac(&eth_hdr->dst_addr);
printf("\n");
printf(" Type: 0x%x\n", rte_be_to_cpu_16(eth_hdr->ether_type));
@ -339,11 +350,11 @@ construct_pkt_hdr(
// construct l2 header
eth_hdr = &pkt_data->eth_hdr;
rte_ether_addr_copy(&conn->src->mac_addr, &eth_hdr->s_addr);
rte_ether_addr_copy(&conn->src->mac_addr, &eth_hdr->src_addr);
if (is_ts_pkt) {
rte_ether_addr_copy(&POU_MAC, &eth_hdr->d_addr);
rte_ether_addr_copy(&POU_MAC, &eth_hdr->dst_addr);
} else {
rte_ether_addr_copy(&conn->dst->mac_addr, &eth_hdr->d_addr);
rte_ether_addr_copy(&conn->dst->mac_addr, &eth_hdr->dst_addr);
}
eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
buf->l2_len = sizeof(struct rte_ether_hdr);
@ -378,10 +389,13 @@ construct_pkt_hdr(
udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
udp_hdr->dgram_len = total_sz - sizeof(struct rte_ether_hdr) - sizeof(struct rte_ipv4_hdr) - sizeof(struct rte_udp_hdr);
buf->l4_len = sizeof(struct rte_udp_hdr);
buf->ol_flags |= RTE_MBUF_F_TX_IPV4;
buf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
buf->ol_flags |= RTE_MBUF_F_TX_UDP_CKSUM;
if (is_ts_pkt) {
// set misc flags
buf->ol_flags |= PKT_TX_IEEE1588_TMST;
buf->ol_flags |= RTE_MBUF_F_TX_IEEE1588_TMST;
pkt_data->ptp_hdr.ptp_ver = 0x2; // VER 2
pkt_data->ptp_hdr.ptp_msg_type = 0x0; // SYNC
} else {
@ -463,7 +477,7 @@ check_valid_packet(struct rte_mbuf *pkt, const struct rte_ether_addr *host_mac)
}
if (!rte_is_same_ether_addr(
expected_mac, &pkt_data->eth_hdr.d_addr))
expected_mac, &pkt_data->eth_hdr.dst_addr))
return nullptr;
}

103
inc/nm.hh
View File

@ -1,103 +0,0 @@
#pragma once
#include <sys/endian.h>
#include "gen.hh"
#include "defs.hh"
#include <atomic>
#include <cstdint>
#include <chrono>
#include <ctime>
constexpr static unsigned int NM_LEVEL_NUMA = 0;
constexpr static unsigned int NM_LEVEL_CPU = 1;
constexpr static unsigned int NM_LEVEL_CORE = 2;
constexpr static unsigned int NM_MAX_LEVEL = NM_LEVEL_CORE + 1;
constexpr static int NM_MAX_OBJS_PER_LVL = 256;
// misc functions
// 0 on success -1 on error
int nm_init(int verbosity);
uint64_t nm_tsc2ns(uint64_t tsc);
uint64_t nm_get_uptime_ns();
// topology stuff
struct nm_obj;
struct nm_obj *
nm_find_parent_obj(struct nm_obj * start, int parent_level);
int
nm_obj_count(int level);
struct nm_obj *
nm_obj_from_id(int level, int id);
struct nm_obj *
nm_obj_find_parent(struct nm_obj * start, int parent_level);
int
nm_obj_get_id(struct nm_obj * obj);
int
nm_obj_get_level(struct nm_obj * obj);
static inline int
nm_get_node_from_core(int coreid)
{
return nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, coreid), NM_LEVEL_NUMA));
}
// memload generator
class memload_generator {
private:
DISALLOW_EVIL_CONSTRUCTORS(memload_generator);
struct thread_info {
pthread_t pthr;
uint32_t iter;
uint32_t array_size;
std::atomic<int> init_status;
std::atomic<int> *state;
std::atomic<uint32_t> num_trans;
constexpr static int INIT_START = 0;
constexpr static int INIT_SUCCESS = 1;
constexpr static int INIT_FAILED = 2;
void *from_region;
void *to_region;
uint32_t to_domainid;
uint32_t from_domainid;
uint64_t begin_ts;
uint64_t stop_ts;
};
std::vector<struct thread_info *> thr_infos;
std::atomic<int> state;
constexpr static uint32_t STATE_READY = 0;
constexpr static uint32_t STATE_START = 1;
constexpr static uint32_t STATE_STOP = 2;
uint32_t array_size;
uint32_t iteration;
static void *worker_thrd(void *_tinfo);
public:
memload_generator(uint64_t from_cmask, uint64_t to_cmask, uint32_t array_sz, uint32_t iteration,
bool *success);
void start();
void stop();
uint64_t get_bps();
bool check_done();
~memload_generator();
};
// allocators
void *
nm_malloc(unsigned int node, size_t size);
void
nm_free(unsigned int node, void * addr);

20
inc/nms.h Normal file
View File

@ -0,0 +1,20 @@
#pragma once
#include <sys/types.h>
#ifdef __cplusplus
extern "C" {
#endif
int
nms_init(int verbose);
void *
nms_malloc(int nodeid, size_t sz);
void
nms_free(int nodeid, void * addr);
#ifdef __cplusplus
}
#endif // __cplusplus

136
libgen/loadgen.cc Normal file
View File

@ -0,0 +1,136 @@
#include <atomic>
#include <pthread.h>
#include <pthread_np.h>
#include <unistd.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>
#include <sys/endian.h>
#include <sys/thr.h>
#include <topo.h>
#include "gen.hh"
#include "nms.h"
void *
memload_generator::worker_thrd(void *_tinfo)
{
auto *tinfo = (struct thread_info *)_tinfo;
long tid;
thr_self(&tid);
if (tinfo->opts->verbose) {
fprintf(
stdout, "memload_generator <thread %ld>: running...\n", tid);
}
tinfo->begin_ts = topo_uptime_ns();
while (tinfo->num_trans.load() < tinfo->opts->iteration) {
memcpy((char *)tinfo->from_buffer, (char *)tinfo->to_buffer, tinfo->opts->chunk_size);
tinfo->end_ts = topo_uptime_ns();
tinfo->num_trans.fetch_add(1);
}
if (tinfo->opts->verbose) {
fprintf(
stdout, "memload_generator <thread %ld>: exiting...\n", tid);
}
return nullptr;
}
memload_generator::memload_generator(cpuset_t * threads, cpuset_t * target_domain, struct memload_generator_options * opt, bool *success)
{
*success = false;
end.store(0);
std::memcpy(&this->opts, opt, sizeof(memload_generator_options));
int nextcore = CPU_FFS(threads) - 1;
int target_domain_id = CPU_FFS(target_domain) - 1;
int num_cores = CPU_COUNT(threads);
if (target_domain_id < 0 || num_cores == 0) {
return;
}
void * local_buffer;
void * target_buffer;
if (opts.shared_buffer) {
local_buffer = nms_malloc(topo_core_to_numa(nextcore), opt->chunk_size);
target_buffer = nms_malloc(target_domain_id, opt->chunk_size);
}
int tid = 0;
while (nextcore != -1) {
auto *info = new struct thread_info;
cpuset_t cpuset;
pthread_attr_t attr;
info->stop = &this->end;
info->num_trans.store(0);
info->begin_ts = 0;
info->end_ts = 0;
info->opts = &this->opts;
if (opt->shared_buffer) {
info->from_buffer = local_buffer;
info->to_buffer = target_buffer;
} else {
info->from_buffer = nms_malloc(topo_core_to_numa(nextcore), opt->chunk_size);
info->to_buffer = nms_malloc(target_domain_id, opt->chunk_size);
}
CPU_ZERO(&cpuset);
CPU_SET(nextcore, &cpuset);
pthread_attr_init(&attr);
pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &cpuset);
pthread_create(&info->pthr, &attr, worker_thrd, info);
if (opts.verbose) {
fprintf(stdout,
"memload_generator: created thread %d on core %d target domain %d\n", tid,
nextcore, target_domain_id);
}
thr_infos.push_back(info);
CPU_CLR(nextcore, threads);
nextcore = CPU_FFS(threads) - 1;
tid++;
}
*success = true;
if (opts.verbose) {
fprintf(stdout,
"memload_generator: exiting constructor. Success: %d...\n",
success ? 1 : 0);
}
}
bool
memload_generator::check_done()
{
bool done = true;
for (auto i : thr_infos) {
done = done && (i->num_trans.load() >= this->opts.iteration);
}
return done;
}
uint64_t
memload_generator::get_bps()
{
uint64_t total_transactions = 0;
uint64_t total_time = 0;
for (auto i : thr_infos) {
total_transactions += i->num_trans.load();
total_time = (i->end_ts - i->begin_ts) + total_time;
}
return (double)(total_transactions * this->opts.chunk_size) / ((double)total_time / thr_infos.size() / S2NS);
}
memload_generator::~memload_generator()
{
for (auto i : thr_infos) {
// XXX: free
pthread_join(i->pthr, NULL);
delete i;
}
}

View File

@ -1,113 +0,0 @@
#include <pthread.h>
#include <sys/types.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>
#include <sys/thr.h>
#include <sys/mman.h>
#include <cerrno>
#include "nmp.hh"
static pthread_mutex_t alloc_lock;
static constexpr unsigned int MEM_OBJ_SIZE = 4096; // 4k
static constexpr unsigned int MEM_OBJ_NUM = 1024 * 256; // 4k * 1024 * 256 = 1GB per region
static constexpr unsigned int MEM_REGION_NUM = 4; // 4 x 1GB = 4GB total
static int nm_mem_idx[NM_MAX_OBJS_PER_LVL];
static int nm_mem_region_idx[NM_MAX_OBJS_PER_LVL];
static void* nm_mem_regions[NM_MAX_OBJS_PER_LVL][MEM_REGION_NUM];
int
nm_alloc_init()
{
long tid;
thr_self(&tid);
domainset_t orig_dom;
int orig_policy;
pthread_mutex_init(&alloc_lock, nullptr);
DOMAINSET_ZERO(&orig_dom);
// save existing thread's allocation strategy
int ret = cpuset_getdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, &orig_policy);
if (ret != 0) {
return ret;
}
domainset_t tmp_domain;
for (int i = 0; i < nm_obj_count(NM_LEVEL_NUMA); i++) {
DOMAINSET_ZERO(&tmp_domain);
DOMAINSET_SET(i, &tmp_domain);
ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(tmp_domain), &tmp_domain, DOMAINSET_POLICY_PREFER);
if (ret != 0) {
if (nm_get_verbose() > 0) {
fprintf(stdout, "libnm: cpuset_setdomain failed with %d\n", errno);
}
return ret;
}
for (unsigned int j = 0; j < MEM_REGION_NUM; j++) {
if ((nm_mem_regions[i][j] = mmap(nullptr, MEM_OBJ_NUM * MEM_OBJ_SIZE, PROT_READ | PROT_WRITE,
MAP_ANON | MAP_ALIGNED_SUPER | MAP_NOCORE | MAP_PRIVATE | MAP_NOSYNC,
-1, 0)) == MAP_FAILED) {
if (nm_get_verbose() > 0) {
fprintf(stdout, "libnm: mmap failed with %d\n", errno);
}
return -1;
}
// touch the pages to prefault the pages
for (unsigned int k = 0; k < MEM_OBJ_NUM; k++) {
*(uint32_t*)((char*)nm_mem_regions[i][j] + k * MEM_OBJ_SIZE) = 0;
}
if (nm_get_verbose() > 0) {
fprintf(stdout, "libnm: reserved %u bytes (%u MB) on node %d. vaddr: 0x%p\n", MEM_OBJ_NUM * MEM_OBJ_SIZE, MEM_OBJ_SIZE * MEM_OBJ_NUM / 1024 / 1024, i, nm_mem_regions[i][j]);
}
}
nm_mem_idx[i] = 0;
nm_mem_region_idx[i] = 0;
}
// restore existing thread's allocation strategy
ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, orig_policy);
return ret;
}
void *
nm_malloc(unsigned int node, size_t size)
{
void * ret = nullptr;
int num_objs = (size + MEM_OBJ_SIZE - 1) / MEM_OBJ_SIZE;
bool retry = false;
pthread_mutex_lock(&alloc_lock);
int cur_region = nm_mem_region_idx[node];
int cur_idx = nm_mem_idx[node];
retry:
if ((int)MEM_OBJ_NUM - cur_idx >= num_objs) {
ret = (char*)nm_mem_regions[node][cur_region] + MEM_OBJ_SIZE * cur_idx;
nm_mem_region_idx[node] = cur_region;
nm_mem_idx[node] = cur_idx + num_objs;
} else if (!retry && (cur_region < (int)MEM_REGION_NUM)) {
// check next region
cur_region++;
cur_idx = 0;
retry = true;
goto retry;
}
pthread_mutex_unlock(&alloc_lock);
return ret;
}
void
nm_free(unsigned int node ATTR_UNUSED, void * addr ATTR_UNUSED)
{
// dummy function
}

View File

@ -1,217 +0,0 @@
#include "nmp.hh"
#include <atomic>
#include <sys/cpuset.h>
#include <sys/domainset.h>
#include <sys/endian.h>
#include <sys/thr.h>
#include <pthread.h>
#include <pthread_np.h>
#include <unistd.h>
void *
memload_generator::worker_thrd(void *_tinfo)
{
auto *tinfo = (struct thread_info *)_tinfo;
long tid;
thr_self(&tid);
if (nm_get_verbose() > 0) {
fprintf(stdout,
"memload_generator <thread %ld>: from domain %d to %d\n",
tid, tinfo->from_domainid, tinfo->to_domainid);
}
if (tinfo->from_region == nullptr) {
tinfo->from_region = nm_malloc(tinfo->from_domainid, tinfo->array_size);
}
if (tinfo->to_region == nullptr) {
tinfo->to_region = nm_malloc(tinfo->to_domainid, tinfo->array_size);
}
if (tinfo->from_region == nullptr || tinfo->to_region == nullptr) {
tinfo->init_status.store(thread_info::INIT_FAILED);
if (nm_get_verbose() > 0) {
fprintf(stderr,
"memload_generator <thread %ld>: failed to allocate memory\n", tid);
}
return nullptr;
}
// populate the region with 1/2/3s
memset((char*)tinfo->from_region, 1, tinfo->array_size);
memset((char*)tinfo->to_region, 2, tinfo->array_size);
tinfo->init_status.store(thread_info::INIT_SUCCESS);
if (nm_get_verbose() > 0) {
fprintf(stdout,
"memload_generator <thread %ld>: init finished, from: %p, to: %p, waiting for start...\n",
tid, tinfo->from_region, tinfo->to_region);
}
while (tinfo->state->load() == STATE_READY) {
};
if (nm_get_verbose() > 0) {
fprintf(
stdout, "memload_generator <thread %ld>: running...\n", tid);
}
tinfo->begin_ts = nm_get_uptime_ns();
while (tinfo->state->load() == STATE_START) {
if (tinfo->num_trans.load() < tinfo->iter) {
// generate traffic
memcpy((char *)tinfo->to_region, (char *)tinfo->from_region, tinfo->array_size);
tinfo->num_trans.fetch_add(1);
if (tinfo->num_trans.load() >= tinfo->iter) {
tinfo->stop_ts = nm_get_uptime_ns();
}
} else {
usleep(1000 * 100);
}
}
//nm_free(tinfo->from_domainid, tinfo->from_region);
//nm_free(tinfo->to_domainid, tinfo->to_region);
if (nm_get_verbose() > 0) {
fprintf(
stdout, "memload_generator <thread %ld>: exiting...\n", tid);
}
return nullptr;
}
memload_generator::memload_generator(
uint64_t from_cmask, uint64_t to_cmask, uint32_t array_sz, uint32_t iteration, bool *success)
{
*success = false;
state.store(STATE_READY);
this->array_size = array_sz;
this->iteration = iteration;
int nextcore;
int to_coreid = cmask_get_next_cpu(&to_cmask);
int num_cores = cmask_get_num_cpus(from_cmask);
cpuset_t cpuset;
if (to_coreid == NEXT_CPU_NULL || num_cores == 0) {
return;
}
void * from = nm_malloc(nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, to_coreid), NM_LEVEL_NUMA)), array_sz);
nextcore = cmask_get_next_cpu(&from_cmask);
void * to = nm_malloc(nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, nextcore), NM_LEVEL_NUMA)), array_sz);
while (nextcore != NEXT_CPU_NULL) {
auto *info = new struct thread_info;
pthread_attr_t attr;
info->num_trans.store(0);
info->to_domainid = nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, to_coreid), NM_LEVEL_NUMA));
info->from_domainid = nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, nextcore), NM_LEVEL_NUMA));
info->init_status.store(thread_info::INIT_START);
info->state = &state;
info->iter = this->iteration;
info->array_size = this->array_size;
info->from_region = from;
info->to_region = to;
CPU_ZERO(&cpuset);
CPU_SET(nextcore, &cpuset);
pthread_attr_init(&attr);
pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &cpuset);
pthread_create(&info->pthr, &attr, worker_thrd, info);
if (nm_get_verbose() > 0) {
fprintf(stdout,
"memload_generator: created thread on core %d\n",
nextcore);
}
thr_infos.push_back(info);
nextcore = cmask_get_next_cpu(&from_cmask);
}
if (nm_get_verbose() > 0) {
fprintf(
stdout, "memload_generator: waiting for thread init...\n");
}
bool failed = false;
uint num_success = 0;
while (num_success < thr_infos.size() && !failed) {
num_success = 0;
for (auto i : thr_infos) {
if (i->init_status.load() ==
thread_info::INIT_SUCCESS) {
num_success++;
}
if (i->init_status.load() == thread_info::INIT_FAILED) {
failed = true;
}
}
}
*success = num_success == thr_infos.size();
if (nm_get_verbose() > 0) {
fprintf(stdout,
"memload_generator: exiting constructor. Success: %d...\n",
success ? 1 : 0);
}
}
void
memload_generator::start()
{
if (this->state.load() == STATE_READY) {
state.store(STATE_START);
}
}
void
memload_generator::stop()
{
if (this->state.load() == STATE_START) {
state.store(STATE_STOP);
}
}
bool
memload_generator::check_done()
{
bool done = true;
for (auto i : thr_infos) {
done = done && (i->num_trans.load() >= this->iteration);
}
if (done) {
stop();
}
return done;
}
uint64_t
memload_generator::get_bps()
{
if (this->state.load() == STATE_STOP) {
uint64_t total_transactions = 0;
uint64_t total_time = 0;
for (auto i : thr_infos) {
total_transactions += i->num_trans.load();
total_time = (i->stop_ts - i->begin_ts) + total_time;
}
return (double)(total_transactions * this->array_size) / ((double)total_time / thr_infos.size() / S2NS);
} else {
return 0;
}
}
memload_generator::~memload_generator()
{
if (this->state.load() != STATE_STOP) {
stop();
}
for (auto i : thr_infos) {
pthread_join(i->pthr, nullptr);
delete i;
}
}

View File

@ -1,72 +0,0 @@
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/thr.h>
#include <x86intrin.h>
#include <algorithm>
#include <atomic>
#include <vector>
#include <cerrno>
#include "nmp.hh"
#include "defs.hh"
static const char *SYSCTL_TSC = "machdep.tsc_freq";
static uint64_t sysctl_tsc_freq = 0;
static int verbose = 0;
int nm_get_verbose()
{
return verbose;
}
uint64_t
nm_get_uptime_ns()
{
unsigned int dummy;
_mm_lfence();
uint64_t tsc = __rdtscp(&dummy);
_mm_lfence();
return nm_tsc2ns(tsc);
}
uint64_t
nm_tsc2ns(uint64_t tsc)
{
return (uint64_t)(
(double)tsc / (double)sysctl_tsc_freq * S2NS);
}
// 0 on success
// -1 on error
int
nm_init(int verbosity)
{
int ret;
size_t sz = sizeof(sysctl_tsc_freq);
verbose = verbosity;
// init nm_tsc2ns
if ((ret = sysctlbyname(
SYSCTL_TSC, &sysctl_tsc_freq, &sz, nullptr, 0)) < 0) {
if (nm_get_verbose()) {
fprintf(stderr,
"libnm: failed to query tsc frequency via sysctl (%d)\n", errno);
}
return ret;
}
if (nm_get_verbose()) {
fprintf(stdout, "libnm: tsc frequency: %lu\n", sysctl_tsc_freq);
}
ret = nm_topo_init();
if (ret != 0) {
return ret;
}
ret = nm_alloc_init();
return ret;
}

View File

@ -1,9 +0,0 @@
#pragma once
#include "nm.hh"
int nm_topo_init();
int nm_alloc_init();
int nm_get_verbose();

View File

@ -1,204 +0,0 @@
#include "nmp.hh"
#include <cstdio>
#include <cstdlib>
#include <hwloc.h>
constexpr static int NM_MAX_CHILDREN = 128;
struct nm_obj {
int level;
int id;
struct nm_obj *parent;
int num_children;
struct nm_obj * children[NM_MAX_CHILDREN];
};
static int size_tbl[NM_MAX_LEVEL] = { 0 };
static struct nm_obj * obj_tbl[NM_MAX_LEVEL][NM_MAX_OBJS_PER_LVL];
static hwloc_obj_t
get_parent_type(hwloc_obj_t obj, hwloc_obj_type_t type)
{
obj = obj->parent;
while (obj != nullptr) {
if (obj->type == type) {
return obj;
}
obj = obj->parent;
}
return nullptr;
}
// static int
// obj_comparator(const void * a, const void * b)
// {
// return ((struct nm_obj *)a)->id - ((struct nm_obj *)b)->id;
// }
static inline int
is_level_valid(int level)
{
return level < (int)NM_MAX_LEVEL;
}
int
nm_obj_count(int level)
{
if (is_level_valid(level)) {
return size_tbl[level];
}
return -1;
}
struct nm_obj *
nm_obj_from_id(int level, int id)
{
if (is_level_valid(level) && id <= size_tbl[level]) {
return obj_tbl[level][id];
}
return nullptr;
}
struct nm_obj *
nm_obj_find_parent(struct nm_obj * start, int parent_level)
{
struct nm_obj * ret = nullptr;
while (start != nullptr) {
if (parent_level == start->level) {
ret = start;
break;
}
start = start->parent;
}
return ret;
}
int
nm_obj_get_id(struct nm_obj * obj)
{
return obj->id;
}
int
nm_obj_get_level(struct nm_obj * obj)
{
return obj->level;
}
static int
validate_objs(int level)
{
for (int i = 0; i < size_tbl[level]; i++) {
struct nm_obj * each = obj_tbl[level][i];
if (each->id != i) {
return 0;
}
}
return 1;
}
static int
add_obj(int id, int level, struct nm_obj * parent)
{
if (size_tbl[level] >= NM_MAX_OBJS_PER_LVL) {
return -1;
}
auto each = (struct nm_obj *)malloc(sizeof(struct nm_obj));
each->id = id;
each->level = level;
each->num_children = 0;
each->parent = parent;
if (parent != nullptr) {
// add children
if (parent->num_children >= NM_MAX_CHILDREN) {
return -1;
} else {
parent->children[parent->num_children] = each;
parent->num_children++;
}
}
obj_tbl[level][size_tbl[level]] = each;
size_tbl[level]++;
return 0;
}
static int
add_level(hwloc_topology * topo, hwloc_obj_type_t hwloc_level, hwloc_obj_type_t hwloc_plevel, int level, int plevel)
{
// populate numa nodes
hwloc_obj_t obj = nullptr;
hwloc_obj_t pobj = nullptr;
struct nm_obj *parent = nullptr;
while (true) {
obj = hwloc_get_next_obj_by_type(topo, hwloc_level, obj);
if (obj == nullptr) {
break;
}
pobj = get_parent_type(obj, hwloc_plevel);
if (pobj != nullptr) {
parent = obj_tbl[plevel][pobj->logical_index];
}
if (add_obj(obj->logical_index, level, parent) != 0) {
if (nm_get_verbose() > 0) {
fprintf(stderr, "libnm: failed to add object %d.\n", obj->logical_index);
}
return -1;
}
if (nm_get_verbose() > 0) {
fprintf(stdout, "libnm: identified id %d type %d parent %d type %d\n", obj->logical_index, level, parent == nullptr ? -1 : parent->id, plevel);
}
}
// sort
// std::qsort(obj_tbl[level], size_tbl[level], sizeof(struct nm_obj), obj_comparator);
if (!validate_objs(level)) {
if (nm_get_verbose() > 0) {
fprintf(stdout, "libnm: objects are shuffled at level %d.\n", level);
}
return -1;
}
return 0;
}
int
nm_topo_init()
{
int ret;
// init numa stuff
hwloc_topology *topo;
if ((ret = hwloc_topology_init(&topo)) != 0) {
return ret;
}
if ((ret = hwloc_topology_load(topo)) != 0)
return ret;
if ((ret = add_level(topo, HWLOC_OBJ_PACKAGE, HWLOC_OBJ_PACKAGE, NM_LEVEL_NUMA, NM_LEVEL_NUMA)) != 0) {
return ret;
}
if ((ret = add_level(topo, HWLOC_OBJ_CORE, HWLOC_OBJ_PACKAGE, NM_LEVEL_CPU, NM_LEVEL_NUMA)) != 0) {
return ret;
}
if ((ret = add_level(topo, HWLOC_OBJ_PU, HWLOC_OBJ_CORE, NM_LEVEL_CORE, NM_LEVEL_CPU)) != 0) {
return ret;
}
return ret;
}

196
libnms/alloc.c Normal file
View File

@ -0,0 +1,196 @@
#include <pthread.h>
#include <sys/types.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>
#include <sys/thr.h>
#include <sys/mman.h>
#include <stdint.h>
#include <stdio.h>
#include <errno.h>
#include <stdatomic.h>
#include <string.h>
#include <assert.h>
#include <nms.h>
#define MAX_NUMA_DOMAINS (64)
#define MAX_REGIONS (64)
#define REGION_SIZE (1024 * 1024 * 1024)
#define MALLOC_UNIT (4096)
struct nms_region {
uintptr_t start_addr;
size_t size;
size_t occupied;
};
struct nms_desc {
// alloc
pthread_mutex_t alloc_lock;
struct nms_region regions[MAX_NUMA_DOMAINS][MAX_REGIONS];
int region_sz[MAX_NUMA_DOMAINS];
};
static _Atomic(int) initialized = 0;
static struct nms_desc g_desc;
static void *
nms_alloc_region(int nodeid, size_t sz)
{
long tid;
domainset_t orig_dom;
int orig_policy;
void * region;
thr_self(&tid);
DOMAINSET_ZERO(&orig_dom);
// save existing thread's allocation strategy
int ret = cpuset_getdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, &orig_policy);
if (ret != 0) {
fprintf(stderr, "libnms: cpuset_getdomain failed with %d\n", errno);
return NULL;
}
domainset_t tmp_domain;
DOMAINSET_ZERO(&tmp_domain);
DOMAINSET_SET(nodeid, &tmp_domain);
ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(tmp_domain), &tmp_domain, DOMAINSET_POLICY_ROUNDROBIN);
if (ret != 0) {
fprintf(stderr, "libnms: cpuset_setdomain failed with %d\n", errno);
return NULL;
}
if ((region = mmap(NULL, REGION_SIZE, PROT_READ | PROT_WRITE, MAP_ANON | MAP_ALIGNED_SUPER | MAP_NOCORE | MAP_PRIVATE | MAP_NOSYNC, -1, 0)) == MAP_FAILED) {
fprintf(stderr, "libnms: mmap failed with %d\n", errno);
return NULL;
}
// touch the pages to prefault the pages
for (size_t i = 0; i < REGION_SIZE; i += MALLOC_UNIT) {
*(uint8_t*)((uintptr_t)region + i) = 0;
}
// restore existing thread's allocation strategy
ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, orig_policy);
if (ret != 0) {
fprintf(stderr, "libnms: cpuset_setdomain failed with %d\n", errno);
munmap(region, REGION_SIZE);
return NULL;
}
return region;
}
static int
nms_desc_init(struct nms_desc * desc, int verbose)
{
memset(desc, 0, sizeof(struct nms_desc));
pthread_mutex_init(&desc->alloc_lock, NULL);
return 0;
}
static void *
nms_region_malloc(struct nms_region * region, size_t size)
{
void * ret = NULL;
if (region->size >= region->occupied + size) {
ret = (void *)(region->start_addr + region->occupied);
region->occupied += size;
region->occupied = (region->occupied + MALLOC_UNIT - 1) & ~(MALLOC_UNIT - 1);
}
return ret;
}
static int
nms_desc_add_region(struct nms_desc * desc, int nodeid, size_t size)
{
void * ret;
int idx;
ret = nms_alloc_region(nodeid, REGION_SIZE);
if (ret == NULL) {
fprintf(stderr, "libnms: failed to allocate region on node %d\n", nodeid);
return ENOMEM;
}
desc->region_sz[nodeid]++;
idx = desc->region_sz[nodeid] - 1;
desc->regions[nodeid][idx].start_addr = (uintptr_t)ret;
desc->regions[nodeid][idx].occupied = 0;
desc->regions[nodeid][idx].size = REGION_SIZE;
return 0;
}
static void *
nms_desc_malloc(struct nms_desc * desc, unsigned int nodeid, size_t size)
{
void * ret = NULL;
int idx;
int new_region = 0;
if (size > REGION_SIZE) {
return NULL;
}
pthread_mutex_lock(&desc->alloc_lock);
retry:
if (desc->region_sz[nodeid] > 0) {
idx = desc->region_sz[nodeid] - 1;
ret = nms_region_malloc(&desc->regions[nodeid][idx], size);
}
if (ret == NULL) {
// we need a new region
if (nms_desc_add_region(desc, nodeid, REGION_SIZE) != 0) {
pthread_mutex_unlock(&desc->alloc_lock);
return NULL;
}
fprintf(stdout, "libnms: request of size %zu -> allocated new region on node %d\n", size, nodeid);
goto retry;
}
pthread_mutex_unlock(&desc->alloc_lock);
return ret;
}
static void
nms_desc_free(struct nms_desc * desc __attribute__((unused)), unsigned int node __attribute__((unused)), void * addr __attribute__((unused)))
{
// dummy function
}
int
nms_init(int verbose)
{
int expected = 0;
if (atomic_compare_exchange_strong(&initialized, &expected, 2)) {
nms_desc_init(&g_desc, verbose);
atomic_store(&initialized, 1);
} else {
while(atomic_load(&initialized) != 1) {
}
fprintf(stdout,"libnms: already initialized.\n");
}
return 0;
}
void *
nms_malloc(int nodeid, size_t sz)
{
assert(atomic_load(&initialized) == 1);
return nms_desc_malloc(&g_desc, nodeid, sz);
}
void
nms_free(int nodeid, void * addr)
{
assert(atomic_load(&initialized) == 1);
nms_desc_free(&g_desc, nodeid, addr);
}

View File

@ -1,3 +1,12 @@
#include <sys/_timespec.h>
#include <atomic>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <random>
#include <vector>
#include <topo.h>
#include <rte_byteorder.h>
#include <rte_common.h>
#include <rte_config.h>
@ -10,29 +19,16 @@
#include <rte_mbuf.h>
#include <unistd.h>
#include "gen.hh"
#include "nm.hh"
#include "ntr.h"
#include "gen.hh"
#include "net/netsup.hh"
#include "net/pkt.hh"
#include "net/util.hh"
#include "nms.h"
#include <atomic>
#include <ctime>
#include <fstream>
#include <random>
#include <vector>
#define MBUF_MAX_COUNT (rte_lcore_count() * 4096)
constexpr static unsigned int MBUF_CACHE_SIZE = 512;
constexpr static unsigned int RX_RING_SIZE = 1024;
constexpr static unsigned int TX_RING_SIZE = 1024;
constexpr static unsigned int BURST_SIZE = 32;
constexpr static unsigned int MAX_SLAVES = 32;
constexpr static unsigned int SLAVES_MAX_WAIT_MS = 1000;
static const struct rte_eth_conf port_conf_default {
};
struct datapt {
uint32_t epoch;
uint32_t valid;
@ -59,23 +55,21 @@ struct options_t {
char ia_gen_str[256] = "fixed";
unsigned int target_qps { 0 };
unsigned int master_mode { 0 };
struct net_spec server_spec {
};
struct net_spec server_spec { };
cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 2nd core
std::vector<struct net_spec *> slaves;
uint32_t pkt_loss_failure_threshold { 0 };
uint32_t pkt_loss_time_ms { UINT32_MAX };
int portid { 0 };
// states
struct rte_mempool *mbuf_pool { nullptr };
struct net_spec s_host_spec {
};
struct net_spec s_host_spec { };
struct conn_spec s_host_conn {
.src = &s_host_spec, .dst = &server_spec, .dst_port = POU_PORT
};
uint16_t s_portid { 0 };
unsigned int s_rxqid { 0 };
unsigned int s_txqid { 0 };
unsigned int s_socketid { 0 };
// for qps calculation
std::atomic<uint32_t> s_recved_pkts { 0 };
std::atomic<uint32_t> s_pkt_loss { 0 };
@ -97,14 +91,13 @@ struct options_t {
static struct options_t options;
static uint16_t
rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
rx_add_timestamp(uint16_t port, uint16_t qidx __rte_unused,
struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused,
void *_ __rte_unused)
{
uint64_t now = nm_tsc2ns(rte_rdtsc());
uint64_t now = topo_uptime_ns();
struct pkt_hdr *pkt_data;
struct timespec ts {
};
struct timespec ts { };
int ret;
if (options.s_state != STATE_SENT) {
@ -112,8 +105,8 @@ rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
}
for (int i = 0; i < nb_pkts; i++) {
pkt_data = check_valid_packet(
pkts[i], &options.s_host_spec.mac_addr);
pkt_data = check_valid_packet(pkts[i],
&options.s_host_spec.mac_addr);
if (pkt_data == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
@ -165,16 +158,16 @@ static uint16_t
tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
{
uint64_t now = nm_tsc2ns(rte_rdtsc());
uint64_t now = topo_uptime_ns();
struct pkt_hdr *pkt_data;
if (options.s_state != STATE_SENT) {
return nb_pkts;
}
// if (options.s_state != STATE_SENT) {
// return nb_pkts;
// }
for (int i = 0; i < nb_pkts; i++) {
pkt_data = check_valid_packet(
pkts[i], &options.s_host_spec.mac_addr);
pkt_data = check_valid_packet(pkts[i],
&options.s_host_spec.mac_addr);
if (pkt_data == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
@ -214,6 +207,7 @@ static void
send_all_slaves(uint16_t type)
{
struct rte_mbuf *tx_bufs[MAX_SLAVES];
//struct rte_eth_stats stats;
struct conn_spec cspec;
cspec.src = &options.s_host_spec;
@ -224,13 +218,18 @@ send_all_slaves(uint16_t type)
for (unsigned int i = 0; i < options.slaves.size(); i++) {
struct pkt_hdr *hdr;
cspec.dst = options.slaves.at(i);
if (alloc_pkt_hdr(options.mbuf_pool, type, &cspec, 0, &tx_bufs[i],
&hdr) != 0) {
if (alloc_pkt_hdr(mempool_get(options.s_socketid), type, &cspec, 0,
&tx_bufs[i], &hdr) != 0) {
rte_exit(EXIT_FAILURE, "failed to alloc packet\n");
}
}
if (rte_eth_tx_burst(options.s_portid, options.s_txqid, tx_bufs,
// if (rte_eth_stats_get(options.portid, &stats) != 0 ) {
// rte_exit(EXIT_FAILURE, "failed!");
// }
// printf("send_all_slaves: ipackets %lu, opackets %lu, ierrors %lu, oerrors %lu\n", stats.ipackets, stats.opackets, stats.ierrors, stats.oerrors);
if (rte_eth_tx_burst(options.portid, options.s_txqid, tx_bufs,
options.slaves.size()) != options.slaves.size()) {
rte_exit(EXIT_FAILURE, "failed to send some packets\n");
}
@ -243,14 +242,14 @@ wait_for_slaves(uint16_t etype, struct rte_mbuf **out)
{
struct rte_mbuf *tx_bufs[MAX_SLAVES];
bool stop = false;
const uint64_t start = nm_get_uptime_ns();
const uint64_t start = topo_uptime_ns();
std::vector<struct rte_ether_addr *> recved;
uint32_t tot = 0;
while (!stop) {
uint64_t now = nm_get_uptime_ns();
const uint16_t nb_rx = rte_eth_rx_burst(
options.s_portid, options.s_rxqid, tx_bufs, MAX_SLAVES);
uint64_t now = topo_uptime_ns();
const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
options.s_rxqid, tx_bufs, MAX_SLAVES);
if (nb_rx > 0) {
for (unsigned int i = 0; i < nb_rx; i++) {
@ -275,7 +274,7 @@ wait_for_slaves(uint16_t etype, struct rte_mbuf **out)
if (rte_is_same_ether_addr(
&eaddr->mac_addr,
&each->eth_hdr
.s_addr)) {
.src_addr)) {
invalid = false;
break;
}
@ -298,7 +297,7 @@ wait_for_slaves(uint16_t etype, struct rte_mbuf **out)
if (rte_is_same_ether_addr(
eaddr,
&each->eth_hdr
.s_addr)) {
.src_addr)) {
invalid = true;
break;
}
@ -314,7 +313,8 @@ wait_for_slaves(uint16_t etype, struct rte_mbuf **out)
goto end_loop;
}
recved.push_back(&each->eth_hdr.s_addr);
recved.push_back(
&each->eth_hdr.src_addr);
if (recved.size() ==
options.slaves.size()) {
@ -337,9 +337,15 @@ wait_for_slaves(uint16_t etype, struct rte_mbuf **out)
}
}
// struct rte_eth_stats stats;
// if (rte_eth_stats_get(options.portid, &stats) != 0 ) {
// rte_exit(EXIT_FAILURE, "failed!");
// }
//printf("wait_slaves <AFTER>: ipackets %lu, opackets %lu, ierrors %lu, oerrors %lu\n", stats.ipackets, stats.opackets, stats.ierrors, stats.oerrors);
if (now - start > SLAVES_MAX_WAIT_MS * MS2NS) {
rte_exit(
EXIT_FAILURE, "cat: waiting for too long %d. I QUIT!!", etype);
rte_exit(EXIT_FAILURE,
"cat: waiting for too long %d. I QUIT!!", etype);
}
}
}
@ -356,25 +362,25 @@ pkt_loop()
bool recv_stat = true;
bool recv_resp = true;
if (rte_eth_dev_socket_id(options.s_portid) > 0 &&
rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) {
if (rte_eth_dev_socket_id(options.portid) > 0 &&
rte_eth_dev_socket_id(options.portid) != (int)rte_socket_id()) {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
"locore_main: WARNING, port %d is on remote NUMA node to "
"polling thread.\n\tPerformance will "
"not be optimal.\n",
options.s_portid);
options.portid);
}
uint64_t next_ts = nm_get_uptime_ns();
uint64_t next_ts = topo_uptime_ns();
uint64_t last_send_ts = next_ts;
bool is_last_pkt_lost = false;
uint32_t num_cts_pkt_lost = 0;
while (!options.s_stop.load()) {
uint64_t now = nm_get_uptime_ns();
uint64_t now = topo_uptime_ns();
// always pop incoming packets
const uint16_t nb_rx = rte_eth_rx_burst(
options.s_portid, options.s_rxqid, rx_bufs, BURST_SIZE);
const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
options.s_rxqid, rx_bufs, BURST_SIZE);
if (nb_rx > 0) {
for (int i = 0; i < nb_rx; i++) {
@ -398,7 +404,7 @@ pkt_loop()
uint16_t type = rte_be_to_cpu_16(each->type);
NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, each,
"locore_main: ");
"locore_main: received packet %p ", each);
struct pkt_payload_epoch *pld_epoch;
struct pkt_payload_stat *pld_stat;
uint32_t epoch;
@ -409,6 +415,8 @@ pkt_loop()
epoch = rte_be_to_cpu_32(
pld_epoch->epoch);
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "lcore_main: PROBE_RESP received packet %p epoch %d\n", each, epoch);
if (options.s_last_datapt == nullptr ||
epoch !=
options.s_last_datapt->epoch) {
@ -466,9 +474,10 @@ pkt_loop()
if (options.s_state == STATE_SENT) {
// check if hw ts is read
if (!read_tx) {
int ret;
struct timespec ts;
if (rte_eth_timesync_read_tx_timestamp(
options.s_portid, &ts) == 0) {
if ((ret = rte_eth_timesync_read_tx_timestamp(
options.portid, &ts)) == 0) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: read hw tx timestamp %lu.\n",
(ts.tv_nsec + ts.tv_sec * S2NS));
@ -553,7 +562,7 @@ pkt_loop()
S2NS);
options.s_host_conn.src_port = port_gen.next();
if (alloc_pkt_hdr(options.mbuf_pool,
if (alloc_pkt_hdr(mempool_get(options.s_socketid),
PKT_TYPE_PROBE, &options.s_host_conn, 0,
&tx_buf, &pkt_data) != 0) {
rte_exit(EXIT_FAILURE,
@ -570,24 +579,25 @@ pkt_loop()
options.s_last_datapt->valid =
options.s_record.load();
read_tx = false;
recv_resp = false;
recv_stat = false;
last_send_ts = now;
options.s_state = STATE_SENT;
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: sending packet %p with epoch %d\n",
"locore_main: sending packet 0x%p with epoch %d\n",
(void *)tx_buf, epoch);
const uint16_t nb_tx = rte_eth_tx_burst(
options.s_portid, options.s_txqid, &tx_buf,
1);
const uint16_t nb_tx =
rte_eth_tx_burst(options.portid,
options.s_txqid, &tx_buf, 1);
if (nb_tx != 1) {
rte_exit(EXIT_FAILURE,
"failed to send packet 0x%p, epoch %d\n",
(void *)tx_buf, epoch);
}
read_tx = false;
recv_resp = false;
recv_stat = false;
options.s_state = STATE_SENT;
}
}
}
@ -611,9 +621,9 @@ locore_main(void *tif __rte_unused)
wait_for_slaves(PKT_TYPE_SYNC_ACK, nullptr);
}
options.s_start_time.store(nm_get_uptime_ns());
options.s_start_time.store(topo_uptime_ns());
pkt_loop();
options.s_end_time.store(nm_get_uptime_ns());
options.s_end_time.store(topo_uptime_ns());
if (options.master_mode == 1) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
@ -627,8 +637,8 @@ locore_main(void *tif __rte_unused)
for (unsigned int i = 0; i < options.slaves.size(); i++) {
// these packets already underwent validity check in
// wait_for_slaves
auto pkt_hdr = rte_pktmbuf_mtod(
mbufs[i], struct pkt_hdr *);
auto pkt_hdr = rte_pktmbuf_mtod(mbufs[i],
struct pkt_hdr *);
auto pld_qps = (struct pkt_payload_qps *)
pkt_hdr->payload;
uint32_t qps = rte_be_to_cpu_32(pld_qps->qps);
@ -650,95 +660,6 @@ locore_main(void *tif __rte_unused)
return 0;
}
static int
port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
{
struct rte_eth_dev_info dev_info {
};
struct rte_eth_conf port_conf = port_conf_default;
struct rte_eth_txconf txconf {
};
struct rte_eth_rxconf rxconf {
};
uint16_t nb_rxd = RX_RING_SIZE;
uint16_t nb_txd = TX_RING_SIZE;
if (!rte_eth_dev_is_valid_port(portid)) {
return -1;
}
int ret = rte_eth_dev_info_get(portid, &dev_info);
if (ret != 0) {
return ret;
}
port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN;
port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH;
port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM;
port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM;
port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;
/* Configure the Ethernet device. */
ret = rte_eth_dev_configure(portid, 1, 1, &port_conf);
if (ret != 0)
return ret;
ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
if (ret != 0)
return ret;
/* Allocate and set up 1 RX queue per thread . */
rxconf = dev_info.default_rxconf;
rxconf.offloads = port_conf.rxmode.offloads;
for (uint32_t i = 0; i < 1; i++) {
ret = rte_eth_rx_queue_setup(portid, options.s_rxqid, nb_rxd,
rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool);
if (ret < 0)
return ret;
}
txconf = dev_info.default_txconf;
txconf.offloads = port_conf.txmode.offloads;
/* Allocate and set up 1 TX queue per Ethernet port. */
for (uint32_t i = 0; i < 1; i++) {
ret = rte_eth_tx_queue_setup(portid, options.s_txqid, nb_txd,
rte_eth_dev_socket_id(portid), &txconf);
if (ret < 0)
return ret;
}
ret = rte_eth_dev_start(portid);
if (ret < 0)
return ret;
/* Display the port MAC address. */
struct rte_ether_addr addr {
};
ret = rte_eth_macaddr_get(portid, &addr);
if (ret != 0)
return ret;
ret = rte_eth_timesync_enable(portid);
if (ret != 0)
return ret;
/* Enable RX in promiscuous mode for the Ethernet device. */
ret = rte_eth_promiscuous_enable(portid);
if (ret != 0)
return ret;
rte_eth_add_tx_callback(
portid, options.s_rxqid, tx_add_timestamp, nullptr);
rte_eth_add_rx_callback(
portid, options.s_txqid, rx_add_timestamp, nullptr);
return 0;
}
static void
dump_options()
{
@ -753,11 +674,13 @@ dump_options()
" target qps = %d\n"
" host IP = 0x%x\n"
" pkt loss time = %u\n"
" pkt loss failure threshold = %u\n",
" pkt loss failure threshold = %u\n"
" portid = %d\n",
ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.run_time,
options.warmup_time, options.output, CPU_COUNT(&options.cpu_set),
options.ia_gen_str, options.target_qps, options.s_host_spec.ip,
options.pkt_loss_time_ms, options.pkt_loss_failure_threshold);
options.pkt_loss_time_ms, options.pkt_loss_failure_threshold,
options.portid);
for (auto slave : options.slaves) {
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
@ -794,8 +717,6 @@ usage()
int
main(int argc, char *argv[])
{
unsigned int nb_ports;
struct rte_mempool *mbuf_pool;
std::ofstream log_file;
bool has_host_spec = false;
@ -816,7 +737,7 @@ main(int argc, char *argv[])
int c;
// parse arguments
struct net_spec *ns;
while ((c = getopt(argc, argv, "vs:S:t:T:ho:A:i:q:H:L:l:")) !=
while ((c = getopt(argc, argv, "vs:S:t:T:ho:A:i:q:H:L:l:p:")) !=
-1) {
switch (c) {
case 'v':
@ -824,8 +745,8 @@ main(int argc, char *argv[])
ntr_get_level(NTR_DEP_USER1) + 1);
break;
case 's':
if (str_to_netspec(
optarg, &options.server_spec) != 0) {
if (str_to_netspec(optarg,
&options.server_spec) != 0) {
rte_exit(EXIT_FAILURE,
"invalid server net spec.\n");
}
@ -839,16 +760,16 @@ main(int argc, char *argv[])
options.slaves.push_back(ns);
options.master_mode = 1;
if (options.slaves.size() > MAX_SLAVES) {
rte_exit(
EXIT_FAILURE, "too many rats.\n");
rte_exit(EXIT_FAILURE,
"too many rats.\n");
}
break;
case 't':
options.run_time = strtol(optarg, nullptr, 10);
break;
case 'T':
options.warmup_time = strtol(
optarg, nullptr, 10);
options.warmup_time = strtol(optarg, nullptr,
10);
break;
case 'h':
usage();
@ -865,32 +786,35 @@ main(int argc, char *argv[])
sizeof(options.ia_gen_str) - 1);
break;
case 'q':
options.target_qps = strtoul(
optarg, nullptr, 10);
options.target_qps = strtoul(optarg, nullptr,
10);
break;
case 'H':
has_host_spec = true;
if (str_to_netspec(
optarg, &options.s_host_spec) != 0) {
if (str_to_netspec(optarg,
&options.s_host_spec) != 0) {
rte_exit(EXIT_FAILURE,
"invalid host net spec.\n");
}
break;
case 'L':
options.pkt_loss_failure_threshold = strtoul(
optarg, nullptr, 10);
options.pkt_loss_failure_threshold =
strtoul(optarg, nullptr, 10);
break;
case 'l':
options.pkt_loss_time_ms = strtoul(
optarg, nullptr, 10);
options.pkt_loss_time_ms = strtoul(optarg,
nullptr, 10);
if (options.pkt_loss_time_ms == 0) {
options.pkt_loss_time_ms = UINT32_MAX;
}
break;
case 'p':
options.portid = strtol(optarg, nullptr, 10);
break;
default:
usage();
rte_exit(
EXIT_FAILURE, "unknown argument: %c\n", c);
rte_exit(EXIT_FAILURE, "unknown argument: %c\n",
c);
}
}
}
@ -899,13 +823,68 @@ main(int argc, char *argv[])
rte_exit(EXIT_FAILURE, "must specify host IP\n");
}
// init nm
if (nm_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
rte_exit(EXIT_FAILURE, "nm init failed!\n");
// init libtopo
if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
0) {
rte_exit(EXIT_FAILURE, "libtopo init failed!\n");
}
// init nms
if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
rte_exit(EXIT_FAILURE, "failed to init libnms!\n");
}
if (CPU_COUNT(&options.cpu_set) != 1) {
rte_exit(EXIT_FAILURE, "must specify exactly one core\n");
}
int core_id = CPU_FFS(&options.cpu_set) - 1;
dump_options();
// configure memory and port
struct port_conf pconf;
struct device_conf dconf;
struct mem_conf mconf;
portconf_get(options.portid, &pconf);
dconf.mtu = MAX_STANDARD_MTU;
dconf.num_threads = 1;
dconf.portid = options.portid;
dconf.rss_hf = pconf.rss_hf;
dconf.rx_offloads = pconf.rxoffload;
dconf.tx_offloads = pconf.txoffload;
dconf.timesync = pconf.timesync;
dconf.rx_fn = rx_add_timestamp;
dconf.rx_user = nullptr;
dconf.rx_ring_sz = 2048;
dconf.tx_fn = tx_add_timestamp;
dconf.tx_user = nullptr;
dconf.tx_ring_sz = 2048;
mconf.cache_size = 64;
mconf.priv_size = 0;
mconf.num_elements = 4096;
mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_STANDARD_MTU;
mconf.max_pools = -1;
dpdk_init(&dconf, &mconf);
if (rte_eth_macaddr_get(options.portid,
&options.s_host_spec.mac_addr) != 0) {
rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
options.portid);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n",
options.portid, options.s_host_spec.mac_addr.addr_bytes[0],
options.s_host_spec.mac_addr.addr_bytes[1],
options.s_host_spec.mac_addr.addr_bytes[2],
options.s_host_spec.mac_addr.addr_bytes[3],
options.s_host_spec.mac_addr.addr_bytes[4],
options.s_host_spec.mac_addr.addr_bytes[5]);
// create default generator
options.s_iagen = createGenerator(options.ia_gen_str);
if (options.s_iagen == nullptr) {
@ -921,50 +900,6 @@ main(int argc, char *argv[])
options.output);
}
nb_ports = rte_eth_dev_count_avail();
if (nb_ports == 0) {
rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
}
uint16_t portid = rte_eth_find_next(0);
if (portid == RTE_MAX_ETHPORTS) {
rte_exit(EXIT_FAILURE, "cannot find an available port\n");
}
options.s_portid = portid;
if (rte_eth_macaddr_get(portid, &options.s_host_spec.mac_addr) != 0) {
rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
portid);
}
// create a mbuf memory pool on the socket
mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT,
MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
rte_eth_dev_socket_id(options.s_portid));
if (mbuf_pool == nullptr) {
rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
}
options.mbuf_pool = mbuf_pool;
if (port_init(portid, mbuf_pool) != 0) {
rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid,
options.s_host_spec.mac_addr.addr_bytes[0],
options.s_host_spec.mac_addr.addr_bytes[1],
options.s_host_spec.mac_addr.addr_bytes[2],
options.s_host_spec.mac_addr.addr_bytes[3],
options.s_host_spec.mac_addr.addr_bytes[4],
options.s_host_spec.mac_addr.addr_bytes[5]);
int core_id = CPU_FFS(&options.cpu_set);
if (core_id == 0) {
rte_exit(EXIT_FAILURE, "invalid cpu list\n");
}
core_id--;
sleep(INIT_DELAY);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"main: launching thread on core %d\n", core_id);
@ -1014,11 +949,13 @@ main(int argc, char *argv[])
}
log_file.close();
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "qps = %d, recved = %d, loss = %d, slave recved = %d, slave loss = %d\n",
qps, options.s_recved_pkts.load(), options.s_pkt_loss.load(), options.s_slave_recved.load(), options.s_slave_loss.load());
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"qps = %d, recved = %d, loss = %d, slave recved = %d, slave loss = %d\n",
qps, options.s_recved_pkts.load(), options.s_pkt_loss.load(),
options.s_slave_recved.load(), options.s_slave_loss.load());
// clean up
rte_eth_dev_stop(portid);
dpdk_cleanup(&dconf);
return 0;
}

View File

@ -1,3 +1,16 @@
#include <atomic>
#include <cassert>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <vector>
#include <unistd.h>
#include <sys/cpuset.h>
#include <sys/endian.h>
#include <topo.h>
#include <rte_common.h>
#include <rte_config.h>
#include <rte_cycles.h>
@ -7,40 +20,27 @@
#include <rte_launch.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include <unistd.h>
#include "nm.hh"
#include "ntr.h"
#include "gen.hh"
#include "net/netsup.hh"
#include "net/pkt.hh"
#include "net/util.hh"
#include "nms.h"
#include <atomic>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <vector>
#include <sys/_cpuset.h>
#include <sys/cpuset.h>
#include <sys/endian.h>
#define MBUF_MAX_COUNT (rte_lcore_count() * 4096)
constexpr static unsigned int MBUF_CACHE_SIZE = 512;
constexpr static unsigned int RX_RING_SIZE = 2048;
constexpr static unsigned int TX_RING_SIZE = 2048;
constexpr static unsigned int BURST_SIZE = 32;
constexpr static unsigned int CACHELINE_SIZE = 64;
constexpr static uint16_t THREAD_LOAD_BUFFER_SZ = 16384;
constexpr static size_t MEMPOOL_NAME_BUF_LEN = 64;
static const struct rte_mbuf_dynfield rte_mbuf_dynfield_probe_flag = {
.name = "rte_mbuf_dynfield_probe_flag",
.size = sizeof(uint32_t),
.align = __alignof__(uint32_t),
.flags = 0
};
static int PROBE_FLAG_OFFSET { 0 };
static const struct rte_eth_conf port_conf_default {
struct probe_state_t {
struct net_spec dst;
struct conn_spec cspec {
.dst = &dst
};
uint64_t last_sw_rx;
uint64_t last_sw_tx;
uint64_t last_hw_rx;
uint32_t epoch;
};
// keep track of the probe state
@ -59,63 +59,63 @@ struct thread_info {
int txqid;
int lcore_id;
int node_id;
void * cache_lines;
void * load_buffer;
};
// state machine:
constexpr static int SERVER_STATE_WAIT = 0;
constexpr static int SERVER_STATE_PROBE = 1;
struct probe_state_t {
struct net_spec dst;
struct conn_spec cspec {
.dst = &dst
};
uint32_t epoch;
uint64_t last_sw_rx;
uint64_t last_sw_tx;
uint64_t last_hw_rx;
void *cache_lines;
void *load_buffer;
};
struct options_t {
// config
int num_threads { 1 };
cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 2nd core
char mempool_name_buf[MEMPOOL_NAME_BUF_LEN];
bool jumbo_frame_enabled { false }; // setting this to true changes mbuf size and mtu
bool jumbo_frame_enabled {
false
}; // setting this to true changes mbuf size and mtu
int port_mtu { MAX_STANDARD_MTU };
int thread_cacheline_cnt = { 128 };
bool mlg_enabled { false };
uint64_t mlg_bps { 0 };
uint64_t mlg_arr_sz { 0 };
cpuset_t mlg_cset = CPUSET_T_INITIALIZER(0x2);
cpuset_t mlg_dset = CPUSET_T_INITIALIZER(0x1);
int mlg_shared_buffer { 0 };
memload_generator *mlg { nullptr };
// states
uint16_t s_portid { 0 };
struct net_spec s_host_spec {
};
std::atomic<int> s_state { SERVER_STATE_WAIT };
struct probe_state_t s_probe_info;
std::vector<struct thread_info *> s_thr_info;
uint16_t portid { 0 };
struct rte_mempool *s_mempools[MAX_NODES];
// states
struct net_spec s_host_spec { };
std::vector<struct thread_info *> s_thr_info;
int probe_state_offset { 0 };
bool s_hwtimestamp { true };
struct probe_state_t s_probe_info;
std::atomic<bool> is_probing { false };
};
struct options_t options;
static bool
mbuf_is_probe_valid(struct rte_mbuf *pkt)
{
return *RTE_MBUF_DYNFIELD(pkt, options.probe_state_offset, bool *);
}
static void
mbuf_set_probe_valid(struct rte_mbuf *pkt, bool b)
{
*RTE_MBUF_DYNFIELD(pkt, options.probe_state_offset, bool *) = b;
}
static uint16_t
rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused,
void *_ __rte_unused)
{
uint64_t now = nm_get_uptime_ns();
struct timespec ts {
};
int rc = 0;
uint64_t now = topo_uptime_ns();
struct timespec ts { };
struct pkt_hdr *pkt_data;
for (int i = 0; i < nb_pkts; i++) {
pkt_data = check_valid_packet(
pkts[i], &options.s_host_spec.mac_addr);
pkt_data = check_valid_packet(pkts[i],
&options.s_host_spec.mac_addr);
if (pkt_data == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
@ -125,41 +125,53 @@ rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
}
if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE) {
int state_wait = SERVER_STATE_WAIT;
*RTE_MBUF_DYNFIELD(
pkts[i], PROBE_FLAG_OFFSET, uint32_t *) = 0;
if (rte_eth_timesync_read_rx_timestamp(
port, &ts, pkts[i]->timesync & 0x3) == 0) {
if (options.s_state.compare_exchange_strong(
state_wait, SERVER_STATE_PROBE)) {
// mark the mbuf as probe packet being
// processed only the locore that
// receives the pkt w/ userdata !=
// nullptr processes that packet
*RTE_MBUF_DYNFIELD(pkts[i],
PROBE_FLAG_OFFSET, uint32_t *) = 1;
// tag with timestamps
options.s_probe_info.last_hw_rx =
ts.tv_nsec + ts.tv_sec * S2NS;
bool cmp = false;
mbuf_set_probe_valid(pkts[i], false);
if (options.is_probing.compare_exchange_strong(cmp,
true)) {
options.s_probe_info.last_sw_rx = now;
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"rx_add_timestamp: tagged packet %p epoch %d with sw: %lu hw:%lu.\n",
if (options.s_hwtimestamp) {
if ((rc = rte_eth_timesync_read_rx_timestamp(
port, &ts,
pkts[i]->timesync & 0x3)) ==
0) {
options.s_probe_info
.last_hw_rx = ts.tv_nsec +
ts.tv_sec * S2NS;
ntr(NTR_DEP_USER1,
NTR_LEVEL_DEBUG,
"rx_add_timestamp: tagged packet %p with sw rx: %lu hw rx:%lu.\n",
(void *)pkts[i],
options.s_probe_info.epoch, now,
options.s_probe_info.last_hw_rx);
} else
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
"rx_add_timestamp: packet %p not tagged - server is processing a probe.\n",
(void *)pkts[i]);
} else
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
"rx_add_timestamp: packet %p not tagged - hw rx timestamp not available.\n",
(void *)pkts[i]);
} else
options.s_probe_info
.last_sw_rx,
options.s_probe_info
.last_hw_rx);
mbuf_set_probe_valid(pkts[i],
true);
} else {
options.is_probing.store(false);
ntr(NTR_DEP_USER1,
NTR_LEVEL_WARNING,
"rx_add_timestamp: packet %p not tagged - failed to read hw rx timestamp: %d.\n",
(void *)pkts[i], rc);
}
} else {
mbuf_set_probe_valid(pkts[i], true);
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"rx_add_timestamp: packet %p not tagged - type %d.\n",
"rx_add_timestamp: tagged packet %p with sw rx only: %lu.\n",
(void *)pkts[i], now);
}
} else {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"rx_add_timestamp: packet %p not tagged - server is probing.\n",
(void *)pkts[i]);
}
} else {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"rx_add_timestamp: packet %p not tagged - not PROBE packet: type %d.\n",
(void *)pkts[i], rte_be_to_cpu_16(pkt_data->type));
}
}
return nb_pkts;
}
@ -168,13 +180,13 @@ static uint16_t
tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
{
uint64_t now = nm_get_uptime_ns();
uint64_t now = topo_uptime_ns();
struct pkt_hdr *pkt_data;
for (int i = 0; i < nb_pkts; i++) {
pkt_data = check_valid_packet(
pkts[i], &options.s_host_spec.mac_addr);
pkt_data = check_valid_packet(pkts[i],
&options.s_host_spec.mac_addr);
if (pkt_data == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
@ -188,14 +200,8 @@ tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
// at this time the packet is not sent to the NIC yet so
// the state must be waiting stats
// XXX: this should be an assert
if (options.s_state.load() != SERVER_STATE_PROBE ||
*RTE_MBUF_DYNFIELD(
pkts[i], PROBE_FLAG_OFFSET, uint32_t *) != 1) {
rte_exit(EXIT_FAILURE,
"packet %p sent to NIC before sw callback\n",
(void *)pkts[i]);
}
assert(options.is_probing.load() &&
mbuf_is_probe_valid(pkts[i]));
options.s_probe_info.last_sw_tx = now;
@ -225,23 +231,23 @@ locore_main(void *ti)
bool pending_probe = false;
if (rte_eth_dev_socket_id(options.s_portid) > 0 &&
rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) {
if (rte_eth_dev_socket_id(options.portid) > 0 &&
rte_eth_dev_socket_id(options.portid) != (int)rte_socket_id()) {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
"locore_main <thread %d>: WARNING, port %d is on remote NUMA node to "
"polling thread.\n\tPerformance will "
"not be optimal.\n",
tinfo->tid, options.s_portid);
tinfo->tid, options.portid);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"locore_main <thread %d>: running on locore %d with txidx %d and rxidx %d.\n",
"locore_main <thread %d>: running on locore %d with txqid %d and rxqid %d.\n",
tinfo->tid, rte_lcore_id(), tinfo->txqid, tinfo->rxqid);
while (true) {
uint16_t nb_tx = 0;
const uint16_t nb_rx = rte_eth_rx_burst(
options.s_portid, tinfo->rxqid, bufs, BURST_SIZE);
const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
tinfo->rxqid, bufs, BURST_SIZE);
struct rte_mbuf *pkt_buf;
struct pkt_hdr *tx_data;
@ -249,8 +255,8 @@ locore_main(void *ti)
// XXX: optimization: in rx_add_timestamp every packet
// is already validated once can just mark valid packet
// with a value so we can avoid this redundant check
pkt_data = check_valid_packet(
bufs[i], &options.s_host_spec.mac_addr);
pkt_data = check_valid_packet(bufs[i],
&options.s_host_spec.mac_addr);
if (pkt_data == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
@ -262,13 +268,10 @@ locore_main(void *ti)
}
NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, pkt_data,
"locore_main <thread %d>: ", tinfo->tid);
"locore_main <thread %d>: received packet ", tinfo->tid);
switch (rte_be_to_cpu_16(pkt_data->type)) {
case PKT_TYPE_PROBE: {
if (options.s_state.load() ==
SERVER_STATE_PROBE &&
*RTE_MBUF_DYNFIELD(bufs[i],
PROBE_FLAG_OFFSET, uint32_t *) == 1) {
if (mbuf_is_probe_valid(bufs[i])) {
// send back probe_resp pkt to probe for
// return latency
pending_probe = true;
@ -279,6 +282,7 @@ locore_main(void *ti)
((struct pkt_payload_epoch *)
pkt_data->payload)
->epoch);
pkt_hdr_to_netspec(pkt_data,
&options.s_probe_info.dst,
&options.s_probe_info.cspec
@ -286,12 +290,12 @@ locore_main(void *ti)
nullptr,
&options.s_probe_info.cspec
.src_port);
options.s_probe_info.cspec.src =
&options.s_host_spec;
if (alloc_pkt_hdr(
options
.s_mempools[tinfo->node_id],
if (alloc_pkt_hdr(mempool_get(
tinfo->node_id),
PKT_TYPE_PROBE_RESP,
&options.s_probe_info.cspec, 0,
&pkt_buf, &tx_data) != 0) {
@ -303,10 +307,11 @@ locore_main(void *ti)
pkt_data->payload,
sizeof(struct pkt_payload_epoch));
*RTE_MBUF_DYNFIELD(pkt_buf,
PROBE_FLAG_OFFSET, uint32_t *) = 1;
mbuf_set_probe_valid(pkt_buf, true);
// queue for burst send
NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, tx_data,
"locore_main <thread %d>: sending packet ", tinfo->tid);
tx_bufs[nb_tx++] = pkt_buf;
}
break;
@ -316,20 +321,33 @@ locore_main(void *ti)
struct net_spec src;
struct net_spec dst;
// touch the unused data to pretend that we read those dummy fields
memcpy(tinfo->load_buffer, pkt_data->payload, MIN(bufs[i]->data_len - sizeof(struct pkt_hdr), THREAD_LOAD_BUFFER_SZ));
// touch the unused data to pretend that we read
// those dummy fields
memcpy(tinfo->load_buffer, pkt_data->payload,
MIN(bufs[i]->data_len -
sizeof(struct pkt_hdr),
THREAD_LOAD_BUFFER_SZ));
// perform the load
auto pld = (struct pkt_payload_load *)pkt_data->payload;
auto pld = (struct pkt_payload_load *)
pkt_data->payload;
uint32_t which = rte_be_to_cpu_32(pld->which);
uint32_t load = rte_be_to_cpu_32(pld->load);
uint32_t start_cacheline = which % (options.thread_cacheline_cnt * options.s_thr_info.size());
uint32_t thrd = start_cacheline / options.thread_cacheline_cnt;
uint32_t start = start_cacheline % options.thread_cacheline_cnt;
uint32_t start_cacheline = which %
(options.thread_cacheline_cnt *
options.s_thr_info.size());
uint32_t thrd = start_cacheline /
options.thread_cacheline_cnt;
uint32_t start = start_cacheline %
options.thread_cacheline_cnt;
for (uint j = 0; j < load; j++) {
*(uint32_t *)tinfo->load_buffer = (start + j) % options.thread_cacheline_cnt;
*(uint32_t *)tinfo->load_buffer =
(start + j) %
options.thread_cacheline_cnt;
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main <thread %d>: LOAD @ thread %d, start %d, load %d\n", tinfo->tid, thrd, start, load);
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main <thread %d>: LOAD @ thread %d, start %d, load %d\n",
tinfo->tid, thrd, start, load);
// reply
pkt_hdr_to_netspec(pkt_data, &src,
@ -337,11 +355,10 @@ locore_main(void *ti)
cspec.dst = &src;
cspec.src = &dst;
// printf("LOAD PKT SIZE: %d\n", bufs[i]->data_len);
// we reply to load packet regardless of the
// server state
if (alloc_pkt_hdr(
options.s_mempools[tinfo->node_id],
// printf("LOAD PKT SIZE: %d\n",
// bufs[i]->data_len); we reply to load packet
// regardless of the server state
if (alloc_pkt_hdr(mempool_get(tinfo->node_id),
PKT_TYPE_LOAD_RESP, &cspec, 0, &pkt_buf,
&tx_data) != 0) {
rte_exit(EXIT_FAILURE,
@ -352,6 +369,8 @@ locore_main(void *ti)
sizeof(struct pkt_payload_load));
// queue for burst send
NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, tx_data,
"locore_main <thread %d>: sending packet ", tinfo->tid);
tx_bufs[nb_tx++] = pkt_buf;
break;
}
@ -366,173 +385,65 @@ locore_main(void *ti)
}
// send all packets
tx_burst_all(options.s_portid, tinfo->txqid, tx_bufs, nb_tx);
tx_burst_all(options.portid, tinfo->txqid, tx_bufs, nb_tx);
// we wanna check every loop not only when there are packets
if (pending_probe) {
struct timespec ts {
};
assert(options.is_probing.load());
struct timespec ts { };
struct pkt_payload_stat *stat;
if (options.s_hwtimestamp) {
if (rte_eth_timesync_read_tx_timestamp(
options.s_portid, &ts) == 0) {
options.portid, &ts) == 0) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main <thread %d>: obtained hw tx timestamp %lu.\n",
tinfo->tid,
(ts.tv_sec * S2NS + ts.tv_nsec));
} else {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main <thread %d>: failed to obtain hw tx timestamp.\n",
tinfo->tid);
pending_probe = false;
goto end_stat;
}
}
// now we have everything we need
if (alloc_pkt_hdr(
options.s_mempools[tinfo->node_id],
PKT_TYPE_STAT,
&options.s_probe_info.cspec, 0,
if (alloc_pkt_hdr(mempool_get(tinfo->node_id),
PKT_TYPE_STAT, &options.s_probe_info.cspec, 0,
&pkt_buf, &tx_data) != 0) {
rte_exit(EXIT_FAILURE,
"failed to alloc pkt_buf\n");
}
// populate stats
stat = (struct pkt_payload_stat *)
tx_data->payload;
stat = (struct pkt_payload_stat *)tx_data->payload;
stat->epoch = rte_cpu_to_be_32(
options.s_probe_info.epoch);
if (options.s_hwtimestamp) {
stat->hw_rx = rte_cpu_to_be_64(
options.s_probe_info.last_hw_rx);
stat->hw_tx = rte_cpu_to_be_64(
ts.tv_nsec + ts.tv_sec * S2NS);
} else {
stat->hw_rx = 0;
stat->hw_tx = 0;
}
stat->sw_rx = rte_cpu_to_be_64(
options.s_probe_info.last_sw_rx);
stat->sw_tx = rte_cpu_to_be_64(
options.s_probe_info.last_sw_tx);
// send the packet
tx_burst_all(options.s_portid, tinfo->txqid, &pkt_buf, 1);
tx_burst_all(options.portid, tinfo->txqid, &pkt_buf, 1);
end_stat:
// release flux
pending_probe = false;
int expected = SERVER_STATE_PROBE;
if (!options.s_state.compare_exchange_strong(
expected, SERVER_STATE_WAIT)) {
rte_exit(EXIT_FAILURE,
"s_state changed unexpectedly!");
options.is_probing.store(false);
}
}
}
}
}
static int
port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
{
struct rte_eth_dev_info dev_info {
};
struct rte_eth_conf port_conf = port_conf_default;
struct rte_eth_txconf txconf {
};
struct rte_eth_rxconf rxconf {
};
uint16_t nb_rxd = RX_RING_SIZE;
uint16_t nb_txd = TX_RING_SIZE;
if (!rte_eth_dev_is_valid_port(portid)) {
return -1;
}
int ret = rte_eth_dev_info_get(portid, &dev_info);
if (ret != 0) {
return ret;
}
port_conf.rxmode.max_rx_pkt_len = mtu_to_pkt_size(options.port_mtu);
port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_NONFRAG_IPV4_UDP |
ETH_RSS_L2_PAYLOAD | ETH_RSS_NONFRAG_IPV4_TCP;
port_conf.rx_adv_conf.rss_conf.rss_key = nullptr;
port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH;
port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM;
port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM;
port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;
if (options.jumbo_frame_enabled) {
port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME;
}
/* Configure the Ethernet device. */
ret = rte_eth_dev_configure(
portid, options.num_threads, options.num_threads, &port_conf);
if (ret != 0)
return ret;
ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
if (ret != 0)
return ret;
/* Allocate and set up 1 RX queue per thread per Ethernet port. */
rxconf = dev_info.default_rxconf;
if (options.jumbo_frame_enabled) {
rxconf.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME;
}
for (int i = 0; i < options.num_threads; i++) {
ret = rte_eth_rx_queue_setup(portid, i, nb_rxd,
rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool);
if (ret < 0)
return ret;
options.s_thr_info.at(i)->rxqid = i;
}
txconf = dev_info.default_txconf;
txconf.offloads = port_conf.txmode.offloads;
/* Allocate and set up 1 TX queue per thread per Ethernet port. */
for (int i = 0; i < options.num_threads; i++) {
ret = rte_eth_tx_queue_setup(
portid, i, nb_txd, rte_eth_dev_socket_id(portid), &txconf);
if (ret < 0)
return ret;
options.s_thr_info.at(i)->txqid = i;
}
// set mtu
ret = rte_eth_dev_set_mtu(portid, options.port_mtu);
if (ret != 0)
return ret;
ret = rte_eth_dev_start(portid);
if (ret < 0)
return ret;
/* Display the port MAC address. */
struct rte_ether_addr addr {
};
ret = rte_eth_macaddr_get(portid, &addr);
if (ret != 0)
return ret;
ret = rte_eth_timesync_enable(portid);
if (ret != 0)
return ret;
/* Enable RX in promiscuous mode for the Ethernet device. */
ret = rte_eth_promiscuous_enable(portid);
if (ret != 0)
return ret;
for (int i = 0; i < options.num_threads; i++) {
if (rte_eth_add_tx_callback(portid,
options.s_thr_info.at(i)->txqid, tx_add_timestamp,
nullptr) == nullptr ||
rte_eth_add_rx_callback(portid,
options.s_thr_info.at(i)->rxqid, rx_add_timestamp,
nullptr) == nullptr) {
return -1;
}
}
// sync_port_clock(portid);
return 0;
}
static void
@ -542,13 +453,15 @@ usage()
"Usage:\n"
" -v(vv): verbose mode\n"
" -h: seek help\n"
" -A: cpu mask for worker threads\n"
" -A: cpu list for worker threads\n"
" -m: enable memory load generator(MLG)\n"
" -b: MLG bytes per second\n"
" -b: MLG trunk size\n"
" -x: MLG thread affinity mask\n"
" -X: MLG target domain affinity mask\n"
" -S: MLG shared buffer\n"
" -H: host spec\n"
" -J: enable jumbo frames\n");
" -J: enable jumbo frames\n"
" -p: port id\n");
fflush(stdout);
}
@ -560,19 +473,22 @@ dump_options()
" verbosity: +%d\n"
" thread count: %d\n"
" ip: 0x%x\n"
" MLG: %s [bps: %ld, thread cnt: %d, domain: %ld]\n"
" jumbo frame: %d\n",
ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.num_threads,
options.s_host_spec.ip, options.mlg_enabled ? "on" : "off",
options.mlg_bps, CPU_COUNT(&options.mlg_cset), CPU_FFS(&options.mlg_dset) - 1, options.jumbo_frame_enabled);
" MLG: %s [arr_sz: %ld, thread cnt: %d, domain: %ld]\n"
" jumbo frame: %d\n"
" port id: %d\n",
ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING,
options.num_threads, options.s_host_spec.ip,
options.mlg_enabled ? "on" : "off", options.mlg_arr_sz,
CPU_COUNT(&options.mlg_cset), CPU_FFS(&options.mlg_dset) - 1,
options.jumbo_frame_enabled, options.portid);
}
int
main(int argc, char *argv[])
{
unsigned int nb_ports;
struct rte_mempool *mbuf_pool;
bool has_host_spec { false };
struct mem_conf mconf;
struct device_conf dconf;
ntr_init();
@ -590,7 +506,7 @@ main(int argc, char *argv[])
{
int c;
// parse arguments
while ((c = getopt(argc, argv, "hvA:H:mb:X:x:J")) != -1) {
while ((c = getopt(argc, argv, "hvA:H:mb:X:x:JSp:")) != -1) {
switch (c) {
case 'v':
ntr_set_level(NTR_DEP_USER1,
@ -601,15 +517,16 @@ main(int argc, char *argv[])
rte_exit(EXIT_SUCCESS, "\n");
case 'A':
cpulist_to_cpuset(optarg, &options.cpu_set);
options.num_threads = CPU_COUNT(&options.cpu_set);
options.num_threads = CPU_COUNT(
&options.cpu_set);
if (options.num_threads == 0) {
rte_exit(EXIT_FAILURE,
"must run at least one thread\n");
}
break;
case 'H':
if (str_to_netspec(
optarg, &options.s_host_spec) != 0) {
if (str_to_netspec(optarg,
&options.s_host_spec) != 0) {
rte_exit(EXIT_FAILURE,
"invalid host spec\n");
}
@ -619,7 +536,8 @@ main(int argc, char *argv[])
options.mlg_enabled = true;
break;
case 'b':
options.mlg_bps = strtoull(optarg, nullptr, 10);
options.mlg_arr_sz = strtoull(optarg, nullptr,
10);
break;
case 'X':
cpulist_to_cpuset(optarg, &options.mlg_dset);
@ -631,10 +549,16 @@ main(int argc, char *argv[])
options.jumbo_frame_enabled = true;
options.port_mtu = MAX_JUMBO_MTU;
break;
case 'S':
options.mlg_shared_buffer = 1;
break;
case 'p':
options.portid = atoi(optarg);
break;
default:
usage();
rte_exit(
EXIT_SUCCESS, "unknown argument: %c", c);
rte_exit(EXIT_SUCCESS, "unknown argument: %c",
c);
}
}
}
@ -643,109 +567,98 @@ main(int argc, char *argv[])
rte_exit(EXIT_FAILURE, "Must specify host spec\n");
}
// init nm
if (nm_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
rte_exit(EXIT_FAILURE, "nm init failed!\n");
// init libtopo
if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
0) {
rte_exit(EXIT_FAILURE, "libtopo init failed!\n");
}
// init libnms
if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
rte_exit(EXIT_FAILURE, "libnms init failed!\n");
}
dump_options();
// init mlg
// if (options.mlg_enabled) {
// // bool success = false;
// // options.mlg = new memload_generator(options.mlg_cmask,
// // options.mlg_dmask, options.mlg_bps, &success);
// // if (!success) {
// // rte_exit(EXIT_FAILURE, "failed to init mlg\n");
// // }
// }
// register dynamic field
PROBE_FLAG_OFFSET = rte_mbuf_dynfield_register(
struct rte_mbuf_dynfield rte_mbuf_dynfield_probe_flag = {
.name = "rte_mbuf_dynfield_probe_valid",
.size = sizeof(bool),
.align = __alignof__(uint32_t),
.flags = 0
};
options.probe_state_offset = rte_mbuf_dynfield_register(
&rte_mbuf_dynfield_probe_flag);
if (PROBE_FLAG_OFFSET < 0) {
rte_exit(EXIT_FAILURE, "failed to register dynamic field\n");
}
nb_ports = rte_eth_dev_count_avail();
if (nb_ports == 0) {
rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
}
uint16_t portid = rte_eth_find_next(0);
if (portid == RTE_MAX_ETHPORTS) {
rte_exit(EXIT_FAILURE, "cannot find an available port\n");
}
options.s_portid = portid;
if (rte_eth_macaddr_get(portid, &options.s_host_spec.mac_addr) != 0) {
rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
portid);
}
if (rte_socket_count() > (int)MAX_NODES) {
rte_exit(EXIT_FAILURE, "Too many numa nodes\n");
}
for (int i = 0; i < (int)rte_socket_count(); i++) {
uint32_t nodeid = i;
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"main: creating mempool for node %d\n", nodeid);
// create one mbuf pool per socket
snprintf(options.mempool_name_buf, MEMPOOL_NAME_BUF_LEN,
"khat_mempool_%d", nodeid);
mbuf_pool = rte_pktmbuf_pool_create(options.mempool_name_buf,
MBUF_MAX_COUNT, MBUF_CACHE_SIZE, 0,
options.jumbo_frame_enabled ?
RTE_MBUF_DEFAULT_BUF_SIZE + (MAX_JUMBO_MTU - MAX_STANDARD_MTU) :
RTE_MBUF_DEFAULT_BUF_SIZE, nodeid);
if (mbuf_pool == nullptr) {
rte_exit(EXIT_FAILURE, "cannot create mbuf pool: %d\n",
if (options.probe_state_offset == -1) {
rte_exit(EXIT_FAILURE, "failed to register dynamic field: %d\n",
rte_errno);
}
options.s_mempools[nodeid] = mbuf_pool;
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"main: created mempool for node %d\n", nodeid);
// configure memory and port
struct port_conf pconf;
portconf_get(options.portid, &pconf);
if (!pconf.timesync) {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
"main: timesync disabled. hw timestamp unavailable.\n ");
}
dconf.mtu = options.port_mtu;
dconf.num_threads = options.num_threads;
dconf.portid = options.portid;
dconf.rss_hf = pconf.rss_hf;
dconf.rx_offloads = pconf.rxoffload;
dconf.tx_offloads = pconf.txoffload;
dconf.timesync = pconf.timesync;
dconf.rx_fn = rx_add_timestamp;
dconf.rx_user = nullptr;
dconf.rx_ring_sz = 2048;
dconf.tx_fn = tx_add_timestamp;
dconf.tx_user = nullptr;
dconf.tx_ring_sz = 2048;
mconf.cache_size = 512;
mconf.priv_size = 0;
mconf.num_elements = (dconf.rx_ring_sz + dconf.tx_ring_sz) *
rte_lcore_count() / rte_socket_count();
mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_JUMBO_MTU -
MAX_STANDARD_MTU;
mconf.max_pools = -1;
dpdk_init(&dconf, &mconf);
if (rte_eth_macaddr_get(options.portid,
&options.s_host_spec.mac_addr) != 0) {
rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
options.portid);
}
// init threads
uint32_t cpu_idx = CPU_FFS(&options.cpu_set);
uint32_t tid = 0;
while(cpu_idx != 0) {
while (cpu_idx != 0) {
uint32_t lcore_id = cpu_idx - 1;
uint32_t node_id = rte_lcore_to_socket_id(lcore_id);
auto *tinfo = (struct thread_info *)nm_malloc(node_id, sizeof(struct thread_info));
tinfo->cache_lines = nm_malloc(node_id, CACHELINE_SIZE * options.thread_cacheline_cnt);
tinfo->load_buffer = nm_malloc(node_id, THREAD_LOAD_BUFFER_SZ);
auto *tinfo = (struct thread_info *)nms_malloc(node_id,
sizeof(struct thread_info));
tinfo->cache_lines = nms_malloc(node_id,
CACHELINE_SIZE * options.thread_cacheline_cnt);
tinfo->load_buffer = nms_malloc(node_id,
THREAD_LOAD_BUFFER_SZ);
tinfo->tid = tid;
tinfo->lcore_id = lcore_id;
tinfo->node_id = node_id;
tinfo->rxqid = tid;
tinfo->txqid = tid;
options.s_thr_info.push_back(tinfo);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"main: thread %d assigned to cpu %d, node %d\n", tinfo->tid,
tinfo->lcore_id, nm_get_node_from_core(lcore_id));
tinfo->lcore_id, topo_core_to_numa(lcore_id));
tid++;
CPU_CLR(cpu_idx - 1, &options.cpu_set);
cpu_idx = CPU_FFS(&options.cpu_set);
}
if (port_init(portid, mbuf_pool) != 0) {
rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"Configured port %d on socket %d with mac addr %x:%x:%x:%x:%x:%x\n",
portid, rte_eth_dev_socket_id(portid),
options.s_host_spec.mac_addr.addr_bytes[0],
options.s_host_spec.mac_addr.addr_bytes[1],
options.s_host_spec.mac_addr.addr_bytes[2],
options.s_host_spec.mac_addr.addr_bytes[3],
options.s_host_spec.mac_addr.addr_bytes[4],
options.s_host_spec.mac_addr.addr_bytes[5]);
sleep(INIT_DELAY);
for (int i = 0; i < options.num_threads; i++) {
@ -762,19 +675,35 @@ main(int argc, char *argv[])
}
}
if (options.mlg_enabled)
options.mlg->start();
// init mlg
if (options.mlg_enabled) {
bool success = false;
memload_generator::memload_generator_options opts;
opts.chunk_size = options.mlg_arr_sz;
opts.iteration =
memload_generator::memload_generator_options::ITERATION_MAX;
opts.shared_buffer = options.mlg_shared_buffer;
opts.verbose = (ntr_get_level(NTR_DEP_USER1) -
NTR_LEVEL_WARNING) != 0;
options.mlg = new memload_generator(&options.mlg_cset,
&options.mlg_dset, &opts, &success);
if (!success) {
rte_exit(EXIT_FAILURE, "failed to init mlg\n");
}
}
while (true) {
usleep(S2US);
if (options.mlg_enabled) {
uint64_t bps = options.mlg->get_bps();
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"main: MLG bps = %ld ~= %ldM\n", bps, bps / 1024 / 1024);
"main: MLG bps = %ld ~= %ldM\n", bps,
bps / 1024 / 1024);
}
}
if (options.mlg_enabled)
options.mlg->stop();
// shouldn't get here
// clean up
for (int i = 0; i < options.num_threads; i++) {
struct thread_info *tinfo = options.s_thr_info.at(i);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
@ -785,10 +714,9 @@ main(int argc, char *argv[])
}
}
// shouldn't get here
// clean up
rte_eth_dev_stop(portid);
delete options.mlg;
dpdk_cleanup(&dconf);
return 0;
}

207
net/libnetsup/dpdk.cc Normal file
View File

@ -0,0 +1,207 @@
#include "net/netsup.hh"
#include <cstdlib>
#include "rte_build_config.h"
#include "rte_common.h"
#include "rte_config.h"
#include "rte_ether.h"
#include "rte_lcore.h"
#include "rte_mempool.h"
#include "rte_mbuf.h"
#include "rte_errno.h"
#include "rte_ethdev.h"
#include "ntr.h"
static struct rte_mempool *g_mempools[MAX_NUMA_NODES] = {nullptr};
static unsigned int g_mempool_sz = 0;
static void
mempool_init(struct mem_conf *mconf)
{
struct rte_mempool * mbuf_pool;
char mempool_name[64];
for (int i = 0; i < (int)rte_socket_count(); i++) {
uint32_t nodeid = i;
// ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
// "mempool_init: creating mempool for node %d\n", nodeid);
// create one mbuf pool per socket
snprintf(mempool_name, sizeof(mempool_name), "net_mempool_%d", nodeid);
mbuf_pool = rte_pktmbuf_pool_create(mempool_name, mconf->num_elements,
mconf->cache_size, mconf->priv_size,
mconf->data_room_size, nodeid);
if (mbuf_pool == nullptr) {
rte_exit(EXIT_FAILURE, "cannot create mbuf pool: %d\n", rte_errno);
}
g_mempools[nodeid] = mbuf_pool;
g_mempool_sz++;
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "mempool_init: created mempool for node %d\n", nodeid);
}
}
struct rte_mempool *
mempool_get(int nodeid)
{
if ((unsigned int)nodeid < g_mempool_sz) {
return g_mempools[nodeid];
}
return nullptr;
}
static void
port_init(struct device_conf *dconf)
{
struct rte_ether_addr addr;
struct rte_eth_dev_info dev_info {
};
struct rte_eth_conf port_conf;
struct rte_eth_txconf txconf {
};
struct rte_eth_rxconf rxconf {
};
int ret;
if (rte_eth_dev_count_avail() == 0) {
rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
}
if (!rte_eth_dev_is_valid_port(dconf->portid)) {
rte_exit(EXIT_FAILURE, "cannot find port %d\n", dconf->portid);
}
if ((ret = rte_eth_macaddr_get(dconf->portid, &addr)) != 0) {
rte_exit(EXIT_FAILURE, "cannot get mac address of port: %d\n", ret);
}
ret = rte_eth_dev_info_get(dconf->portid, &dev_info);
if (ret != 0) {
rte_exit(EXIT_FAILURE, "failed to get dev info: %d", ret);
}
memset(&port_conf, 0, sizeof(struct rte_eth_conf));
port_conf.rxmode.mtu = dconf->mtu;
port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_RSS;
port_conf.rx_adv_conf.rss_conf.rss_key = nullptr;
port_conf.rx_adv_conf.rss_conf.rss_hf = dconf->rss_hf;
port_conf.rxmode.offloads = dconf->rx_offloads;
port_conf.txmode.offloads = dconf->tx_offloads;
/* Configure the Ethernet device. */
ret = rte_eth_dev_configure(
dconf->portid, dconf->num_threads, dconf->num_threads, &port_conf);
if (ret != 0)
rte_exit(EXIT_FAILURE, "failed to configure port: %d\n", ret);
ret = rte_eth_dev_adjust_nb_rx_tx_desc(dconf->portid, &dconf->rx_ring_sz, &dconf->tx_ring_sz);
if (ret != 0)
rte_exit(EXIT_FAILURE, "failed to set rx tx queue size: %d\n", ret);
/* Allocate and set up 1 RX queue per thread per Ethernet port. */
rxconf = dev_info.default_rxconf;
rxconf.offloads = port_conf.rxmode.offloads;
for (int i = 0; i < dconf->num_threads; i++) {
ret = rte_eth_rx_queue_setup(dconf->portid, i, dconf->rx_ring_sz,
rte_eth_dev_socket_id(dconf->portid), &rxconf,
mempool_get(rte_eth_dev_socket_id(dconf->portid)));
if (ret < 0)
rte_exit(EXIT_FAILURE, "failed to setup rx queue %d: %d\n", i, ret);
}
/* Allocate and set up 1 TX queue per thread per Ethernet port. */
txconf = dev_info.default_txconf;
txconf.offloads = port_conf.txmode.offloads;
for (int i = 0; i < dconf->num_threads; i++) {
ret = rte_eth_tx_queue_setup(
dconf->portid, i, dconf->tx_ring_sz, rte_eth_dev_socket_id(dconf->portid),
&txconf);
if (ret < 0)
rte_exit(EXIT_FAILURE, "failed to setup tx queue %d: %d", i, ret);
}
// set mtu
ret = rte_eth_dev_set_mtu(dconf->portid, dconf->mtu);
if (ret != 0)
rte_exit(EXIT_FAILURE, "failed to set mtu: %d\n", ret);
ret = rte_eth_dev_start(dconf->portid);
if (ret < 0)
rte_exit(EXIT_FAILURE, "failed to start port: %d\n", ret);
if (dconf->timesync) {
ret = rte_eth_timesync_enable(dconf->portid);
if (ret != 0)
rte_exit(EXIT_FAILURE, "failed to enable timesync: %d\n", ret);
}
/* Enable RX in promiscuous mode for the Ethernet device. */
ret = rte_eth_promiscuous_enable(dconf->portid);
if (ret != 0)
rte_exit(EXIT_FAILURE, "failed to enable promiscuous mode: %d\n", ret);
for (int i = 0; i < dconf->num_threads; i++) {
if (dconf->tx_fn != nullptr) {
if (rte_eth_add_tx_callback(dconf->portid,
i, dconf->tx_fn,
dconf->tx_user) == nullptr) {
rte_exit(EXIT_FAILURE, "failed to attach callback to tx queue %d\n", i);
}
}
if (dconf->rx_fn != nullptr) {
if (rte_eth_add_rx_callback(dconf->portid,
i, dconf->rx_fn,
dconf->rx_user) == nullptr) {
rte_exit(EXIT_FAILURE, "failed to attach callback to rx queue %d\n", i);
}
}
}
// sync_port_clock(portid);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"port_init: configured port %d on socket %d with mac addr %x:%x:%x:%x:%x:%x\n",
dconf->portid, rte_eth_dev_socket_id(dconf->portid),
addr.addr_bytes[0],
addr.addr_bytes[1],
addr.addr_bytes[2],
addr.addr_bytes[3],
addr.addr_bytes[4],
addr.addr_bytes[5]);
}
void
dpdk_init(struct device_conf *dconf, struct mem_conf *mconf)
{
if (rte_socket_count() > (int)MAX_NUMA_NODES) {
rte_exit(EXIT_FAILURE, "too many numa nodes\n");
}
// ensure 1-1 mapping
for (int i = 0; i < (int)rte_socket_count(); i++) {
if (rte_socket_id_by_idx(i) != i) {
rte_exit(EXIT_FAILURE, "socket %d has id %d instead.\n", i, rte_socket_id_by_idx(i));
}
}
mempool_init(mconf);
port_init(dconf);
}
void
dpdk_cleanup(struct device_conf * dconf)
{
rte_eth_dev_stop(dconf->portid);
rte_eth_dev_close(dconf->portid);
for (int i = 0; i < (int)rte_socket_count(); i++) {
rte_mempool_free(g_mempools[i]);
}
}

59
net/libnetsup/portconf.cc Normal file
View File

@ -0,0 +1,59 @@
#include "rte_ethdev.h"
#include "net/netsup.hh"
#include <cstdlib>
static struct port_conf port_confs[] = {
{
.driver_name = "net_cxgbe",
.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM,
.txoffload = RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
.rss_hf = RTE_ETH_RSS_UDP | RTE_ETH_RSS_FRAG_IPV4,
.timesync = false
},
{
.driver_name = "net_i40e",
.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM,
.txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
.rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD,
.timesync = true
},
{
.driver_name = "net_ice",
.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM | RTE_ETH_RX_OFFLOAD_TIMESTAMP,
.txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
.rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD,
.timesync = true
}
};
static struct port_conf default_conf = {
.driver_name = "default",
.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM | RTE_ETH_RX_OFFLOAD_TIMESTAMP,
.txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
.rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD,
.timesync = true
};
static const int port_size = sizeof(port_confs) / sizeof(port_confs[0]);
int
portconf_get(int portid, struct port_conf * out)
{
struct rte_eth_dev_info dev_info {};
if (rte_eth_dev_info_get(portid, &dev_info) != 0) {
rte_exit(EXIT_FAILURE, "failed to obtain device info for port %d\n", portid);
}
for(int i = 0; i < port_size; i++) {
struct port_conf * conf = &port_confs[i];
if (strcmp(conf->driver_name, dev_info.driver_name) == 0) {
memcpy(out, conf, sizeof(struct port_conf));
return 0;
}
}
fprintf(stdout, "portconf_get: unable to find matching conf for port %d:%s, returning default conf.\n", portid, dev_info.driver_name);
memcpy(out, &default_conf, sizeof(struct port_conf));
return -1;
}

View File

@ -1,4 +1,14 @@
#include <atomic>
#include <cstddef>
#include <list>
#include <map>
#include <mutex>
#include <random>
#include <vector>
#include <sys/endian.h>
#include <topo.h>
#include <rte_byteorder.h>
#include <rte_common.h>
#include <rte_config.h>
@ -10,28 +20,15 @@
#include <rte_mbuf.h>
#include <unistd.h>
#include "gen.hh"
#include "nm.hh"
#include "ntr.h"
#include "gen.hh"
#include "net/netsup.hh"
#include "net/pkt.hh"
#include "net/util.hh"
#include "nms.h"
#include <atomic>
#include <list>
#include <map>
#include <mutex>
#include <random>
#include <vector>
#define MBUF_MAX_COUNT (rte_lcore_count() * 4096)
constexpr static unsigned int MBUF_CACHE_SIZE = 512;
constexpr static unsigned int RX_RING_SIZE = 2048;
constexpr static unsigned int TX_RING_SIZE = 2048;
constexpr static unsigned int BURST_SIZE = 32;
static const struct rte_eth_conf port_conf_default {
};
static unsigned int
epoch_mk(unsigned int id, unsigned int epoch)
{
@ -60,6 +57,7 @@ struct thread_info {
unsigned int lcore_id { 0 };
unsigned int rxqid { 0 };
unsigned int txqid { 0 };
int socket_id;
// this field is read by the stat collecting thread
std::atomic<int> recved_pkts { 0 };
std::atomic<int> lost_pkts { 0 };
@ -74,9 +72,12 @@ struct thread_info {
mtx; // this lock protects data shared between worker threads, i.e.:
std::list<struct epoch_info *> recved_epochs;
thread_info() : which_rd(), which_rng(which_rd()), which_dice(std::uniform_int_distribution<uint32_t>(0, UINT32_MAX))
thread_info()
: which_rd()
, which_rng(which_rd())
, which_dice(std::uniform_int_distribution<uint32_t>(0, UINT32_MAX))
{
which_rng.seed(nm_get_uptime_ns());
which_rng.seed(topo_uptime_ns());
}
};
@ -94,26 +95,22 @@ struct options_t {
char ld_gen[256] { "fixed:0" };
uint32_t target_qps { 0 };
uint32_t depth { 1 };
struct net_spec server_spec {
};
struct net_spec server_spec { };
cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 1 thread @ core 2
uint32_t pkt_loss_delay_ms { UINT32_MAX };
bool jumbo_frame_enabled { false };
int pkt_pad_sz { 0 };
int port_mtu { MAX_STANDARD_MTU };
int portid { 0 };
// states
unsigned int s_num_threads { 1 }; // 1 thread
struct rte_mempool *mbuf_pool { nullptr };
struct net_spec s_host_spec {
};
struct net_spec s_master_spec {
};
struct net_spec s_host_spec { };
struct net_spec s_master_spec { };
struct conn_spec s_master_cspec {
.src = &s_host_spec, .src_port = DEFAULT_RAT_PORT,
.dst = &s_master_spec, .dst_port = DEFAULT_RAT_PORT,
};
uint16_t s_portid { 0 };
std::vector<struct thread_info *> s_thr_info;
std::atomic<int> s_state { STATE_RUNNING }; // default non master mode
@ -124,8 +121,8 @@ struct options_t {
static struct options_t options;
static inline void
calc_stats(
uint64_t now, uint32_t *qps, uint32_t *recved_pkt, uint32_t *total_loss)
calc_stats(uint64_t now, uint32_t *qps, uint32_t *recved_pkt,
uint32_t *total_loss)
{
uint32_t recv = 0;
uint32_t loss = 0;
@ -159,8 +156,8 @@ proto_loop(struct thread_info *tinfo)
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"proto_loop <thread %d>: waiting for SYNC from cat\n", tinfo->id);
while (options.s_state.load() == STATE_SYNC) {
const uint16_t nb_rx = rte_eth_rx_burst(
options.s_portid, tinfo->rxqid, rx_bufs, BURST_SIZE);
const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
tinfo->rxqid, rx_bufs, BURST_SIZE);
if (nb_rx > 0) {
for (int i = 0; i < nb_rx; i++) {
struct pkt_hdr *each = check_valid_packet(
@ -195,13 +192,13 @@ proto_loop(struct thread_info *tinfo)
nullptr);
if (alloc_pkt_hdr(
options
.mbuf_pool,
mempool_get(
tinfo
->socket_id),
PKT_TYPE_SYNC_ACK,
&options
.s_master_cspec,
0,
&tx_buf,
0, &tx_buf,
&pkt_data) !=
0) {
rte_exit(
@ -209,7 +206,10 @@ proto_loop(struct thread_info *tinfo)
"failed to alloc pkt hdr\n");
}
tx_burst_all(options.s_portid, tinfo->txqid, &tx_buf, 1);
tx_burst_all(
options.portid,
tinfo->txqid,
&tx_buf, 1);
expected =
STATE_SYNC_ACK;
@ -240,6 +240,7 @@ proto_loop(struct thread_info *tinfo)
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"proto_loop <thread %d>: ignoring invalid packet %p.\n",
tinfo->id, (void *)rx_bufs[i]);
//dump_pkt(rx_bufs[i]);
}
rte_pktmbuf_free(rx_bufs[i]);
@ -268,16 +269,16 @@ pkt_loop(struct thread_info *tinfo)
srv_cspec.src = &options.s_host_spec;
srv_cspec.dst = &options.server_spec;
next_ts = nm_get_uptime_ns();
next_ts = topo_uptime_ns();
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "pkt_loop <thread %d>: entering\n",
tinfo->id);
while (options.s_state.load() == STATE_RUNNING) {
uint64_t now = nm_get_uptime_ns();
uint64_t now = topo_uptime_ns();
// always pop incoming packets
const uint16_t nb_rx = rte_eth_rx_burst(
options.s_portid, tinfo->rxqid, rx_bufs, BURST_SIZE);
const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
tinfo->rxqid, rx_bufs, BURST_SIZE);
if (nb_rx > 0) {
for (int i = 0; i < nb_rx; i++) {
@ -309,7 +310,8 @@ pkt_loop(struct thread_info *tinfo)
pld_epoch->epoch);
id = epoch_get_id(epoch);
// printf("Load resp size : %d\n", rx_bufs[i]->data_len);
// printf("Load resp size : %d\n",
// rx_bufs[i]->data_len);
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"pkt_loop <thread %d>: packet %p epoch 0x%x id %d.\n",
@ -337,7 +339,7 @@ pkt_loop(struct thread_info *tinfo)
break;
case PKT_TYPE_FIN:
if (rte_is_same_ether_addr(
&each->eth_hdr.s_addr,
&each->eth_hdr.src_addr,
&options.s_master_spec
.mac_addr)) {
ntr(NTR_DEP_USER1,
@ -364,11 +366,11 @@ pkt_loop(struct thread_info *tinfo)
struct pkt_hdr *pkt_hdr;
if (alloc_pkt_hdr(
options.mbuf_pool,
mempool_get(
tinfo->socket_id),
PKT_TYPE_FIN_ACK,
&options.s_master_cspec,
0,
&tx_bufs[0],
0, &tx_bufs[0],
&pkt_hdr) != 0) {
rte_exit(EXIT_FAILURE,
"failed to allocate pkt hdr\n");
@ -386,7 +388,9 @@ pkt_loop(struct thread_info *tinfo)
rte_cpu_to_be_32(
total_loss);
tx_burst_all(options.s_portid, tinfo->txqid, &tx_bufs[0], 1);
tx_burst_all(options.portid,
tinfo->txqid, &tx_bufs[0],
1);
options.s_state.store(
STATE_FIN);
@ -487,9 +491,9 @@ pkt_loop(struct thread_info *tinfo)
// change dst port for every packet for RSS
srv_cspec.dst_port = dst_port_gen.next();
srv_cspec.src_port = src_port_gen.next();
if (alloc_pkt_hdr(options.mbuf_pool, PKT_TYPE_LOAD,
&srv_cspec, options.pkt_pad_sz, &tx_bufs[total_send],
&pkt_data) != 0) {
if (alloc_pkt_hdr(mempool_get(tinfo->socket_id),
PKT_TYPE_LOAD, &srv_cspec, options.pkt_pad_sz,
&tx_bufs[total_send], &pkt_data) != 0) {
rte_exit(EXIT_FAILURE,
"failed to allocate pkt hdr\n");
}
@ -497,7 +501,8 @@ pkt_loop(struct thread_info *tinfo)
pld_load = (struct pkt_payload_load *)pkt_data->payload;
pld_load->load = rte_cpu_to_be_32(
tinfo->load_gen->generate());
pld_load->which = rte_cpu_to_be_32(tinfo->which_dice(tinfo->which_rng));
pld_load->which = rte_cpu_to_be_32(
tinfo->which_dice(tinfo->which_rng));
unsigned int epoch = epoch_mk(tinfo->id, cur_epoch);
pld_load->epoch = rte_cpu_to_be_32(epoch);
cur_epoch++;
@ -514,26 +519,16 @@ pkt_loop(struct thread_info *tinfo)
total_send++;
}
tx_burst_all(options.s_portid, tinfo->txqid, tx_bufs, total_send);
// if (total_send > 0) {
// const uint16_t nb_tx = rte_eth_tx_burst(
// options.s_portid, tinfo->txqid, tx_bufs,
// total_send);
// if (nb_tx != total_send) {
// rte_exit(
// EXIT_FAILURE, "failed to send packet\n");
// }
// }
tx_burst_all(options.portid, tinfo->txqid, tx_bufs, total_send);
// check rage quit only when we have sent a packet
if (last_recv_ts == 0) {
last_recv_ts = nm_get_uptime_ns();
last_recv_ts = topo_uptime_ns();
}
if (nm_get_uptime_ns() - last_recv_ts >
options.rage_quit_time * MS2NS) {
if (topo_uptime_ns() >
options.rage_quit_time * MS2NS + last_recv_ts) {
rte_exit(EXIT_FAILURE,
"rat: thread %d waiting too long for resp. I QUIT!!\n",
"rat: thread %d waiting too long for resp. I F QUIT!\n",
tinfo->id);
}
}
@ -554,16 +549,16 @@ locore_main(void *tif)
uint32_t core_id = rte_lcore_id();
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"locore_main <thread %d>: running on core %d...\n", tinfo->id,
core_id);
"locore_main <thread %d>: running on core %d rxqid %d txqid %d...\n", tinfo->id,
core_id, tinfo->rxqid, tinfo->txqid);
if (rte_eth_dev_socket_id(options.s_portid) > 0 &&
rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) {
if (rte_eth_dev_socket_id(options.portid) > 0 &&
rte_eth_dev_socket_id(options.portid) != (int)rte_socket_id()) {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
"locore_main <thread %d>: WARNING, port %d is on remote NUMA node to "
"polling thread.\n\tPerformance will "
"not be optimal.\n",
tinfo->id, options.s_portid);
tinfo->id, options.portid);
}
if (options.slave_mode == 1) {
@ -575,7 +570,7 @@ locore_main(void *tif)
while (options.s_state.load() != STATE_RUNNING) {
}
// store the current timestamp
options.s_ts_begin.store(nm_get_uptime_ns());
options.s_ts_begin.store(topo_uptime_ns());
pkt_loop(tinfo);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main <thread %d>: exited\n",
@ -584,99 +579,6 @@ locore_main(void *tif)
return 0;
}
static int
port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
{
struct rte_eth_dev_info dev_info {
};
struct rte_eth_conf port_conf = port_conf_default;
struct rte_eth_txconf txconf {
};
struct rte_eth_rxconf rxconf {
};
uint16_t nb_rxd = RX_RING_SIZE;
uint16_t nb_txd = TX_RING_SIZE;
if (!rte_eth_dev_is_valid_port(portid)) {
return -1;
}
int ret = rte_eth_dev_info_get(portid, &dev_info);
if (ret != 0) {
return ret;
}
port_conf.rxmode.max_rx_pkt_len = mtu_to_pkt_size(options.port_mtu);;
port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_NONFRAG_IPV4_UDP |
ETH_RSS_L2_PAYLOAD | ETH_RSS_NONFRAG_IPV4_TCP;
port_conf.rx_adv_conf.rss_conf.rss_key = nullptr;
port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM;
port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM;
port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH;
port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;
if (options.jumbo_frame_enabled) {
port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME;
}
/* Configure the Ethernet device. */
ret = rte_eth_dev_configure(
portid, options.s_num_threads, options.s_num_threads, &port_conf);
if (ret != 0)
return ret;
ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
if (ret != 0)
return ret;
/* Allocate and set up 1 RX queue per thread . */
rxconf = dev_info.default_rxconf;
if (options.jumbo_frame_enabled) {
rxconf.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME;
}
rxconf.offloads = port_conf.rxmode.offloads;
for (uint32_t i = 0; i < options.s_num_threads; i++) {
ret = rte_eth_rx_queue_setup(portid,
options.s_thr_info.at(i)->rxqid, nb_rxd,
rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool);
if (ret < 0)
return ret;
}
txconf = dev_info.default_txconf;
txconf.offloads = port_conf.txmode.offloads;
/* Allocate and set up 1 TX queue per Ethernet port. */
for (uint32_t i = 0; i < options.s_num_threads; i++) {
ret = rte_eth_tx_queue_setup(portid,
options.s_thr_info.at(i)->txqid, nb_txd,
rte_eth_dev_socket_id(portid), &txconf);
if (ret < 0)
return ret;
}
// set mtu
ret = rte_eth_dev_set_mtu(portid, options.port_mtu);
if (ret != 0)
return ret;
ret = rte_eth_dev_start(portid);
if (ret < 0)
return ret;
/* Display the port MAC address. */
struct rte_ether_addr addr {
};
ret = rte_eth_macaddr_get(portid, &addr);
// no promiscuous mode required
return ret;
}
static void
dump_options()
{
@ -694,13 +596,13 @@ dump_options()
" depth = %u\n"
" packet loss time threshold = %u\n"
" jumbo frame = %d\n"
" packet pad size = %d\n",
" packet pad size = %d\n"
" portid = %d\n",
ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.run_time,
options.s_num_threads, options.rage_quit_time,
options.slave_mode, options.ia_gen, options.ld_gen,
options.target_qps, options.s_host_spec.ip, options.depth,
options.pkt_loss_delay_ms, options.jumbo_frame_enabled,
options.pkt_pad_sz);
options.s_num_threads, options.rage_quit_time, options.slave_mode,
options.ia_gen, options.ld_gen, options.target_qps,
options.s_host_spec.ip, options.depth, options.pkt_loss_delay_ms,
options.jumbo_frame_enabled, options.pkt_pad_sz, options.portid);
}
static void
@ -722,14 +624,13 @@ usage()
" -D: max number of packets in flight\n"
" -l: packet loss time threshold\n"
" -J: enable jumbo frame\n"
" -P: pad load packets to this size\n");
" -P: pad load packets to this size\n"
" -p: portid\n");
}
int
main(int argc, char *argv[])
{
unsigned int nb_ports;
struct rte_mempool *mbuf_pool;
struct thread_info *tinfo;
bool has_host_spec = false;
@ -749,8 +650,8 @@ main(int argc, char *argv[])
{
int c;
// parse arguments
while ((c = getopt(
argc, argv, "vht:s:SA:i:w:r:q:H:D:l:JP:")) != -1) {
while ((c = getopt(argc, argv,
"vht:s:SA:i:w:r:q:H:D:l:JP:p:")) != -1) {
switch (c) {
case 'v':
ntr_set_level(NTR_DEP_USER1,
@ -763,8 +664,8 @@ main(int argc, char *argv[])
options.run_time = strtol(optarg, nullptr, 10);
break;
case 's':
if (str_to_netspec(
optarg, &options.server_spec) != 0) {
if (str_to_netspec(optarg,
&options.server_spec) != 0) {
rte_exit(EXIT_FAILURE,
"invalid server net spec\n");
}
@ -776,9 +677,11 @@ main(int argc, char *argv[])
break;
case 'A':
cpulist_to_cpuset(optarg, &options.cpu_set);
options.s_num_threads = CPU_COUNT(&options.cpu_set);
options.s_num_threads = CPU_COUNT(
&options.cpu_set);
if (options.s_num_threads == 0) {
rte_exit(EXIT_FAILURE, "invalid cpu mask %s\n", optarg);
rte_exit(EXIT_FAILURE,
"invalid cpu mask %s\n", optarg);
}
break;
case 'i':
@ -790,17 +693,17 @@ main(int argc, char *argv[])
sizeof(options.ld_gen) - 1);
break;
case 'r':
options.rage_quit_time = strtol(
optarg, nullptr, 10);
options.rage_quit_time = strtol(optarg, nullptr,
10);
break;
case 'q':
options.target_qps = strtol(
optarg, nullptr, 10);
options.target_qps = strtol(optarg, nullptr,
10);
break;
case 'H':
has_host_spec = true;
if (str_to_netspec(
optarg, &options.s_host_spec) != 0) {
if (str_to_netspec(optarg,
&options.s_host_spec) != 0) {
rte_exit(EXIT_FAILURE,
"invalid host net spec.\n");
}
@ -812,8 +715,8 @@ main(int argc, char *argv[])
}
break;
case 'l':
options.pkt_loss_delay_ms = strtol(
optarg, nullptr, 10);
options.pkt_loss_delay_ms = strtol(optarg,
nullptr, 10);
if (options.pkt_loss_delay_ms == 0) {
options.pkt_loss_delay_ms = UINT32_MAX;
}
@ -823,75 +726,108 @@ main(int argc, char *argv[])
options.port_mtu = MAX_JUMBO_MTU;
break;
case 'P':
options.pkt_pad_sz = strtol(
optarg, nullptr, 10);
options.pkt_pad_sz = strtol(optarg, nullptr,
10);
break;
case 'p':
options.portid = strtol(optarg, nullptr, 10);
break;
default:
usage();
rte_exit(
EXIT_FAILURE, "unknown argument: %c\n", c);
rte_exit(EXIT_FAILURE, "unknown argument: %c\n",
c);
}
}
}
if (options.pkt_pad_sz != 0 && options.pkt_pad_sz > mtu_to_pkt_size(options.port_mtu)) {
rte_exit(EXIT_FAILURE, "pkt_pad_sz is too large for mtu %d\n", options.port_mtu);
if (options.pkt_pad_sz != 0 &&
options.pkt_pad_sz > mtu_to_pkt_size(options.port_mtu)) {
rte_exit(EXIT_FAILURE, "pkt_pad_sz is too large for mtu %d\n",
options.port_mtu);
}
if (!has_host_spec) {
rte_exit(EXIT_FAILURE, "Must specify host IP.\n");
}
// init nm
if (nm_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
rte_exit(EXIT_FAILURE, "nm init failed!\n");
// init libtopo
if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
0) {
rte_exit(EXIT_FAILURE, "libtopo init failed!\n");
}
if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
0) {
rte_exit(EXIT_FAILURE, "libnms init failed!\n");
}
dump_options();
nb_ports = rte_eth_dev_count_avail();
if (nb_ports == 0) {
rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
// configure memory and port
struct port_conf pconf;
struct device_conf dconf;
struct mem_conf mconf;
portconf_get(options.portid, &pconf);
if (!pconf.timesync) {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
"main: timesync disabled. hw timestamp unavailable.\n ");
}
dconf.mtu = options.port_mtu;
dconf.num_threads = options.s_num_threads;
dconf.portid = options.portid;
dconf.rss_hf = pconf.rss_hf;
dconf.rx_offloads = pconf.rxoffload;
dconf.tx_offloads = pconf.txoffload;
dconf.timesync = pconf.timesync;
uint16_t portid = rte_eth_find_next(0);
if (portid == RTE_MAX_ETHPORTS) {
rte_exit(EXIT_FAILURE, "cannot find an available port\n");
}
options.s_portid = portid;
dconf.rx_fn = nullptr;
dconf.rx_user = nullptr;
dconf.rx_ring_sz = 2048;
dconf.tx_fn = nullptr;
dconf.tx_user = nullptr;
dconf.tx_ring_sz = 2048;
if (rte_eth_macaddr_get(portid, &options.s_host_spec.mac_addr) != 0) {
mconf.cache_size = 512;
mconf.priv_size = 0;
mconf.num_elements = (dconf.rx_ring_sz + dconf.tx_ring_sz) *
rte_lcore_count() / rte_socket_count();
mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_JUMBO_MTU -
MAX_STANDARD_MTU;
mconf.max_pools = -1;
dpdk_init(&dconf, &mconf);
if (rte_eth_macaddr_get(options.portid,
&options.s_host_spec.mac_addr) != 0) {
rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
portid);
options.portid);
}
// create a mbuf memory pool on the socket
mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT,
MBUF_CACHE_SIZE, 0,
options.jumbo_frame_enabled ?
RTE_MBUF_DEFAULT_BUF_SIZE + (MAX_JUMBO_MTU - MAX_STANDARD_MTU) :
RTE_MBUF_DEFAULT_BUF_SIZE,
rte_eth_dev_socket_id(options.s_portid));
if (mbuf_pool == nullptr) {
rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
}
options.mbuf_pool = mbuf_pool;
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n",
options.portid, options.s_host_spec.mac_addr.addr_bytes[0],
options.s_host_spec.mac_addr.addr_bytes[1],
options.s_host_spec.mac_addr.addr_bytes[2],
options.s_host_spec.mac_addr.addr_bytes[3],
options.s_host_spec.mac_addr.addr_bytes[4],
options.s_host_spec.mac_addr.addr_bytes[5]);
unsigned int cpuset_idx = CPU_FFS(&options.cpu_set);
unsigned int tid = 0;
while(cpuset_idx != 0) {
while (cpuset_idx != 0) {
unsigned int lcore_id = cpuset_idx - 1;
tinfo = new thread_info;
tinfo->ia_gen = createGenerator(options.ia_gen);
tinfo->load_gen = createGenerator(options.ld_gen);
if (tinfo->ia_gen == nullptr || tinfo->load_gen == nullptr) {
rte_exit(
EXIT_FAILURE, "invalid ia_gen or ld_gen string\n");
rte_exit(EXIT_FAILURE,
"invalid ia_gen or ld_gen string\n");
}
tinfo->ia_gen->set_lambda((double)options.target_qps /
(double)(options.s_num_threads));
tinfo->id = tid;
tinfo->lcore_id = lcore_id;
tinfo->socket_id = rte_lcore_to_socket_id(lcore_id);
tinfo->rxqid = tid;
tinfo->txqid = tid;
options.s_thr_info.push_back(tinfo);
@ -901,19 +837,6 @@ main(int argc, char *argv[])
cpuset_idx = CPU_FFS(&options.cpu_set);
}
if (port_init(portid, mbuf_pool) != 0) {
rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid,
options.s_host_spec.mac_addr.addr_bytes[0],
options.s_host_spec.mac_addr.addr_bytes[1],
options.s_host_spec.mac_addr.addr_bytes[2],
options.s_host_spec.mac_addr.addr_bytes[3],
options.s_host_spec.mac_addr.addr_bytes[4],
options.s_host_spec.mac_addr.addr_bytes[5]);
sleep(INIT_DELAY);
for (unsigned int i = 0; i < options.s_num_threads; i++) {
@ -958,7 +881,7 @@ main(int argc, char *argv[])
uint32_t qps;
uint32_t total_recv;
uint32_t total_loss;
calc_stats(nm_get_uptime_ns(), &qps, &total_recv, &total_loss);
calc_stats(topo_uptime_ns(), &qps, &total_recv, &total_loss);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "qps = %d, recv = %d, loss = %d\n",
qps, total_recv, total_loss);
@ -969,7 +892,7 @@ main(int argc, char *argv[])
}
// clean up
rte_eth_dev_stop(portid);
dpdk_cleanup(&dconf);
return 0;
}

View File

@ -1,3 +1,4 @@
from lib2to3.refactor import get_fixers_from_package
import subprocess as sp
import time
import select
@ -95,12 +96,14 @@ def check_stderr(p, sel, exclude = []):# -> tuple[bool, list[str]]:
good = True
for e in err:
e = e.strip()
if len(e) == 0:
continue
for exc in exclude:
if not (exc in err):
good = False
for exc in exclude:
if exc in e:
good = True
break
return good, err
@ -124,9 +127,10 @@ def thr_check_stderr(p : sp.Popen, name: str, exclude):
if not status:
errthr_failed = True
local_failed = True
log_print("Error detected in \"" + name + "\":\n")
log_print("Error detected in \"" + name + "\":")
for e in err:
log_print(" " + e + "\n")
log_print(" \"" + e + "\"")
log_print("")
time.sleep(random.uniform(0.001, 0.1))
def errthr_start():

View File

@ -18,15 +18,15 @@ loadgen_load = "fixed:0"
# pkt_pad
jumbo_frame_threshold = 1518
pkt_pads = [
#"9018",
#1518",
"0",
"256",
"512",
"1024",
#"1518",
"1024"
#"2048",
#"4096",
#"8192",
#"9018"
#"8192"
]
pkt_pads_depth = {}
@ -74,25 +74,32 @@ memgen_target = [
# paths
test_dir = "/numam.d/build"
test_dir = "/numam.d/build/bin"
file_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.join(file_dir,"..")
sample_filename = "sample.txt"
affinity = [
#"1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47",
"49,51,53,55,57,59,61,63,65,67,69,71,73,75,77,79,81,83,85,87,89,91,93,95"
"1,3,5,7,9,11,13,15,17,19,21,23",
"65,67,69,71,73,75,77,79,81,83,85,87,89,91,93,95,97,99,101,103,105,107,109,111,113,115,117,119,121,123,125,127",
"1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63",
"1,3,5,7,9,11,13,15",
"17,19,21,23,25,27,29,31",
"33,35,37,39,41,43,45,47",
"49,51,53,55,57,59,61,63"
]
master = ["skylake2.rcs.uwaterloo.ca"]
master_spec = ["192.168.123.10@3c:15:fb:c9:f3:36"]
server = ["skylake5.rcs.uwaterloo.ca"]
server_spec = ["192.168.123.13@3c:15:fb:c9:f3:28"]
master_cpumask = "2" # 1 thread
server = ["icelake2-int.rcs.uwaterloo.ca"]
server_spec = ["192.168.123.9@3c:ec:ef:61:39:f3"]
master = ["icelake2-int.rcs.uwaterloo.ca"]
master_spec = ["192.168.123.9@40:a6:b7:7c:86:10"]
#server = ["milan1-int.rcs.uwaterloo.ca"]
#server_spec = ["192.168.123.9@00:07:43:54:37:08"]
clients = ["skylake3.rcs.uwaterloo.ca", "skylake5.rcs.uwaterloo.ca", "skylake6.rcs.uwaterloo.ca"]# "skylake7.rcs.uwaterloo.ca", "skylake8.rcs.uwaterloo.ca"]
client_spec = ["192.168.123.12@3c:15:fb:c9:f3:4b", "192.168.123.13@3c:15:fb:c9:f3:28", "192.168.123.14@3c:15:fb:62:9b:2f"] #, "192.168.123.13@3c:15:fb:62:9c:be"]
clients = ["skylake2.rcs.uwaterloo.ca"]#, "skylake5.rcs.uwaterloo.ca", "skylake6.rcs.uwaterloo.ca"]# "skylake7.rcs.uwaterloo.ca", "skylake8.rcs.uwaterloo.ca"]
client_spec = ["192.168.123.12@3c:15:fb:c9:f3:36"]#, "192.168.123.13@3c:15:fb:c9:f3:28", "192.168.123.14@3c:15:fb:62:9b:2f"] #, "192.168.123.13@3c:15:fb:62:9c:be"]
client_cpumask = "1,3,5,7,9,11,13,15,17,19,21,23"
client_rage_quit = 1000 #1s
@ -191,7 +198,7 @@ def run_exp(affinity : str, ld : int, pkt_pad : str, aff_idx : int):
p = sp[0]
# launch stderr monitoring thread
exclude = ["Pseudo-terminal"]
exclude = ["Pseudo-terminal", "ice_", "i40e_"]
tc.errthr_create([p], master, exclude)
if not client_only:
tc.errthr_create(ssrv, server, exclude)

View File

@ -1,7 +1,8 @@
#!/bin/sh
dpdk_dir="/dpdk"
libtopo_dir="/libtopo"
root="$(dirname "$0")/.."
servers="skylake3.rcs.uwaterloo.ca" # skylake5.rcs.uwaterloo.ca skylake6.rcs.uwaterloo.ca skylake7.rcs.uwaterloo.ca skylake8.rcs.uwaterloo.ca"
servers="skylake2.rcs.uwaterloo.ca skylake3.rcs.uwaterloo.ca skylake5.rcs.uwaterloo.ca skylake6.rcs.uwaterloo.ca icelake2-int.rcs.uwaterloo.ca milan1-int.rcs.uwaterloo.ca"
rsync_flags="-az"
ssh_args="-o StrictHostKeyChecking=no -p77"
@ -18,8 +19,8 @@ echo "USER: $user"
compile() {
# separate these functions because we might change kernel (reboot) without needing to recompile
echo "====================$1===================="
#ssh $(echo $ssh_args $user@$1) "sudo reboot"
ssh $(echo $ssh_args $user@$1) "sudo sh -c \"rm -rf $dpdk_dir; mkdir -p $dpdk_dir; cd $dpdk_dir; git clone https://git.quacker.org/d/numam-dpdk; cd numam-dpdk; git checkout releases-13.0; meson -Denable_kmods=true build; cd build; ninja install\""
ssh $(echo $ssh_args $user@$1) "sudo sh -c \"sudo rm -rf $libtopo_dir; sudo rm -rf /usr/local/include/libtopo; sudo rm -rf /usr/local/lib/libtopo;sudo mkdir -p $libtopo_dir; sudo chmod 777 $libtopo_dir; cd $libtopo_dir; git clone https://git.quacker.org/d/libtopo; cd libtopo; mkdir build; cd build; cmake ../; sudo make install\""
ssh $(echo $ssh_args $user@$1) "sudo sh -c \"sudo pkg install -y meson pkgconf py38-pyelftools; sudo rm -rf $dpdk_dir; sudo mkdir -p $dpdk_dir; sudo chmod 777 $dpdk_dir; cd $dpdk_dir; git clone https://git.quacker.org/d/numam-dpdk; cd numam-dpdk; git checkout migration; CC=gcc CXX=g++ meson -Denable_kmods=true build; cd build; ninja install\""
wait
echo "$1 Done."
echo ""

View File

@ -1,7 +1,7 @@
#!/bin/sh
dpdk_dir="/numam.d"
root="$(dirname "$0")/.."
servers="icelake2-int.rcs.uwaterloo.ca" # skylake7.rcs.uwaterloo.ca skylake8.rcs.uwaterloo.ca icelake2-int.rcs.uwaterloo.ca"
servers="skylake2.rcs.uwaterloo.ca skylake3.rcs.uwaterloo.ca skylake5.rcs.uwaterloo.ca skylake6.rcs.uwaterloo.ca icelake2-int.rcs.uwaterloo.ca milan1-int.rcs.uwaterloo.ca"
rsync_flags="-rv -e \"ssh -p77\""
ssh_args="-o StrictHostKeyChecking=no -p77"

View File

@ -1,84 +0,0 @@
#include <unistd.h>
#include "defs.hh"
#include "nm.hh"
#include <ctime>
void test(const char * case_name, struct timespec * ts)
{
uint64_t slow;
uint64_t fast;
slow = get_uptime();
if (nanosleep(ts, nullptr) != 0) {
perror("nanosleep() interrupted!");
exit(-1);
}
slow = get_uptime() - slow;
fast = nm_get_uptime_ns();
if (nanosleep(ts, nullptr) != 0) {
perror("nanosleep() interrupted!");
exit(-1);
}
fast = nm_get_uptime_ns() - fast;
printf("%s: get_uptime(): %lu, nm_get_uptime_ns(): %lu\n", case_name, slow, fast);
}
int main()
{
struct timespec ts;
nm_init(0);
// 1s
ts.tv_nsec = 0;
ts.tv_sec = 1;
test("1s", &ts);
// 100ms
ts.tv_nsec = 100000000;
ts.tv_sec = 0;
test("100ms", &ts);
// 10ms
ts.tv_nsec = 10000000;
ts.tv_sec = 0;
test("10ms", &ts);
// 1ms
ts.tv_nsec = 1000000;
ts.tv_sec = 0;
test("1ms", &ts);
// 100us
ts.tv_nsec = 100000;
ts.tv_sec = 0;
test("100us", &ts);
// 10us
ts.tv_nsec = 10000;
ts.tv_sec = 0;
test("10us", &ts);
// 1us
ts.tv_nsec = 1000;
ts.tv_sec = 0;
test("1us", &ts);
// 100ns
ts.tv_nsec = 100;
ts.tv_sec = 0;
test("100ns", &ts);
// 10ns
ts.tv_nsec = 10;
ts.tv_sec = 0;
test("10ns", &ts);
// 1ns
ts.tv_nsec = 1;
ts.tv_sec = 0;
test("1ns", &ts);
return 0;
}

32
tests/nms_test.c Normal file
View File

@ -0,0 +1,32 @@
#include "nms.h"
#include <assert.h>
#include <stdio.h>
int main(void)
{
void * ret;
nms_init(1);
// duplicate init
nms_init(1);
// 1G
ret = nms_malloc(0, 1024 * 1024 * 1024);
assert(ret != NULL);
printf("1G: %p\n", ret);
// two 511Ms
ret = nms_malloc(0, 511 * 1024 * 1024);
assert(ret != NULL);
printf("511M: %p\n", ret);
ret = nms_malloc(0, 511 * 1024 * 1024);
assert(ret != NULL);
printf("511M: %p\n", ret);
// another 1G
ret = nms_malloc(0, 1024 * 1024 * 1024);
assert(ret != NULL);
printf("1G: %p\n", ret);
return 0;
}

View File

@ -1,8 +1,9 @@
#include "nm.hh"
#include "gen.hh"
#include <cstdlib>
#include "ntr.h"
#include <getopt.h>
#include <unistd.h>
#include <topo.h>
static void
usage()
@ -12,7 +13,9 @@ usage()
" -v: verbose mode\n"
" -b: MLG bytes per second\n"
" -x: MLG thread affinity mask\n"
" -X: MLG target domain affinity mask\n");
" -X: MLG target domain affinity mask\n"
" -S: shared buffer\n"
" -i: iterations\n");
fflush(stdout);
}
@ -21,14 +24,15 @@ int main(int argc, char * argv[])
ntr_init();
ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
unsigned long long mlg_arrsz = 0;
uint32_t mlg_iter = -1;
unsigned long long mlg_dmask = 0;
unsigned long long mlg_cmask = 0;
size_t arr_sz = 0;
uint32_t iter = -1;
cpuset_t threads;
int shared_buffer = 0;
cpuset_t domain_mask;
{
int c;
// parse arguments
while ((c = getopt(argc, argv, "hb:X:x:vi:")) != -1) {
while ((c = getopt(argc, argv, "hb:X:x:vi:S")) != -1) {
switch (c) {
case 'v':
ntr_set_level(NTR_DEP_USER1,
@ -38,18 +42,19 @@ int main(int argc, char * argv[])
usage();
exit(0);
case 'b':
mlg_arrsz = strtoull(optarg, nullptr, 10);
arr_sz = strtoull(optarg, nullptr, 10);
break;
case 'i':
mlg_iter = strtoul(optarg, nullptr, 10);
iter = strtoul(optarg, nullptr, 10);
break;
case 'X':
mlg_dmask = strtoull(
optarg, nullptr, 16);
cpulist_to_cpuset(optarg, &domain_mask);
break;
case 'x':
mlg_cmask = strtoull(
optarg, nullptr, 16);
cpulist_to_cpuset(optarg, &threads);
break;
case 'S':
shared_buffer = 1;
break;
default:
usage();
@ -58,18 +63,22 @@ int main(int argc, char * argv[])
}
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "MLG: [size: %llu, iter: %u, threads: 0x%llx, domain: 0x%llx]\n", mlg_arrsz, mlg_iter, mlg_cmask, mlg_dmask);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "MLG: [size: %ld iter: %u, # threads: 0x%d, domain: 0x%ld]\n", arr_sz, iter, CPU_COUNT(&threads), CPU_FFS(&domain_mask) - 1);
// init nm
if (nm_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
fprintf(stderr, "nm init failed!\n");
// init topo
if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
fprintf(stderr, "libtopo init failed!\n");
exit(1);
}
bool success = false;
memload_generator * mgen = new memload_generator(mlg_cmask, mlg_dmask, mlg_arrsz, mlg_iter, &success);
memload_generator::memload_generator_options opts;
opts.chunk_size = arr_sz;
opts.iteration = iter;
opts.shared_buffer = shared_buffer;
opts.verbose = 1;
mgen->start();
auto mgen = new memload_generator(&threads, &domain_mask, &opts, &success);
while(!mgen->check_done()) {
usleep(10000);
}
@ -78,6 +87,8 @@ int main(int argc, char * argv[])
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"main: MLG bps = %ld ~= %ldM\n", bps, bps / 1024 / 1024);
delete mgen;
return 0;
}