From 565dbca27804bdf90fc23c9fe8fe4c8be4ece1ec Mon Sep 17 00:00:00 2001 From: quackerd Date: Wed, 22 Jun 2022 23:40:48 +0800 Subject: [PATCH] latest dpdk & refactoring --- .clang-format | 4 + CMakeLists.txt | 67 ++-- inc/defs.hh | 3 +- inc/gen.hh | 49 ++- inc/net/{util.hh => netsup.hh} | 67 +++- inc/net/pkt.hh | 66 ++-- inc/nm.hh | 103 ----- inc/nms.h | 20 + libgen/loadgen.cc | 136 +++++++ libnm/alloc.cc | 113 ------ libnm/loadgen.cc | 217 ---------- libnm/nm.cc | 72 ---- libnm/nmp.hh | 9 - libnm/topo.cc | 204 ---------- libnms/alloc.c | 196 ++++++++++ net/cat.cc | 397 ++++++++----------- net/khat.cc | 696 +++++++++++++++------------------ net/libnetsup/dpdk.cc | 207 ++++++++++ net/libnetsup/portconf.cc | 59 +++ net/rat.cc | 393 ++++++++----------- scripts/libs/libtc.py | 12 +- scripts/run.py | 35 +- scripts/setup_dpdk.sh | 7 +- scripts/setup_program.sh | 2 +- test/ts.cc | 84 ---- tests/nms_test.c | 32 ++ util/memloadgen.cc | 49 ++- 27 files changed, 1534 insertions(+), 1765 deletions(-) rename inc/net/{util.hh => netsup.hh} (74%) delete mode 100644 inc/nm.hh create mode 100644 inc/nms.h create mode 100644 libgen/loadgen.cc delete mode 100644 libnm/alloc.cc delete mode 100644 libnm/loadgen.cc delete mode 100644 libnm/nm.cc delete mode 100644 libnm/nmp.hh delete mode 100644 libnm/topo.cc create mode 100644 libnms/alloc.c create mode 100644 net/libnetsup/dpdk.cc create mode 100644 net/libnetsup/portconf.cc delete mode 100644 test/ts.cc create mode 100644 tests/nms_test.c diff --git a/.clang-format b/.clang-format index 3823fa5..1c1367a 100644 --- a/.clang-format +++ b/.clang-format @@ -52,7 +52,11 @@ ForEachMacros: - ARB_FOREACH_REVERSE - ARB_FOREACH_REVERSE_FROM - ARB_FOREACH_REVERSE_SAFE + - BIT_FOREACH_ISCLR + - BIT_FOREACH_ISSET - CPU_FOREACH + - CPU_FOREACH_ISCLR + - CPU_FOREACH_ISSET - FOREACH_THREAD_IN_PROC - FOREACH_PROC_IN_SYSTEM - FOREACH_PRISON_CHILD diff --git a/CMakeLists.txt b/CMakeLists.txt index 7fa69e9..8e2b94a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,68 +4,79 @@ project(khat) list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}") find_package(PkgConfig REQUIRED) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY lib) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY bin) -pkg_check_modules(HWLOC hwloc REQUIRED) pkg_check_modules(DPDK libdpdk) pkg_check_modules(SPDK spdk_event_bdev spdk_env_dpdk) pkg_check_modules(SPDK_SYS spdk_syslibs) pkg_check_modules(UUID uuid) -# get_filename_component(ISAL_LIB_PATH ${SPDK_INCLUDE_DIRS} DIRECTORY) -# get_filename_component(ISAL_LIB_PATH ${ISAL_LIB_PATH} DIRECTORY) -# set(ISAL_LIB_PATH ${ISAL_LIB_PATH}/isa-l/.libs) +pkg_check_modules(TOPO bsdtopo) set(CC_FLAGS -O2 -g -Wall -Wextra -Werror -std=c++11 -Wno-deprecated-declarations -Wno-address-of-packed-member -Wno-zero-length-array -Wno-gnu-zero-variadic-macro-arguments - -msse4 - -mavx) + -march=native) + +set(C_FLAGS -O2 -g -Wall -Wextra -Werror -std=c17 + -Wno-deprecated-declarations + -Wno-address-of-packed-member + -Wno-zero-length-array + -Wno-gnu-zero-variadic-macro-arguments + -march=native) + include_directories(${CMAKE_SOURCE_DIR}/inc) +include_directories() -set(LIBNM_CC_FLAGS -O2 -g -Wall -Wextra -Werror -std=c++11 -mavx -msse4) set(LIBNTR_C_FLAGS -O2 -g -Wall -Wextra -Werror -std=c11) set(LIBGEN_CC_FLAGS -O2 -g -Wall -Wextra -Werror -std=c++11) -add_library(ntr STATIC libntr/ntr.c) +add_library(ntr SHARED libntr/ntr.c) target_compile_options(ntr PRIVATE ${LIBNTR_C_FLAGS}) -add_library(gen STATIC libgen/generator.cc) -target_compile_options(gen PRIVATE ${LIBGEN_CC_FLAGS}) +add_library(gen SHARED libgen/generator.cc libgen/loadgen.cc) +target_link_libraries(gen PRIVATE pthread ntr ${TOPO_LINK_LIBRARIES} nms) +target_compile_options(gen PRIVATE ${LIBGEN_CC_FLAGS} ${TOPO_CFLAGS}) -add_library(nm STATIC libnm/nm.cc libnm/alloc.cc libnm/loadgen.cc libnm/topo.cc) -target_include_directories(nm PRIVATE ${HWLOC_INCLUDE_DIRS}) -target_link_libraries(nm PRIVATE gen ${HWLOC_LINK_LIBRARIES}) -target_compile_options(nm PRIVATE ${LIBNM_CC_FLAGS} ${HWLOC_CFLAGS}) +add_library(netsup SHARED net/libnetsup/dpdk.cc net/libnetsup/portconf.cc) +target_link_libraries(netsup PRIVATE ntr ${DPDK_LINK_LIBRARIES}) +target_compile_options(netsup PRIVATE ${LIBGEN_CC_FLAGS} ${DPDK_CFLAGS}) + +add_library(nms SHARED libnms/alloc.c) +target_link_libraries(nms PRIVATE ${TOPO_LINK_LIBRARIES}) +target_compile_options(nms PRIVATE ${TOPO_CFLAGS}) add_executable(khat EXCLUDE_FROM_ALL net/khat.cc) -target_link_libraries(khat PRIVATE pthread nm ntr ${DPDK_LINK_LIBRARIES}) -target_compile_options(khat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS}) +target_link_libraries(khat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES}) +target_compile_options(khat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS}) add_executable(cat EXCLUDE_FROM_ALL net/cat.cc) -target_link_libraries(cat PRIVATE pthread nm ntr gen ${DPDK_LINK_LIBRARIES}) -target_compile_options(cat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS}) +target_link_libraries(cat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES}) +target_compile_options(cat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS}) add_executable(rat EXCLUDE_FROM_ALL net/rat.cc) -target_link_libraries(rat PRIVATE pthread nm ntr gen ${DPDK_LINK_LIBRARIES}) -target_compile_options(rat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS}) +target_link_libraries(rat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES}) +target_compile_options(rat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS}) add_executable(birb EXCLUDE_FROM_ALL storage/birb.cc storage/io_gen.cc storage/drivers/bdev.cc storage/drivers/bdev_thread.cc storage/drivers/nvme.cc storage/drivers/nvme_thread.cc) target_include_directories(birb PRIVATE ${SPDK_INCLUDE_DIRS} ${DPDK_INCLUDE_DIRS} ${UUID_INCLUDE_DIRS}) target_compile_options(birb PRIVATE ${CC_FLAGS} ${SPDK_CFLAGS} ${UUID_CFLAGS}) target_link_directories(birb PRIVATE ${SPDK_LIBRARY_DIRS} ${SPDK_SYS_STATIC_LIBRARY_DIRS} ${UUID_LIBRARY_DIRS}) -target_link_libraries(birb PRIVATE pthread nm ntr gen -Wl,--whole-archive ${SPDK_LIBRARIES} -Wl,--no-whole-archive ${SPDK_SYS_STATIC_LIBRARIES}) +target_link_libraries(birb PRIVATE pthread ntr gen -Wl,--whole-archive ${SPDK_LIBRARIES} -Wl,--no-whole-archive ${SPDK_SYS_STATIC_LIBRARIES}) add_executable(birb_posix EXCLUDE_FROM_ALL storage/birb_posix.cc storage/io_gen.cc) target_compile_options(birb_posix PRIVATE ${CC_FLAGS}) -target_link_libraries(birb_posix PRIVATE pthread nm ntr gen) +target_link_libraries(birb_posix PRIVATE pthread ntr gen) add_executable(memloadgen util/memloadgen.cc) -target_link_libraries(memloadgen PRIVATE pthread nm ntr) -target_compile_options(memloadgen PRIVATE ${CC_FLAGS}) +target_link_libraries(memloadgen PRIVATE pthread gen ntr ${TOPO_LINK_LIBRARIES}) +target_compile_options(memloadgen PRIVATE ${CC_FLAGS} ${TOPO_CFLAGS}) -add_executable(test_ts test/ts.cc) -set_target_properties(test_ts PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test) -target_link_libraries(test_ts PRIVATE nm) -target_compile_options(test_ts PRIVATE ${CC_FLAGS}) +add_executable(nms_test tests/nms_test.c) +set_target_properties(nms_test PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tests) +target_link_libraries(nms_test PRIVATE nms) +target_compile_options(nms_test PRIVATE ${C_FLAGS}) \ No newline at end of file diff --git a/inc/defs.hh b/inc/defs.hh index 8a07b13..78ae8ee 100644 --- a/inc/defs.hh +++ b/inc/defs.hh @@ -1,13 +1,12 @@ #pragma once -#include #include #include #include #include #include +#include #include -#include #define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \ TypeName(const TypeName &) = delete; \ diff --git a/inc/gen.hh b/inc/gen.hh index 5fb985a..a4c8a3e 100644 --- a/inc/gen.hh +++ b/inc/gen.hh @@ -7,10 +7,6 @@ #pragma once -#include - -#include - #include #include #include @@ -23,6 +19,10 @@ #include #include +#include + +#include "defs.hh" + #define D(fmt, ...) #define DIE(fmt, ...) (void)0; @@ -292,3 +292,44 @@ Generator *createGenerator(std::string str); Generator *createFacebookKey(); Generator *createFacebookValue(); Generator *createFacebookIA(); + +// memload generator +class memload_generator { + public: + struct memload_generator_options { + constexpr static size_t ITERATION_MAX = -1; + size_t chunk_size {512 * 1024 * 1024}; + size_t iteration {ITERATION_MAX}; + int verbose {0}; + bool shared_buffer {true}; + }; + + private: + DISALLOW_EVIL_CONSTRUCTORS(memload_generator); + struct thread_info { + pthread_t pthr; + void *from_buffer; + void *to_buffer; + std::atomic *stop; + + int tid; + + struct memload_generator_options * opts; + + // stat keeping + std::atomic num_trans; + uint64_t begin_ts; + uint64_t end_ts; + }; + + std::vector thr_infos; + std::atomic end; + struct memload_generator_options opts; + static void *worker_thrd(void *_tinfo); + + public: + memload_generator(cpuset_t * threads, cpuset_t * target_domain, struct memload_generator_options * opt, bool *success); + uint64_t get_bps(); + bool check_done(); + ~memload_generator(); +}; diff --git a/inc/net/util.hh b/inc/net/netsup.hh similarity index 74% rename from inc/net/util.hh rename to inc/net/netsup.hh index 4937573..f80de9a 100644 --- a/inc/net/util.hh +++ b/inc/net/netsup.hh @@ -1,22 +1,63 @@ #pragma once -#include -#include +#include + +#include "rte_ethdev.h" +#include "rte_ether.h" + +#define MAX_NUMA_NODES (64) + +struct device_conf { + int portid; + uint16_t tx_ring_sz; + uint16_t rx_ring_sz; + int num_threads; + int mtu; + uint64_t rx_offloads; + uint64_t tx_offloads; + uint64_t rss_hf; + + rte_tx_callback_fn tx_fn; + void * tx_user; + + rte_rx_callback_fn rx_fn; + void * rx_user; + + bool timesync; +}; + +struct mem_conf { + int num_elements; + int cache_size; + int data_room_size; + int priv_size; + unsigned int max_pools; +}; constexpr static uint16_t MIN_RANDOM_PORT = 1000; constexpr static uint16_t DEFAULT_RAT_PORT = 1234; -constexpr static unsigned int INIT_DELAY = 1; +constexpr static unsigned int INIT_DELAY = 3; constexpr static unsigned int MAX_NODES = 64; -static inline void -tx_burst_all(int portid, int txqid, struct rte_mbuf ** tx_bufs, int sz) -{ - int remaining = sz; - while(remaining > 0) { - remaining -= rte_eth_tx_burst( - portid, txqid, &tx_bufs[sz - remaining], - remaining); - } -} +void +dpdk_init(struct device_conf *dconf, struct mem_conf *mconf); + +void +dpdk_cleanup(struct device_conf *dconf); + +struct rte_mempool * +mempool_get(int nodeid); + +struct port_conf { + const char * driver_name; + uint64_t rxoffload; + uint64_t txoffload; + uint64_t rss_hf; + bool timesync; +}; + +int +portconf_get(int portid, struct port_conf * out); + // constexpr static int LATENCY_MEASURE_TIMES = 10000; diff --git a/inc/net/pkt.hh b/inc/net/pkt.hh index bf9da5e..babcaba 100644 --- a/inc/net/pkt.hh +++ b/inc/net/pkt.hh @@ -10,8 +10,7 @@ #include #include -#include "nm.hh" -#include "util.hh" +#include "defs.hh" #include @@ -24,11 +23,23 @@ constexpr static uint32_t MAX_JUMBO_MTU = 9000; constexpr static uint32_t MAX_STANDARD_MTU = 1500; -static inline int mtu_to_pkt_size(int mtu) +static inline int +mtu_to_pkt_size(int mtu) { return mtu + RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN; } +static inline void +tx_burst_all(int portid, int txqid, struct rte_mbuf ** tx_bufs, int sz) +{ + int remaining = sz; + while(remaining > 0) { + remaining -= rte_eth_tx_burst( + portid, txqid, &tx_bufs[sz - remaining], + remaining); + } +} + constexpr static uint32_t ETHER_FRAME_MAGIC = 0xDCDCE5E5; const static struct rte_ether_addr POU_MAC { 0x01, 0x00, 0x5e, 0x00, 0x01, 0x81 @@ -87,7 +98,7 @@ pkt_hdr_to_netspec(struct pkt_hdr *pkt, struct net_spec *src, uint16_t *src_port, struct net_spec *dst, uint16_t *dst_port) { if (src != nullptr) { - rte_ether_addr_copy(&pkt->eth_hdr.s_addr, &src->mac_addr); + rte_ether_addr_copy(&pkt->eth_hdr.src_addr, &src->mac_addr); src->ip = rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr); } @@ -96,7 +107,7 @@ pkt_hdr_to_netspec(struct pkt_hdr *pkt, struct net_spec *src, } if (dst != nullptr) { - rte_ether_addr_copy(&pkt->eth_hdr.d_addr, &dst->mac_addr); + rte_ether_addr_copy(&pkt->eth_hdr.dst_addr, &dst->mac_addr); dst->ip = rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr); } @@ -203,7 +214,7 @@ class rdport_generator { , cur(0) , dist(0, MAX_PORT - min_port) { - gen.seed(nm_get_uptime_ns()); + gen.seed(get_uptime()); cur = dist(gen); } uint16_t next() @@ -224,23 +235,23 @@ class rdport_generator { (rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 8) & 0xff, \ (rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 0) & 0xff, \ rte_be_to_cpu_16(pkt->udp_hdr.src_port), \ - pkt->eth_hdr.s_addr.addr_bytes[0], \ - pkt->eth_hdr.s_addr.addr_bytes[1], \ - pkt->eth_hdr.s_addr.addr_bytes[2], \ - pkt->eth_hdr.s_addr.addr_bytes[3], \ - pkt->eth_hdr.s_addr.addr_bytes[4], \ - pkt->eth_hdr.s_addr.addr_bytes[5], \ + pkt->eth_hdr.src_addr.addr_bytes[0], \ + pkt->eth_hdr.src_addr.addr_bytes[1], \ + pkt->eth_hdr.src_addr.addr_bytes[2], \ + pkt->eth_hdr.src_addr.addr_bytes[3], \ + pkt->eth_hdr.src_addr.addr_bytes[4], \ + pkt->eth_hdr.src_addr.addr_bytes[5], \ (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 24) & 0xff, \ (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 16) & 0xff, \ (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 8) & 0xff, \ (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 0) & 0xff, \ rte_be_to_cpu_16(pkt->udp_hdr.dst_port), \ - pkt->eth_hdr.d_addr.addr_bytes[0], \ - pkt->eth_hdr.d_addr.addr_bytes[1], \ - pkt->eth_hdr.d_addr.addr_bytes[2], \ - pkt->eth_hdr.d_addr.addr_bytes[3], \ - pkt->eth_hdr.d_addr.addr_bytes[4], \ - pkt->eth_hdr.d_addr.addr_bytes[5], rte_be_to_cpu_16(pkt->type)) + pkt->eth_hdr.dst_addr.addr_bytes[0], \ + pkt->eth_hdr.dst_addr.addr_bytes[1], \ + pkt->eth_hdr.dst_addr.addr_bytes[2], \ + pkt->eth_hdr.dst_addr.addr_bytes[3], \ + pkt->eth_hdr.dst_addr.addr_bytes[4], \ + pkt->eth_hdr.dst_addr.addr_bytes[5], rte_be_to_cpu_16(pkt->type)) static inline void print_mac(struct rte_ether_addr *mac) @@ -253,7 +264,7 @@ print_mac(struct rte_ether_addr *mac) static inline void print_ipv4(uint32_t ip) { - printf("%d-%d-%d-%d", (ip >> 24) & 0xff, (ip >> 16) & 0xff, + printf("%d.%d.%d.%d", (ip >> 24) & 0xff, (ip >> 16) & 0xff, (ip >> 8) & 0xff, (ip >> 0) & 0xff); } @@ -276,10 +287,10 @@ dump_pkt(struct rte_mbuf *pkt) "Packet %p: Length 0x%x\n", (void *)pkt, rte_pktmbuf_data_len(pkt)); printf(" Ethernet header:\n"); printf(" Src:"); - print_mac(ð_hdr->s_addr); + print_mac(ð_hdr->src_addr); printf("\n"); printf(" Dst:"); - print_mac(ð_hdr->d_addr); + print_mac(ð_hdr->dst_addr); printf("\n"); printf(" Type: 0x%x\n", rte_be_to_cpu_16(eth_hdr->ether_type)); @@ -339,11 +350,11 @@ construct_pkt_hdr( // construct l2 header eth_hdr = &pkt_data->eth_hdr; - rte_ether_addr_copy(&conn->src->mac_addr, ð_hdr->s_addr); + rte_ether_addr_copy(&conn->src->mac_addr, ð_hdr->src_addr); if (is_ts_pkt) { - rte_ether_addr_copy(&POU_MAC, ð_hdr->d_addr); + rte_ether_addr_copy(&POU_MAC, ð_hdr->dst_addr); } else { - rte_ether_addr_copy(&conn->dst->mac_addr, ð_hdr->d_addr); + rte_ether_addr_copy(&conn->dst->mac_addr, ð_hdr->dst_addr); } eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4); buf->l2_len = sizeof(struct rte_ether_hdr); @@ -378,10 +389,13 @@ construct_pkt_hdr( udp_hdr->dgram_cksum = 0; /* No UDP checksum. */ udp_hdr->dgram_len = total_sz - sizeof(struct rte_ether_hdr) - sizeof(struct rte_ipv4_hdr) - sizeof(struct rte_udp_hdr); buf->l4_len = sizeof(struct rte_udp_hdr); + buf->ol_flags |= RTE_MBUF_F_TX_IPV4; + buf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM; + buf->ol_flags |= RTE_MBUF_F_TX_UDP_CKSUM; if (is_ts_pkt) { // set misc flags - buf->ol_flags |= PKT_TX_IEEE1588_TMST; + buf->ol_flags |= RTE_MBUF_F_TX_IEEE1588_TMST; pkt_data->ptp_hdr.ptp_ver = 0x2; // VER 2 pkt_data->ptp_hdr.ptp_msg_type = 0x0; // SYNC } else { @@ -463,7 +477,7 @@ check_valid_packet(struct rte_mbuf *pkt, const struct rte_ether_addr *host_mac) } if (!rte_is_same_ether_addr( - expected_mac, &pkt_data->eth_hdr.d_addr)) + expected_mac, &pkt_data->eth_hdr.dst_addr)) return nullptr; } diff --git a/inc/nm.hh b/inc/nm.hh deleted file mode 100644 index 388b92e..0000000 --- a/inc/nm.hh +++ /dev/null @@ -1,103 +0,0 @@ -#pragma once - -#include -#include "gen.hh" -#include "defs.hh" - -#include -#include -#include -#include - -constexpr static unsigned int NM_LEVEL_NUMA = 0; -constexpr static unsigned int NM_LEVEL_CPU = 1; -constexpr static unsigned int NM_LEVEL_CORE = 2; -constexpr static unsigned int NM_MAX_LEVEL = NM_LEVEL_CORE + 1; - -constexpr static int NM_MAX_OBJS_PER_LVL = 256; - -// misc functions - -// 0 on success -1 on error -int nm_init(int verbosity); - -uint64_t nm_tsc2ns(uint64_t tsc); - -uint64_t nm_get_uptime_ns(); - -// topology stuff -struct nm_obj; - -struct nm_obj * -nm_find_parent_obj(struct nm_obj * start, int parent_level); - -int -nm_obj_count(int level); - -struct nm_obj * -nm_obj_from_id(int level, int id); - -struct nm_obj * -nm_obj_find_parent(struct nm_obj * start, int parent_level); - -int -nm_obj_get_id(struct nm_obj * obj); - -int -nm_obj_get_level(struct nm_obj * obj); - -static inline int -nm_get_node_from_core(int coreid) -{ - return nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, coreid), NM_LEVEL_NUMA)); -} - -// memload generator -class memload_generator { - private: - DISALLOW_EVIL_CONSTRUCTORS(memload_generator); - struct thread_info { - pthread_t pthr; - uint32_t iter; - uint32_t array_size; - std::atomic init_status; - std::atomic *state; - std::atomic num_trans; - constexpr static int INIT_START = 0; - constexpr static int INIT_SUCCESS = 1; - constexpr static int INIT_FAILED = 2; - void *from_region; - void *to_region; - uint32_t to_domainid; - uint32_t from_domainid; - uint64_t begin_ts; - uint64_t stop_ts; - }; - std::vector thr_infos; - - std::atomic state; - constexpr static uint32_t STATE_READY = 0; - constexpr static uint32_t STATE_START = 1; - constexpr static uint32_t STATE_STOP = 2; - - uint32_t array_size; - uint32_t iteration; - - static void *worker_thrd(void *_tinfo); - - public: - memload_generator(uint64_t from_cmask, uint64_t to_cmask, uint32_t array_sz, uint32_t iteration, - bool *success); - void start(); - void stop(); - uint64_t get_bps(); - bool check_done(); - ~memload_generator(); -}; - -// allocators -void * -nm_malloc(unsigned int node, size_t size); - -void -nm_free(unsigned int node, void * addr); diff --git a/inc/nms.h b/inc/nms.h new file mode 100644 index 0000000..45166d2 --- /dev/null +++ b/inc/nms.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +int +nms_init(int verbose); + +void * +nms_malloc(int nodeid, size_t sz); + +void +nms_free(int nodeid, void * addr); + +#ifdef __cplusplus +} +#endif // __cplusplus diff --git a/libgen/loadgen.cc b/libgen/loadgen.cc new file mode 100644 index 0000000..bc79d90 --- /dev/null +++ b/libgen/loadgen.cc @@ -0,0 +1,136 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +#include "gen.hh" +#include "nms.h" + +void * +memload_generator::worker_thrd(void *_tinfo) +{ + auto *tinfo = (struct thread_info *)_tinfo; + long tid; + thr_self(&tid); + + if (tinfo->opts->verbose) { + fprintf( + stdout, "memload_generator : running...\n", tid); + } + tinfo->begin_ts = topo_uptime_ns(); + while (tinfo->num_trans.load() < tinfo->opts->iteration) { + memcpy((char *)tinfo->from_buffer, (char *)tinfo->to_buffer, tinfo->opts->chunk_size); + tinfo->end_ts = topo_uptime_ns(); + tinfo->num_trans.fetch_add(1); + } + + if (tinfo->opts->verbose) { + fprintf( + stdout, "memload_generator : exiting...\n", tid); + } + return nullptr; +} + +memload_generator::memload_generator(cpuset_t * threads, cpuset_t * target_domain, struct memload_generator_options * opt, bool *success) +{ + *success = false; + end.store(0); + std::memcpy(&this->opts, opt, sizeof(memload_generator_options)); + + int nextcore = CPU_FFS(threads) - 1; + int target_domain_id = CPU_FFS(target_domain) - 1; + int num_cores = CPU_COUNT(threads); + if (target_domain_id < 0 || num_cores == 0) { + return; + } + + void * local_buffer; + void * target_buffer; + if (opts.shared_buffer) { + local_buffer = nms_malloc(topo_core_to_numa(nextcore), opt->chunk_size); + target_buffer = nms_malloc(target_domain_id, opt->chunk_size); + } + + int tid = 0; + while (nextcore != -1) { + auto *info = new struct thread_info; + cpuset_t cpuset; + pthread_attr_t attr; + + info->stop = &this->end; + info->num_trans.store(0); + info->begin_ts = 0; + info->end_ts = 0; + info->opts = &this->opts; + if (opt->shared_buffer) { + info->from_buffer = local_buffer; + info->to_buffer = target_buffer; + } else { + info->from_buffer = nms_malloc(topo_core_to_numa(nextcore), opt->chunk_size); + info->to_buffer = nms_malloc(target_domain_id, opt->chunk_size); + } + + CPU_ZERO(&cpuset); + CPU_SET(nextcore, &cpuset); + pthread_attr_init(&attr); + pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &cpuset); + pthread_create(&info->pthr, &attr, worker_thrd, info); + + if (opts.verbose) { + fprintf(stdout, + "memload_generator: created thread %d on core %d target domain %d\n", tid, + nextcore, target_domain_id); + } + + thr_infos.push_back(info); + + CPU_CLR(nextcore, threads); + nextcore = CPU_FFS(threads) - 1; + tid++; + } + + *success = true; + if (opts.verbose) { + fprintf(stdout, + "memload_generator: exiting constructor. Success: %d...\n", + success ? 1 : 0); + } +} + +bool +memload_generator::check_done() +{ + bool done = true; + for (auto i : thr_infos) { + done = done && (i->num_trans.load() >= this->opts.iteration); + } + return done; +} + +uint64_t +memload_generator::get_bps() +{ + uint64_t total_transactions = 0; + uint64_t total_time = 0; + for (auto i : thr_infos) { + total_transactions += i->num_trans.load(); + total_time = (i->end_ts - i->begin_ts) + total_time; + } + return (double)(total_transactions * this->opts.chunk_size) / ((double)total_time / thr_infos.size() / S2NS); +} + +memload_generator::~memload_generator() +{ + for (auto i : thr_infos) { + // XXX: free + pthread_join(i->pthr, NULL); + delete i; + } +} diff --git a/libnm/alloc.cc b/libnm/alloc.cc deleted file mode 100644 index ebed2cb..0000000 --- a/libnm/alloc.cc +++ /dev/null @@ -1,113 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include - -#include "nmp.hh" - -static pthread_mutex_t alloc_lock; -static constexpr unsigned int MEM_OBJ_SIZE = 4096; // 4k -static constexpr unsigned int MEM_OBJ_NUM = 1024 * 256; // 4k * 1024 * 256 = 1GB per region -static constexpr unsigned int MEM_REGION_NUM = 4; // 4 x 1GB = 4GB total -static int nm_mem_idx[NM_MAX_OBJS_PER_LVL]; -static int nm_mem_region_idx[NM_MAX_OBJS_PER_LVL]; -static void* nm_mem_regions[NM_MAX_OBJS_PER_LVL][MEM_REGION_NUM]; - -int -nm_alloc_init() -{ - long tid; - thr_self(&tid); - domainset_t orig_dom; - int orig_policy; - - pthread_mutex_init(&alloc_lock, nullptr); - - DOMAINSET_ZERO(&orig_dom); - - // save existing thread's allocation strategy - int ret = cpuset_getdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, &orig_policy); - if (ret != 0) { - return ret; - } - - domainset_t tmp_domain; - for (int i = 0; i < nm_obj_count(NM_LEVEL_NUMA); i++) { - DOMAINSET_ZERO(&tmp_domain); - DOMAINSET_SET(i, &tmp_domain); - - ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(tmp_domain), &tmp_domain, DOMAINSET_POLICY_PREFER); - if (ret != 0) { - if (nm_get_verbose() > 0) { - fprintf(stdout, "libnm: cpuset_setdomain failed with %d\n", errno); - } - return ret; - } - - for (unsigned int j = 0; j < MEM_REGION_NUM; j++) { - if ((nm_mem_regions[i][j] = mmap(nullptr, MEM_OBJ_NUM * MEM_OBJ_SIZE, PROT_READ | PROT_WRITE, - MAP_ANON | MAP_ALIGNED_SUPER | MAP_NOCORE | MAP_PRIVATE | MAP_NOSYNC, - -1, 0)) == MAP_FAILED) { - if (nm_get_verbose() > 0) { - fprintf(stdout, "libnm: mmap failed with %d\n", errno); - } - return -1; - } - - // touch the pages to prefault the pages - for (unsigned int k = 0; k < MEM_OBJ_NUM; k++) { - *(uint32_t*)((char*)nm_mem_regions[i][j] + k * MEM_OBJ_SIZE) = 0; - } - - if (nm_get_verbose() > 0) { - fprintf(stdout, "libnm: reserved %u bytes (%u MB) on node %d. vaddr: 0x%p\n", MEM_OBJ_NUM * MEM_OBJ_SIZE, MEM_OBJ_SIZE * MEM_OBJ_NUM / 1024 / 1024, i, nm_mem_regions[i][j]); - } - } - - nm_mem_idx[i] = 0; - nm_mem_region_idx[i] = 0; - } - - // restore existing thread's allocation strategy - ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, orig_policy); - - return ret; -} - -void * -nm_malloc(unsigned int node, size_t size) -{ - void * ret = nullptr; - int num_objs = (size + MEM_OBJ_SIZE - 1) / MEM_OBJ_SIZE; - bool retry = false; - - pthread_mutex_lock(&alloc_lock); - int cur_region = nm_mem_region_idx[node]; - int cur_idx = nm_mem_idx[node]; - -retry: - if ((int)MEM_OBJ_NUM - cur_idx >= num_objs) { - ret = (char*)nm_mem_regions[node][cur_region] + MEM_OBJ_SIZE * cur_idx; - nm_mem_region_idx[node] = cur_region; - nm_mem_idx[node] = cur_idx + num_objs; - } else if (!retry && (cur_region < (int)MEM_REGION_NUM)) { - // check next region - cur_region++; - cur_idx = 0; - retry = true; - goto retry; - } - pthread_mutex_unlock(&alloc_lock); - - return ret; -} - -void -nm_free(unsigned int node ATTR_UNUSED, void * addr ATTR_UNUSED) -{ - // dummy function -} \ No newline at end of file diff --git a/libnm/loadgen.cc b/libnm/loadgen.cc deleted file mode 100644 index c0d64dd..0000000 --- a/libnm/loadgen.cc +++ /dev/null @@ -1,217 +0,0 @@ -#include "nmp.hh" -#include - -#include -#include -#include -#include -#include -#include -#include - -void * -memload_generator::worker_thrd(void *_tinfo) -{ - auto *tinfo = (struct thread_info *)_tinfo; - long tid; - thr_self(&tid); - - if (nm_get_verbose() > 0) { - fprintf(stdout, - "memload_generator : from domain %d to %d\n", - tid, tinfo->from_domainid, tinfo->to_domainid); - } - - if (tinfo->from_region == nullptr) { - tinfo->from_region = nm_malloc(tinfo->from_domainid, tinfo->array_size); - } - - if (tinfo->to_region == nullptr) { - tinfo->to_region = nm_malloc(tinfo->to_domainid, tinfo->array_size); - } - - if (tinfo->from_region == nullptr || tinfo->to_region == nullptr) { - tinfo->init_status.store(thread_info::INIT_FAILED); - if (nm_get_verbose() > 0) { - fprintf(stderr, - "memload_generator : failed to allocate memory\n", tid); - } - return nullptr; - } - - // populate the region with 1/2/3s - memset((char*)tinfo->from_region, 1, tinfo->array_size); - memset((char*)tinfo->to_region, 2, tinfo->array_size); - - tinfo->init_status.store(thread_info::INIT_SUCCESS); - - if (nm_get_verbose() > 0) { - fprintf(stdout, - "memload_generator : init finished, from: %p, to: %p, waiting for start...\n", - tid, tinfo->from_region, tinfo->to_region); - } - - while (tinfo->state->load() == STATE_READY) { - }; - - if (nm_get_verbose() > 0) { - fprintf( - stdout, "memload_generator : running...\n", tid); - } - tinfo->begin_ts = nm_get_uptime_ns(); - while (tinfo->state->load() == STATE_START) { - if (tinfo->num_trans.load() < tinfo->iter) { - // generate traffic - memcpy((char *)tinfo->to_region, (char *)tinfo->from_region, tinfo->array_size); - tinfo->num_trans.fetch_add(1); - if (tinfo->num_trans.load() >= tinfo->iter) { - tinfo->stop_ts = nm_get_uptime_ns(); - } - } else { - usleep(1000 * 100); - } - } - - //nm_free(tinfo->from_domainid, tinfo->from_region); - //nm_free(tinfo->to_domainid, tinfo->to_region); - - if (nm_get_verbose() > 0) { - fprintf( - stdout, "memload_generator : exiting...\n", tid); - } - return nullptr; -} - -memload_generator::memload_generator( - uint64_t from_cmask, uint64_t to_cmask, uint32_t array_sz, uint32_t iteration, bool *success) -{ - *success = false; - state.store(STATE_READY); - this->array_size = array_sz; - this->iteration = iteration; - int nextcore; - int to_coreid = cmask_get_next_cpu(&to_cmask); - int num_cores = cmask_get_num_cpus(from_cmask); - cpuset_t cpuset; - - if (to_coreid == NEXT_CPU_NULL || num_cores == 0) { - return; - } - - void * from = nm_malloc(nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, to_coreid), NM_LEVEL_NUMA)), array_sz); - nextcore = cmask_get_next_cpu(&from_cmask); - void * to = nm_malloc(nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, nextcore), NM_LEVEL_NUMA)), array_sz); - while (nextcore != NEXT_CPU_NULL) { - auto *info = new struct thread_info; - pthread_attr_t attr; - info->num_trans.store(0); - info->to_domainid = nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, to_coreid), NM_LEVEL_NUMA)); - info->from_domainid = nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, nextcore), NM_LEVEL_NUMA)); - info->init_status.store(thread_info::INIT_START); - info->state = &state; - info->iter = this->iteration; - info->array_size = this->array_size; - - info->from_region = from; - info->to_region = to; - - CPU_ZERO(&cpuset); - CPU_SET(nextcore, &cpuset); - pthread_attr_init(&attr); - pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &cpuset); - pthread_create(&info->pthr, &attr, worker_thrd, info); - - if (nm_get_verbose() > 0) { - fprintf(stdout, - "memload_generator: created thread on core %d\n", - nextcore); - } - - thr_infos.push_back(info); - nextcore = cmask_get_next_cpu(&from_cmask); - } - - if (nm_get_verbose() > 0) { - fprintf( - stdout, "memload_generator: waiting for thread init...\n"); - } - - bool failed = false; - uint num_success = 0; - while (num_success < thr_infos.size() && !failed) { - num_success = 0; - for (auto i : thr_infos) { - if (i->init_status.load() == - thread_info::INIT_SUCCESS) { - num_success++; - } - if (i->init_status.load() == thread_info::INIT_FAILED) { - failed = true; - } - } - } - - *success = num_success == thr_infos.size(); - if (nm_get_verbose() > 0) { - fprintf(stdout, - "memload_generator: exiting constructor. Success: %d...\n", - success ? 1 : 0); - } -} - -void -memload_generator::start() -{ - if (this->state.load() == STATE_READY) { - state.store(STATE_START); - } -} - -void -memload_generator::stop() -{ - if (this->state.load() == STATE_START) { - state.store(STATE_STOP); - } -} - -bool -memload_generator::check_done() -{ - bool done = true; - for (auto i : thr_infos) { - done = done && (i->num_trans.load() >= this->iteration); - } - if (done) { - stop(); - } - return done; -} - -uint64_t -memload_generator::get_bps() -{ - if (this->state.load() == STATE_STOP) { - uint64_t total_transactions = 0; - uint64_t total_time = 0; - for (auto i : thr_infos) { - total_transactions += i->num_trans.load(); - total_time = (i->stop_ts - i->begin_ts) + total_time; - } - return (double)(total_transactions * this->array_size) / ((double)total_time / thr_infos.size() / S2NS); - } else { - return 0; - } -} - -memload_generator::~memload_generator() -{ - if (this->state.load() != STATE_STOP) { - stop(); - } - - for (auto i : thr_infos) { - pthread_join(i->pthr, nullptr); - delete i; - } -} diff --git a/libnm/nm.cc b/libnm/nm.cc deleted file mode 100644 index a487245..0000000 --- a/libnm/nm.cc +++ /dev/null @@ -1,72 +0,0 @@ -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "nmp.hh" -#include "defs.hh" - -static const char *SYSCTL_TSC = "machdep.tsc_freq"; -static uint64_t sysctl_tsc_freq = 0; - -static int verbose = 0; - -int nm_get_verbose() -{ - return verbose; -} - -uint64_t -nm_get_uptime_ns() -{ - unsigned int dummy; - _mm_lfence(); - uint64_t tsc = __rdtscp(&dummy); - _mm_lfence(); - return nm_tsc2ns(tsc); -} - -uint64_t -nm_tsc2ns(uint64_t tsc) -{ - return (uint64_t)( - (double)tsc / (double)sysctl_tsc_freq * S2NS); -} - -// 0 on success -// -1 on error -int -nm_init(int verbosity) -{ - int ret; - size_t sz = sizeof(sysctl_tsc_freq); - verbose = verbosity; - - // init nm_tsc2ns - if ((ret = sysctlbyname( - SYSCTL_TSC, &sysctl_tsc_freq, &sz, nullptr, 0)) < 0) { - if (nm_get_verbose()) { - fprintf(stderr, - "libnm: failed to query tsc frequency via sysctl (%d)\n", errno); - } - return ret; - } - - if (nm_get_verbose()) { - fprintf(stdout, "libnm: tsc frequency: %lu\n", sysctl_tsc_freq); - } - - ret = nm_topo_init(); - if (ret != 0) { - return ret; - } - - ret = nm_alloc_init(); - - return ret; -} diff --git a/libnm/nmp.hh b/libnm/nmp.hh deleted file mode 100644 index 48e377c..0000000 --- a/libnm/nmp.hh +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -#include "nm.hh" - -int nm_topo_init(); - -int nm_alloc_init(); - -int nm_get_verbose(); diff --git a/libnm/topo.cc b/libnm/topo.cc deleted file mode 100644 index 716728b..0000000 --- a/libnm/topo.cc +++ /dev/null @@ -1,204 +0,0 @@ -#include "nmp.hh" - -#include -#include -#include - -constexpr static int NM_MAX_CHILDREN = 128; - -struct nm_obj { - int level; - int id; - struct nm_obj *parent; - - int num_children; - struct nm_obj * children[NM_MAX_CHILDREN]; -}; - -static int size_tbl[NM_MAX_LEVEL] = { 0 }; -static struct nm_obj * obj_tbl[NM_MAX_LEVEL][NM_MAX_OBJS_PER_LVL]; - -static hwloc_obj_t -get_parent_type(hwloc_obj_t obj, hwloc_obj_type_t type) -{ - obj = obj->parent; - while (obj != nullptr) { - if (obj->type == type) { - return obj; - } - obj = obj->parent; - } - return nullptr; -} - -// static int -// obj_comparator(const void * a, const void * b) -// { -// return ((struct nm_obj *)a)->id - ((struct nm_obj *)b)->id; -// } - -static inline int -is_level_valid(int level) -{ - return level < (int)NM_MAX_LEVEL; -} - -int -nm_obj_count(int level) -{ - if (is_level_valid(level)) { - return size_tbl[level]; - } - return -1; -} - -struct nm_obj * -nm_obj_from_id(int level, int id) -{ - if (is_level_valid(level) && id <= size_tbl[level]) { - return obj_tbl[level][id]; - } - return nullptr; -} - -struct nm_obj * -nm_obj_find_parent(struct nm_obj * start, int parent_level) -{ - struct nm_obj * ret = nullptr; - while (start != nullptr) { - if (parent_level == start->level) { - ret = start; - break; - } - start = start->parent; - } - return ret; -} - -int -nm_obj_get_id(struct nm_obj * obj) -{ - return obj->id; -} - -int -nm_obj_get_level(struct nm_obj * obj) -{ - return obj->level; -} - -static int -validate_objs(int level) -{ - for (int i = 0; i < size_tbl[level]; i++) { - struct nm_obj * each = obj_tbl[level][i]; - if (each->id != i) { - return 0; - } - } - return 1; -} - -static int -add_obj(int id, int level, struct nm_obj * parent) -{ - if (size_tbl[level] >= NM_MAX_OBJS_PER_LVL) { - - return -1; - } - - auto each = (struct nm_obj *)malloc(sizeof(struct nm_obj)); - - each->id = id; - each->level = level; - each->num_children = 0; - each->parent = parent; - - if (parent != nullptr) { - // add children - if (parent->num_children >= NM_MAX_CHILDREN) { - return -1; - } else { - parent->children[parent->num_children] = each; - parent->num_children++; - } - } - - obj_tbl[level][size_tbl[level]] = each; - size_tbl[level]++; - - return 0; -} - -static int -add_level(hwloc_topology * topo, hwloc_obj_type_t hwloc_level, hwloc_obj_type_t hwloc_plevel, int level, int plevel) -{ - // populate numa nodes - hwloc_obj_t obj = nullptr; - hwloc_obj_t pobj = nullptr; - struct nm_obj *parent = nullptr; - while (true) { - - obj = hwloc_get_next_obj_by_type(topo, hwloc_level, obj); - if (obj == nullptr) { - break; - } - - pobj = get_parent_type(obj, hwloc_plevel); - - if (pobj != nullptr) { - parent = obj_tbl[plevel][pobj->logical_index]; - } - - if (add_obj(obj->logical_index, level, parent) != 0) { - if (nm_get_verbose() > 0) { - fprintf(stderr, "libnm: failed to add object %d.\n", obj->logical_index); - } - return -1; - } - - if (nm_get_verbose() > 0) { - fprintf(stdout, "libnm: identified id %d type %d parent %d type %d\n", obj->logical_index, level, parent == nullptr ? -1 : parent->id, plevel); - } - } - - // sort - // std::qsort(obj_tbl[level], size_tbl[level], sizeof(struct nm_obj), obj_comparator); - if (!validate_objs(level)) { - if (nm_get_verbose() > 0) { - fprintf(stdout, "libnm: objects are shuffled at level %d.\n", level); - } - return -1; - } - - return 0; -} - -int -nm_topo_init() -{ - int ret; - - // init numa stuff - hwloc_topology *topo; - if ((ret = hwloc_topology_init(&topo)) != 0) { - return ret; - } - - if ((ret = hwloc_topology_load(topo)) != 0) - return ret; - - if ((ret = add_level(topo, HWLOC_OBJ_PACKAGE, HWLOC_OBJ_PACKAGE, NM_LEVEL_NUMA, NM_LEVEL_NUMA)) != 0) { - return ret; - } - - if ((ret = add_level(topo, HWLOC_OBJ_CORE, HWLOC_OBJ_PACKAGE, NM_LEVEL_CPU, NM_LEVEL_NUMA)) != 0) { - return ret; - } - - if ((ret = add_level(topo, HWLOC_OBJ_PU, HWLOC_OBJ_CORE, NM_LEVEL_CORE, NM_LEVEL_CPU)) != 0) { - return ret; - } - - return ret; -} diff --git a/libnms/alloc.c b/libnms/alloc.c new file mode 100644 index 0000000..9a91667 --- /dev/null +++ b/libnms/alloc.c @@ -0,0 +1,196 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define MAX_NUMA_DOMAINS (64) +#define MAX_REGIONS (64) +#define REGION_SIZE (1024 * 1024 * 1024) +#define MALLOC_UNIT (4096) + +struct nms_region { + uintptr_t start_addr; + size_t size; + size_t occupied; +}; + +struct nms_desc { + // alloc + pthread_mutex_t alloc_lock; + + struct nms_region regions[MAX_NUMA_DOMAINS][MAX_REGIONS]; + int region_sz[MAX_NUMA_DOMAINS]; +}; + +static _Atomic(int) initialized = 0; +static struct nms_desc g_desc; + +static void * +nms_alloc_region(int nodeid, size_t sz) +{ + long tid; + domainset_t orig_dom; + int orig_policy; + void * region; + + thr_self(&tid); + DOMAINSET_ZERO(&orig_dom); + + // save existing thread's allocation strategy + int ret = cpuset_getdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, &orig_policy); + if (ret != 0) { + fprintf(stderr, "libnms: cpuset_getdomain failed with %d\n", errno); + return NULL; + } + + domainset_t tmp_domain; + DOMAINSET_ZERO(&tmp_domain); + DOMAINSET_SET(nodeid, &tmp_domain); + + ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(tmp_domain), &tmp_domain, DOMAINSET_POLICY_ROUNDROBIN); + if (ret != 0) { + fprintf(stderr, "libnms: cpuset_setdomain failed with %d\n", errno); + return NULL; + } + + if ((region = mmap(NULL, REGION_SIZE, PROT_READ | PROT_WRITE, MAP_ANON | MAP_ALIGNED_SUPER | MAP_NOCORE | MAP_PRIVATE | MAP_NOSYNC, -1, 0)) == MAP_FAILED) { + fprintf(stderr, "libnms: mmap failed with %d\n", errno); + return NULL; + } + + // touch the pages to prefault the pages + for (size_t i = 0; i < REGION_SIZE; i += MALLOC_UNIT) { + *(uint8_t*)((uintptr_t)region + i) = 0; + } + + // restore existing thread's allocation strategy + ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, orig_policy); + if (ret != 0) { + fprintf(stderr, "libnms: cpuset_setdomain failed with %d\n", errno); + munmap(region, REGION_SIZE); + return NULL; + } + + return region; +} + +static int +nms_desc_init(struct nms_desc * desc, int verbose) +{ + memset(desc, 0, sizeof(struct nms_desc)); + pthread_mutex_init(&desc->alloc_lock, NULL); + return 0; +} + +static void * +nms_region_malloc(struct nms_region * region, size_t size) +{ + void * ret = NULL; + if (region->size >= region->occupied + size) { + ret = (void *)(region->start_addr + region->occupied); + region->occupied += size; + region->occupied = (region->occupied + MALLOC_UNIT - 1) & ~(MALLOC_UNIT - 1); + } + return ret; +} + +static int +nms_desc_add_region(struct nms_desc * desc, int nodeid, size_t size) +{ + void * ret; + int idx; + + ret = nms_alloc_region(nodeid, REGION_SIZE); + if (ret == NULL) { + fprintf(stderr, "libnms: failed to allocate region on node %d\n", nodeid); + return ENOMEM; + } + + desc->region_sz[nodeid]++; + idx = desc->region_sz[nodeid] - 1; + desc->regions[nodeid][idx].start_addr = (uintptr_t)ret; + desc->regions[nodeid][idx].occupied = 0; + desc->regions[nodeid][idx].size = REGION_SIZE; + + return 0; +} + +static void * +nms_desc_malloc(struct nms_desc * desc, unsigned int nodeid, size_t size) +{ + void * ret = NULL; + int idx; + int new_region = 0; + + if (size > REGION_SIZE) { + return NULL; + } + + pthread_mutex_lock(&desc->alloc_lock); + +retry: + if (desc->region_sz[nodeid] > 0) { + idx = desc->region_sz[nodeid] - 1; + ret = nms_region_malloc(&desc->regions[nodeid][idx], size); + } + + if (ret == NULL) { + // we need a new region + if (nms_desc_add_region(desc, nodeid, REGION_SIZE) != 0) { + pthread_mutex_unlock(&desc->alloc_lock); + return NULL; + } + fprintf(stdout, "libnms: request of size %zu -> allocated new region on node %d\n", size, nodeid); + goto retry; + } + + pthread_mutex_unlock(&desc->alloc_lock); + return ret; +} + +static void +nms_desc_free(struct nms_desc * desc __attribute__((unused)), unsigned int node __attribute__((unused)), void * addr __attribute__((unused))) +{ + // dummy function +} + +int +nms_init(int verbose) +{ + int expected = 0; + if (atomic_compare_exchange_strong(&initialized, &expected, 2)) { + nms_desc_init(&g_desc, verbose); + atomic_store(&initialized, 1); + } else { + while(atomic_load(&initialized) != 1) { + } + fprintf(stdout,"libnms: already initialized.\n"); + } + + return 0; +} + +void * +nms_malloc(int nodeid, size_t sz) +{ + assert(atomic_load(&initialized) == 1); + return nms_desc_malloc(&g_desc, nodeid, sz); +} + +void +nms_free(int nodeid, void * addr) +{ + assert(atomic_load(&initialized) == 1); + nms_desc_free(&g_desc, nodeid, addr); +} + diff --git a/net/cat.cc b/net/cat.cc index 6c60f98..938efe3 100644 --- a/net/cat.cc +++ b/net/cat.cc @@ -1,3 +1,12 @@ +#include +#include +#include +#include +#include +#include +#include + +#include #include #include #include @@ -10,29 +19,16 @@ #include #include -#include "gen.hh" -#include "nm.hh" #include "ntr.h" +#include "gen.hh" +#include "net/netsup.hh" #include "net/pkt.hh" -#include "net/util.hh" +#include "nms.h" -#include -#include -#include -#include -#include - -#define MBUF_MAX_COUNT (rte_lcore_count() * 4096) -constexpr static unsigned int MBUF_CACHE_SIZE = 512; -constexpr static unsigned int RX_RING_SIZE = 1024; -constexpr static unsigned int TX_RING_SIZE = 1024; constexpr static unsigned int BURST_SIZE = 32; constexpr static unsigned int MAX_SLAVES = 32; constexpr static unsigned int SLAVES_MAX_WAIT_MS = 1000; -static const struct rte_eth_conf port_conf_default { -}; - struct datapt { uint32_t epoch; uint32_t valid; @@ -59,23 +55,21 @@ struct options_t { char ia_gen_str[256] = "fixed"; unsigned int target_qps { 0 }; unsigned int master_mode { 0 }; - struct net_spec server_spec { - }; + struct net_spec server_spec { }; cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 2nd core std::vector slaves; uint32_t pkt_loss_failure_threshold { 0 }; uint32_t pkt_loss_time_ms { UINT32_MAX }; + int portid { 0 }; // states - struct rte_mempool *mbuf_pool { nullptr }; - struct net_spec s_host_spec { - }; + struct net_spec s_host_spec { }; struct conn_spec s_host_conn { .src = &s_host_spec, .dst = &server_spec, .dst_port = POU_PORT }; - uint16_t s_portid { 0 }; unsigned int s_rxqid { 0 }; unsigned int s_txqid { 0 }; + unsigned int s_socketid { 0 }; // for qps calculation std::atomic s_recved_pkts { 0 }; std::atomic s_pkt_loss { 0 }; @@ -97,14 +91,13 @@ struct options_t { static struct options_t options; static uint16_t -rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused, +rx_add_timestamp(uint16_t port, uint16_t qidx __rte_unused, struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused, void *_ __rte_unused) { - uint64_t now = nm_tsc2ns(rte_rdtsc()); + uint64_t now = topo_uptime_ns(); struct pkt_hdr *pkt_data; - struct timespec ts { - }; + struct timespec ts { }; int ret; if (options.s_state != STATE_SENT) { @@ -112,8 +105,8 @@ rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused, } for (int i = 0; i < nb_pkts; i++) { - pkt_data = check_valid_packet( - pkts[i], &options.s_host_spec.mac_addr); + pkt_data = check_valid_packet(pkts[i], + &options.s_host_spec.mac_addr); if (pkt_data == nullptr) { ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, @@ -165,16 +158,16 @@ static uint16_t tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused, struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused) { - uint64_t now = nm_tsc2ns(rte_rdtsc()); + uint64_t now = topo_uptime_ns(); struct pkt_hdr *pkt_data; - if (options.s_state != STATE_SENT) { - return nb_pkts; - } + // if (options.s_state != STATE_SENT) { + // return nb_pkts; + // } for (int i = 0; i < nb_pkts; i++) { - pkt_data = check_valid_packet( - pkts[i], &options.s_host_spec.mac_addr); + pkt_data = check_valid_packet(pkts[i], + &options.s_host_spec.mac_addr); if (pkt_data == nullptr) { ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, @@ -214,6 +207,7 @@ static void send_all_slaves(uint16_t type) { struct rte_mbuf *tx_bufs[MAX_SLAVES]; + //struct rte_eth_stats stats; struct conn_spec cspec; cspec.src = &options.s_host_spec; @@ -224,13 +218,18 @@ send_all_slaves(uint16_t type) for (unsigned int i = 0; i < options.slaves.size(); i++) { struct pkt_hdr *hdr; cspec.dst = options.slaves.at(i); - if (alloc_pkt_hdr(options.mbuf_pool, type, &cspec, 0, &tx_bufs[i], - &hdr) != 0) { + if (alloc_pkt_hdr(mempool_get(options.s_socketid), type, &cspec, 0, + &tx_bufs[i], &hdr) != 0) { rte_exit(EXIT_FAILURE, "failed to alloc packet\n"); } } + + // if (rte_eth_stats_get(options.portid, &stats) != 0 ) { + // rte_exit(EXIT_FAILURE, "failed!"); + // } + // printf("send_all_slaves: ipackets %lu, opackets %lu, ierrors %lu, oerrors %lu\n", stats.ipackets, stats.opackets, stats.ierrors, stats.oerrors); - if (rte_eth_tx_burst(options.s_portid, options.s_txqid, tx_bufs, + if (rte_eth_tx_burst(options.portid, options.s_txqid, tx_bufs, options.slaves.size()) != options.slaves.size()) { rte_exit(EXIT_FAILURE, "failed to send some packets\n"); } @@ -243,14 +242,14 @@ wait_for_slaves(uint16_t etype, struct rte_mbuf **out) { struct rte_mbuf *tx_bufs[MAX_SLAVES]; bool stop = false; - const uint64_t start = nm_get_uptime_ns(); + const uint64_t start = topo_uptime_ns(); std::vector recved; uint32_t tot = 0; while (!stop) { - uint64_t now = nm_get_uptime_ns(); - const uint16_t nb_rx = rte_eth_rx_burst( - options.s_portid, options.s_rxqid, tx_bufs, MAX_SLAVES); + uint64_t now = topo_uptime_ns(); + const uint16_t nb_rx = rte_eth_rx_burst(options.portid, + options.s_rxqid, tx_bufs, MAX_SLAVES); if (nb_rx > 0) { for (unsigned int i = 0; i < nb_rx; i++) { @@ -275,7 +274,7 @@ wait_for_slaves(uint16_t etype, struct rte_mbuf **out) if (rte_is_same_ether_addr( &eaddr->mac_addr, &each->eth_hdr - .s_addr)) { + .src_addr)) { invalid = false; break; } @@ -298,7 +297,7 @@ wait_for_slaves(uint16_t etype, struct rte_mbuf **out) if (rte_is_same_ether_addr( eaddr, &each->eth_hdr - .s_addr)) { + .src_addr)) { invalid = true; break; } @@ -314,7 +313,8 @@ wait_for_slaves(uint16_t etype, struct rte_mbuf **out) goto end_loop; } - recved.push_back(&each->eth_hdr.s_addr); + recved.push_back( + &each->eth_hdr.src_addr); if (recved.size() == options.slaves.size()) { @@ -337,9 +337,15 @@ wait_for_slaves(uint16_t etype, struct rte_mbuf **out) } } + // struct rte_eth_stats stats; + // if (rte_eth_stats_get(options.portid, &stats) != 0 ) { + // rte_exit(EXIT_FAILURE, "failed!"); + // } + //printf("wait_slaves : ipackets %lu, opackets %lu, ierrors %lu, oerrors %lu\n", stats.ipackets, stats.opackets, stats.ierrors, stats.oerrors); + if (now - start > SLAVES_MAX_WAIT_MS * MS2NS) { - rte_exit( - EXIT_FAILURE, "cat: waiting for too long %d. I QUIT!!", etype); + rte_exit(EXIT_FAILURE, + "cat: waiting for too long %d. I QUIT!!", etype); } } } @@ -356,25 +362,25 @@ pkt_loop() bool recv_stat = true; bool recv_resp = true; - if (rte_eth_dev_socket_id(options.s_portid) > 0 && - rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) { + if (rte_eth_dev_socket_id(options.portid) > 0 && + rte_eth_dev_socket_id(options.portid) != (int)rte_socket_id()) { ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "locore_main: WARNING, port %d is on remote NUMA node to " "polling thread.\n\tPerformance will " "not be optimal.\n", - options.s_portid); + options.portid); } - uint64_t next_ts = nm_get_uptime_ns(); + uint64_t next_ts = topo_uptime_ns(); uint64_t last_send_ts = next_ts; bool is_last_pkt_lost = false; uint32_t num_cts_pkt_lost = 0; while (!options.s_stop.load()) { - uint64_t now = nm_get_uptime_ns(); + uint64_t now = topo_uptime_ns(); // always pop incoming packets - const uint16_t nb_rx = rte_eth_rx_burst( - options.s_portid, options.s_rxqid, rx_bufs, BURST_SIZE); + const uint16_t nb_rx = rte_eth_rx_burst(options.portid, + options.s_rxqid, rx_bufs, BURST_SIZE); if (nb_rx > 0) { for (int i = 0; i < nb_rx; i++) { @@ -398,7 +404,7 @@ pkt_loop() uint16_t type = rte_be_to_cpu_16(each->type); NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, each, - "locore_main: "); + "locore_main: received packet %p ", each); struct pkt_payload_epoch *pld_epoch; struct pkt_payload_stat *pld_stat; uint32_t epoch; @@ -409,6 +415,8 @@ pkt_loop() epoch = rte_be_to_cpu_32( pld_epoch->epoch); + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "lcore_main: PROBE_RESP received packet %p epoch %d\n", each, epoch); + if (options.s_last_datapt == nullptr || epoch != options.s_last_datapt->epoch) { @@ -466,9 +474,10 @@ pkt_loop() if (options.s_state == STATE_SENT) { // check if hw ts is read if (!read_tx) { + int ret; struct timespec ts; - if (rte_eth_timesync_read_tx_timestamp( - options.s_portid, &ts) == 0) { + if ((ret = rte_eth_timesync_read_tx_timestamp( + options.portid, &ts)) == 0) { ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: read hw tx timestamp %lu.\n", (ts.tv_nsec + ts.tv_sec * S2NS)); @@ -553,7 +562,7 @@ pkt_loop() S2NS); options.s_host_conn.src_port = port_gen.next(); - if (alloc_pkt_hdr(options.mbuf_pool, + if (alloc_pkt_hdr(mempool_get(options.s_socketid), PKT_TYPE_PROBE, &options.s_host_conn, 0, &tx_buf, &pkt_data) != 0) { rte_exit(EXIT_FAILURE, @@ -570,24 +579,25 @@ pkt_loop() options.s_last_datapt->valid = options.s_record.load(); - read_tx = false; - recv_resp = false; - recv_stat = false; last_send_ts = now; - options.s_state = STATE_SENT; - + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, - "locore_main: sending packet %p with epoch %d\n", + "locore_main: sending packet 0x%p with epoch %d\n", (void *)tx_buf, epoch); - const uint16_t nb_tx = rte_eth_tx_burst( - options.s_portid, options.s_txqid, &tx_buf, - 1); + const uint16_t nb_tx = + rte_eth_tx_burst(options.portid, + options.s_txqid, &tx_buf, 1); if (nb_tx != 1) { rte_exit(EXIT_FAILURE, "failed to send packet 0x%p, epoch %d\n", (void *)tx_buf, epoch); } + + read_tx = false; + recv_resp = false; + recv_stat = false; + options.s_state = STATE_SENT; } } } @@ -611,9 +621,9 @@ locore_main(void *tif __rte_unused) wait_for_slaves(PKT_TYPE_SYNC_ACK, nullptr); } - options.s_start_time.store(nm_get_uptime_ns()); + options.s_start_time.store(topo_uptime_ns()); pkt_loop(); - options.s_end_time.store(nm_get_uptime_ns()); + options.s_end_time.store(topo_uptime_ns()); if (options.master_mode == 1) { ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, @@ -627,8 +637,8 @@ locore_main(void *tif __rte_unused) for (unsigned int i = 0; i < options.slaves.size(); i++) { // these packets already underwent validity check in // wait_for_slaves - auto pkt_hdr = rte_pktmbuf_mtod( - mbufs[i], struct pkt_hdr *); + auto pkt_hdr = rte_pktmbuf_mtod(mbufs[i], + struct pkt_hdr *); auto pld_qps = (struct pkt_payload_qps *) pkt_hdr->payload; uint32_t qps = rte_be_to_cpu_32(pld_qps->qps); @@ -650,95 +660,6 @@ locore_main(void *tif __rte_unused) return 0; } -static int -port_init(uint16_t portid, struct rte_mempool *mbuf_pool) -{ - struct rte_eth_dev_info dev_info { - }; - struct rte_eth_conf port_conf = port_conf_default; - struct rte_eth_txconf txconf { - }; - struct rte_eth_rxconf rxconf { - }; - - uint16_t nb_rxd = RX_RING_SIZE; - uint16_t nb_txd = TX_RING_SIZE; - - if (!rte_eth_dev_is_valid_port(portid)) { - return -1; - } - - int ret = rte_eth_dev_info_get(portid, &dev_info); - if (ret != 0) { - return ret; - } - - port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN; - port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; - port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH; - port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM; - port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM; - port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM; - port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM; - port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE; - - /* Configure the Ethernet device. */ - ret = rte_eth_dev_configure(portid, 1, 1, &port_conf); - if (ret != 0) - return ret; - - ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd); - if (ret != 0) - return ret; - - /* Allocate and set up 1 RX queue per thread . */ - rxconf = dev_info.default_rxconf; - rxconf.offloads = port_conf.rxmode.offloads; - for (uint32_t i = 0; i < 1; i++) { - ret = rte_eth_rx_queue_setup(portid, options.s_rxqid, nb_rxd, - rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool); - if (ret < 0) - return ret; - } - - txconf = dev_info.default_txconf; - txconf.offloads = port_conf.txmode.offloads; - /* Allocate and set up 1 TX queue per Ethernet port. */ - for (uint32_t i = 0; i < 1; i++) { - ret = rte_eth_tx_queue_setup(portid, options.s_txqid, nb_txd, - rte_eth_dev_socket_id(portid), &txconf); - if (ret < 0) - return ret; - } - - ret = rte_eth_dev_start(portid); - if (ret < 0) - return ret; - - /* Display the port MAC address. */ - struct rte_ether_addr addr { - }; - ret = rte_eth_macaddr_get(portid, &addr); - if (ret != 0) - return ret; - - ret = rte_eth_timesync_enable(portid); - if (ret != 0) - return ret; - - /* Enable RX in promiscuous mode for the Ethernet device. */ - ret = rte_eth_promiscuous_enable(portid); - if (ret != 0) - return ret; - - rte_eth_add_tx_callback( - portid, options.s_rxqid, tx_add_timestamp, nullptr); - rte_eth_add_rx_callback( - portid, options.s_txqid, rx_add_timestamp, nullptr); - - return 0; -} - static void dump_options() { @@ -753,11 +674,13 @@ dump_options() " target qps = %d\n" " host IP = 0x%x\n" " pkt loss time = %u\n" - " pkt loss failure threshold = %u\n", + " pkt loss failure threshold = %u\n" + " portid = %d\n", ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.run_time, options.warmup_time, options.output, CPU_COUNT(&options.cpu_set), options.ia_gen_str, options.target_qps, options.s_host_spec.ip, - options.pkt_loss_time_ms, options.pkt_loss_failure_threshold); + options.pkt_loss_time_ms, options.pkt_loss_failure_threshold, + options.portid); for (auto slave : options.slaves) { ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, @@ -794,8 +717,6 @@ usage() int main(int argc, char *argv[]) { - unsigned int nb_ports; - struct rte_mempool *mbuf_pool; std::ofstream log_file; bool has_host_spec = false; @@ -816,7 +737,7 @@ main(int argc, char *argv[]) int c; // parse arguments struct net_spec *ns; - while ((c = getopt(argc, argv, "vs:S:t:T:ho:A:i:q:H:L:l:")) != + while ((c = getopt(argc, argv, "vs:S:t:T:ho:A:i:q:H:L:l:p:")) != -1) { switch (c) { case 'v': @@ -824,8 +745,8 @@ main(int argc, char *argv[]) ntr_get_level(NTR_DEP_USER1) + 1); break; case 's': - if (str_to_netspec( - optarg, &options.server_spec) != 0) { + if (str_to_netspec(optarg, + &options.server_spec) != 0) { rte_exit(EXIT_FAILURE, "invalid server net spec.\n"); } @@ -839,16 +760,16 @@ main(int argc, char *argv[]) options.slaves.push_back(ns); options.master_mode = 1; if (options.slaves.size() > MAX_SLAVES) { - rte_exit( - EXIT_FAILURE, "too many rats.\n"); + rte_exit(EXIT_FAILURE, + "too many rats.\n"); } break; case 't': options.run_time = strtol(optarg, nullptr, 10); break; case 'T': - options.warmup_time = strtol( - optarg, nullptr, 10); + options.warmup_time = strtol(optarg, nullptr, + 10); break; case 'h': usage(); @@ -865,32 +786,35 @@ main(int argc, char *argv[]) sizeof(options.ia_gen_str) - 1); break; case 'q': - options.target_qps = strtoul( - optarg, nullptr, 10); + options.target_qps = strtoul(optarg, nullptr, + 10); break; case 'H': has_host_spec = true; - if (str_to_netspec( - optarg, &options.s_host_spec) != 0) { + if (str_to_netspec(optarg, + &options.s_host_spec) != 0) { rte_exit(EXIT_FAILURE, "invalid host net spec.\n"); } break; case 'L': - options.pkt_loss_failure_threshold = strtoul( - optarg, nullptr, 10); + options.pkt_loss_failure_threshold = + strtoul(optarg, nullptr, 10); break; case 'l': - options.pkt_loss_time_ms = strtoul( - optarg, nullptr, 10); + options.pkt_loss_time_ms = strtoul(optarg, + nullptr, 10); if (options.pkt_loss_time_ms == 0) { options.pkt_loss_time_ms = UINT32_MAX; } break; + case 'p': + options.portid = strtol(optarg, nullptr, 10); + break; default: usage(); - rte_exit( - EXIT_FAILURE, "unknown argument: %c\n", c); + rte_exit(EXIT_FAILURE, "unknown argument: %c\n", + c); } } } @@ -899,13 +823,68 @@ main(int argc, char *argv[]) rte_exit(EXIT_FAILURE, "must specify host IP\n"); } - // init nm - if (nm_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) { - rte_exit(EXIT_FAILURE, "nm init failed!\n"); + // init libtopo + if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != + 0) { + rte_exit(EXIT_FAILURE, "libtopo init failed!\n"); } + // init nms + if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) { + rte_exit(EXIT_FAILURE, "failed to init libnms!\n"); + } + + if (CPU_COUNT(&options.cpu_set) != 1) { + rte_exit(EXIT_FAILURE, "must specify exactly one core\n"); + } + int core_id = CPU_FFS(&options.cpu_set) - 1; + dump_options(); + // configure memory and port + struct port_conf pconf; + struct device_conf dconf; + struct mem_conf mconf; + portconf_get(options.portid, &pconf); + + dconf.mtu = MAX_STANDARD_MTU; + dconf.num_threads = 1; + dconf.portid = options.portid; + dconf.rss_hf = pconf.rss_hf; + dconf.rx_offloads = pconf.rxoffload; + dconf.tx_offloads = pconf.txoffload; + dconf.timesync = pconf.timesync; + + dconf.rx_fn = rx_add_timestamp; + dconf.rx_user = nullptr; + dconf.rx_ring_sz = 2048; + dconf.tx_fn = tx_add_timestamp; + dconf.tx_user = nullptr; + dconf.tx_ring_sz = 2048; + + mconf.cache_size = 64; + mconf.priv_size = 0; + mconf.num_elements = 4096; + mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_STANDARD_MTU; + mconf.max_pools = -1; + + dpdk_init(&dconf, &mconf); + + if (rte_eth_macaddr_get(options.portid, + &options.s_host_spec.mac_addr) != 0) { + rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n", + options.portid); + } + + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", + options.portid, options.s_host_spec.mac_addr.addr_bytes[0], + options.s_host_spec.mac_addr.addr_bytes[1], + options.s_host_spec.mac_addr.addr_bytes[2], + options.s_host_spec.mac_addr.addr_bytes[3], + options.s_host_spec.mac_addr.addr_bytes[4], + options.s_host_spec.mac_addr.addr_bytes[5]); + // create default generator options.s_iagen = createGenerator(options.ia_gen_str); if (options.s_iagen == nullptr) { @@ -921,50 +900,6 @@ main(int argc, char *argv[]) options.output); } - nb_ports = rte_eth_dev_count_avail(); - if (nb_ports == 0) { - rte_exit(EXIT_FAILURE, "number of ports must be > 0\n"); - } - - uint16_t portid = rte_eth_find_next(0); - if (portid == RTE_MAX_ETHPORTS) { - rte_exit(EXIT_FAILURE, "cannot find an available port\n"); - } - options.s_portid = portid; - - if (rte_eth_macaddr_get(portid, &options.s_host_spec.mac_addr) != 0) { - rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n", - portid); - } - - // create a mbuf memory pool on the socket - mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT, - MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, - rte_eth_dev_socket_id(options.s_portid)); - if (mbuf_pool == nullptr) { - rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n"); - } - options.mbuf_pool = mbuf_pool; - - if (port_init(portid, mbuf_pool) != 0) { - rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid); - } - - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, - "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid, - options.s_host_spec.mac_addr.addr_bytes[0], - options.s_host_spec.mac_addr.addr_bytes[1], - options.s_host_spec.mac_addr.addr_bytes[2], - options.s_host_spec.mac_addr.addr_bytes[3], - options.s_host_spec.mac_addr.addr_bytes[4], - options.s_host_spec.mac_addr.addr_bytes[5]); - - int core_id = CPU_FFS(&options.cpu_set); - if (core_id == 0) { - rte_exit(EXIT_FAILURE, "invalid cpu list\n"); - } - core_id--; - sleep(INIT_DELAY); ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: launching thread on core %d\n", core_id); @@ -1014,11 +949,13 @@ main(int argc, char *argv[]) } log_file.close(); - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "qps = %d, recved = %d, loss = %d, slave recved = %d, slave loss = %d\n", - qps, options.s_recved_pkts.load(), options.s_pkt_loss.load(), options.s_slave_recved.load(), options.s_slave_loss.load()); + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + "qps = %d, recved = %d, loss = %d, slave recved = %d, slave loss = %d\n", + qps, options.s_recved_pkts.load(), options.s_pkt_loss.load(), + options.s_slave_recved.load(), options.s_slave_loss.load()); // clean up - rte_eth_dev_stop(portid); + dpdk_cleanup(&dconf); return 0; } \ No newline at end of file diff --git a/net/khat.cc b/net/khat.cc index 63dc393..a4d0bf1 100644 --- a/net/khat.cc +++ b/net/khat.cc @@ -1,3 +1,16 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + #include #include #include @@ -7,40 +20,27 @@ #include #include #include -#include -#include "nm.hh" #include "ntr.h" + +#include "gen.hh" +#include "net/netsup.hh" #include "net/pkt.hh" -#include "net/util.hh" +#include "nms.h" -#include -#include -#include -#include -#include -#include -#include -#include - -#define MBUF_MAX_COUNT (rte_lcore_count() * 4096) -constexpr static unsigned int MBUF_CACHE_SIZE = 512; -constexpr static unsigned int RX_RING_SIZE = 2048; -constexpr static unsigned int TX_RING_SIZE = 2048; constexpr static unsigned int BURST_SIZE = 32; constexpr static unsigned int CACHELINE_SIZE = 64; constexpr static uint16_t THREAD_LOAD_BUFFER_SZ = 16384; -constexpr static size_t MEMPOOL_NAME_BUF_LEN = 64; -static const struct rte_mbuf_dynfield rte_mbuf_dynfield_probe_flag = { - .name = "rte_mbuf_dynfield_probe_flag", - .size = sizeof(uint32_t), - .align = __alignof__(uint32_t), - .flags = 0 -}; - -static int PROBE_FLAG_OFFSET { 0 }; -static const struct rte_eth_conf port_conf_default { +struct probe_state_t { + struct net_spec dst; + struct conn_spec cspec { + .dst = &dst + }; + uint64_t last_sw_rx; + uint64_t last_sw_tx; + uint64_t last_hw_rx; + uint32_t epoch; }; // keep track of the probe state @@ -59,63 +59,63 @@ struct thread_info { int txqid; int lcore_id; int node_id; - void * cache_lines; - void * load_buffer; -}; - -// state machine: -constexpr static int SERVER_STATE_WAIT = 0; -constexpr static int SERVER_STATE_PROBE = 1; - -struct probe_state_t { - struct net_spec dst; - struct conn_spec cspec { - .dst = &dst - }; - uint32_t epoch; - uint64_t last_sw_rx; - uint64_t last_sw_tx; - uint64_t last_hw_rx; + void *cache_lines; + void *load_buffer; }; struct options_t { // config int num_threads { 1 }; - cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 2nd core - char mempool_name_buf[MEMPOOL_NAME_BUF_LEN]; - bool jumbo_frame_enabled { false }; // setting this to true changes mbuf size and mtu + cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 2nd core + bool jumbo_frame_enabled { + false + }; // setting this to true changes mbuf size and mtu int port_mtu { MAX_STANDARD_MTU }; int thread_cacheline_cnt = { 128 }; bool mlg_enabled { false }; - uint64_t mlg_bps { 0 }; + uint64_t mlg_arr_sz { 0 }; cpuset_t mlg_cset = CPUSET_T_INITIALIZER(0x2); cpuset_t mlg_dset = CPUSET_T_INITIALIZER(0x1); + int mlg_shared_buffer { 0 }; memload_generator *mlg { nullptr }; - // states - uint16_t s_portid { 0 }; - struct net_spec s_host_spec { - }; - std::atomic s_state { SERVER_STATE_WAIT }; - struct probe_state_t s_probe_info; - std::vector s_thr_info; + uint16_t portid { 0 }; - struct rte_mempool *s_mempools[MAX_NODES]; + // states + struct net_spec s_host_spec { }; + std::vector s_thr_info; + int probe_state_offset { 0 }; + bool s_hwtimestamp { true }; + + struct probe_state_t s_probe_info; + std::atomic is_probing { false }; }; struct options_t options; +static bool +mbuf_is_probe_valid(struct rte_mbuf *pkt) +{ + return *RTE_MBUF_DYNFIELD(pkt, options.probe_state_offset, bool *); +} + +static void +mbuf_set_probe_valid(struct rte_mbuf *pkt, bool b) +{ + *RTE_MBUF_DYNFIELD(pkt, options.probe_state_offset, bool *) = b; +} + static uint16_t rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused, struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused, void *_ __rte_unused) { - uint64_t now = nm_get_uptime_ns(); - struct timespec ts { - }; + int rc = 0; + uint64_t now = topo_uptime_ns(); + struct timespec ts { }; struct pkt_hdr *pkt_data; for (int i = 0; i < nb_pkts; i++) { - pkt_data = check_valid_packet( - pkts[i], &options.s_host_spec.mac_addr); + pkt_data = check_valid_packet(pkts[i], + &options.s_host_spec.mac_addr); if (pkt_data == nullptr) { ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, @@ -125,40 +125,52 @@ rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused, } if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE) { - int state_wait = SERVER_STATE_WAIT; - *RTE_MBUF_DYNFIELD( - pkts[i], PROBE_FLAG_OFFSET, uint32_t *) = 0; - if (rte_eth_timesync_read_rx_timestamp( - port, &ts, pkts[i]->timesync & 0x3) == 0) { - if (options.s_state.compare_exchange_strong( - state_wait, SERVER_STATE_PROBE)) { - // mark the mbuf as probe packet being - // processed only the locore that - // receives the pkt w/ userdata != - // nullptr processes that packet - *RTE_MBUF_DYNFIELD(pkts[i], - PROBE_FLAG_OFFSET, uint32_t *) = 1; - // tag with timestamps - options.s_probe_info.last_hw_rx = - ts.tv_nsec + ts.tv_sec * S2NS; - options.s_probe_info.last_sw_rx = now; + bool cmp = false; + mbuf_set_probe_valid(pkts[i], false); + if (options.is_probing.compare_exchange_strong(cmp, + true)) { + options.s_probe_info.last_sw_rx = now; + if (options.s_hwtimestamp) { + if ((rc = rte_eth_timesync_read_rx_timestamp( + port, &ts, + pkts[i]->timesync & 0x3)) == + 0) { + options.s_probe_info + .last_hw_rx = ts.tv_nsec + + ts.tv_sec * S2NS; + ntr(NTR_DEP_USER1, + NTR_LEVEL_DEBUG, + "rx_add_timestamp: tagged packet %p with sw rx: %lu hw rx:%lu.\n", + (void *)pkts[i], + options.s_probe_info + .last_sw_rx, + options.s_probe_info + .last_hw_rx); + mbuf_set_probe_valid(pkts[i], + true); + } else { + options.is_probing.store(false); + ntr(NTR_DEP_USER1, + NTR_LEVEL_WARNING, + "rx_add_timestamp: packet %p not tagged - failed to read hw rx timestamp: %d.\n", + (void *)pkts[i], rc); + } + } else { + mbuf_set_probe_valid(pkts[i], true); ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, - "rx_add_timestamp: tagged packet %p epoch %d with sw: %lu hw:%lu.\n", - (void *)pkts[i], - options.s_probe_info.epoch, now, - options.s_probe_info.last_hw_rx); - } else - ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, - "rx_add_timestamp: packet %p not tagged - server is processing a probe.\n", - (void *)pkts[i]); - } else - ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, - "rx_add_timestamp: packet %p not tagged - hw rx timestamp not available.\n", + "rx_add_timestamp: tagged packet %p with sw rx only: %lu.\n", + (void *)pkts[i], now); + } + } else { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "rx_add_timestamp: packet %p not tagged - server is probing.\n", (void *)pkts[i]); - } else + } + } else { ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, - "rx_add_timestamp: packet %p not tagged - type %d.\n", + "rx_add_timestamp: packet %p not tagged - not PROBE packet: type %d.\n", (void *)pkts[i], rte_be_to_cpu_16(pkt_data->type)); + } } return nb_pkts; @@ -168,13 +180,13 @@ static uint16_t tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused, struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused) { - uint64_t now = nm_get_uptime_ns(); + uint64_t now = topo_uptime_ns(); struct pkt_hdr *pkt_data; for (int i = 0; i < nb_pkts; i++) { - pkt_data = check_valid_packet( - pkts[i], &options.s_host_spec.mac_addr); + pkt_data = check_valid_packet(pkts[i], + &options.s_host_spec.mac_addr); if (pkt_data == nullptr) { ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, @@ -188,14 +200,8 @@ tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused, // at this time the packet is not sent to the NIC yet so // the state must be waiting stats - // XXX: this should be an assert - if (options.s_state.load() != SERVER_STATE_PROBE || - *RTE_MBUF_DYNFIELD( - pkts[i], PROBE_FLAG_OFFSET, uint32_t *) != 1) { - rte_exit(EXIT_FAILURE, - "packet %p sent to NIC before sw callback\n", - (void *)pkts[i]); - } + assert(options.is_probing.load() && + mbuf_is_probe_valid(pkts[i])); options.s_probe_info.last_sw_tx = now; @@ -225,23 +231,23 @@ locore_main(void *ti) bool pending_probe = false; - if (rte_eth_dev_socket_id(options.s_portid) > 0 && - rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) { + if (rte_eth_dev_socket_id(options.portid) > 0 && + rte_eth_dev_socket_id(options.portid) != (int)rte_socket_id()) { ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "locore_main : WARNING, port %d is on remote NUMA node to " "polling thread.\n\tPerformance will " "not be optimal.\n", - tinfo->tid, options.s_portid); + tinfo->tid, options.portid); } ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, - "locore_main : running on locore %d with txidx %d and rxidx %d.\n", + "locore_main : running on locore %d with txqid %d and rxqid %d.\n", tinfo->tid, rte_lcore_id(), tinfo->txqid, tinfo->rxqid); while (true) { uint16_t nb_tx = 0; - const uint16_t nb_rx = rte_eth_rx_burst( - options.s_portid, tinfo->rxqid, bufs, BURST_SIZE); + const uint16_t nb_rx = rte_eth_rx_burst(options.portid, + tinfo->rxqid, bufs, BURST_SIZE); struct rte_mbuf *pkt_buf; struct pkt_hdr *tx_data; @@ -249,8 +255,8 @@ locore_main(void *ti) // XXX: optimization: in rx_add_timestamp every packet // is already validated once can just mark valid packet // with a value so we can avoid this redundant check - pkt_data = check_valid_packet( - bufs[i], &options.s_host_spec.mac_addr); + pkt_data = check_valid_packet(bufs[i], + &options.s_host_spec.mac_addr); if (pkt_data == nullptr) { ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, @@ -262,13 +268,10 @@ locore_main(void *ti) } NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, pkt_data, - "locore_main : ", tinfo->tid); + "locore_main : received packet ", tinfo->tid); switch (rte_be_to_cpu_16(pkt_data->type)) { case PKT_TYPE_PROBE: { - if (options.s_state.load() == - SERVER_STATE_PROBE && - *RTE_MBUF_DYNFIELD(bufs[i], - PROBE_FLAG_OFFSET, uint32_t *) == 1) { + if (mbuf_is_probe_valid(bufs[i])) { // send back probe_resp pkt to probe for // return latency pending_probe = true; @@ -279,6 +282,7 @@ locore_main(void *ti) ((struct pkt_payload_epoch *) pkt_data->payload) ->epoch); + pkt_hdr_to_netspec(pkt_data, &options.s_probe_info.dst, &options.s_probe_info.cspec @@ -286,12 +290,12 @@ locore_main(void *ti) nullptr, &options.s_probe_info.cspec .src_port); + options.s_probe_info.cspec.src = &options.s_host_spec; - if (alloc_pkt_hdr( - options - .s_mempools[tinfo->node_id], + if (alloc_pkt_hdr(mempool_get( + tinfo->node_id), PKT_TYPE_PROBE_RESP, &options.s_probe_info.cspec, 0, &pkt_buf, &tx_data) != 0) { @@ -303,10 +307,11 @@ locore_main(void *ti) pkt_data->payload, sizeof(struct pkt_payload_epoch)); - *RTE_MBUF_DYNFIELD(pkt_buf, - PROBE_FLAG_OFFSET, uint32_t *) = 1; + mbuf_set_probe_valid(pkt_buf, true); // queue for burst send + NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, tx_data, + "locore_main : sending packet ", tinfo->tid); tx_bufs[nb_tx++] = pkt_buf; } break; @@ -316,20 +321,33 @@ locore_main(void *ti) struct net_spec src; struct net_spec dst; - // touch the unused data to pretend that we read those dummy fields - memcpy(tinfo->load_buffer, pkt_data->payload, MIN(bufs[i]->data_len - sizeof(struct pkt_hdr), THREAD_LOAD_BUFFER_SZ)); + // touch the unused data to pretend that we read + // those dummy fields + memcpy(tinfo->load_buffer, pkt_data->payload, + MIN(bufs[i]->data_len - + sizeof(struct pkt_hdr), + THREAD_LOAD_BUFFER_SZ)); // perform the load - auto pld = (struct pkt_payload_load *)pkt_data->payload; + auto pld = (struct pkt_payload_load *) + pkt_data->payload; uint32_t which = rte_be_to_cpu_32(pld->which); uint32_t load = rte_be_to_cpu_32(pld->load); - uint32_t start_cacheline = which % (options.thread_cacheline_cnt * options.s_thr_info.size()); - uint32_t thrd = start_cacheline / options.thread_cacheline_cnt; - uint32_t start = start_cacheline % options.thread_cacheline_cnt; + uint32_t start_cacheline = which % + (options.thread_cacheline_cnt * + options.s_thr_info.size()); + uint32_t thrd = start_cacheline / + options.thread_cacheline_cnt; + uint32_t start = start_cacheline % + options.thread_cacheline_cnt; for (uint j = 0; j < load; j++) { - *(uint32_t *)tinfo->load_buffer = (start + j) % options.thread_cacheline_cnt; + *(uint32_t *)tinfo->load_buffer = + (start + j) % + options.thread_cacheline_cnt; } - ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main : LOAD @ thread %d, start %d, load %d\n", tinfo->tid, thrd, start, load); + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "locore_main : LOAD @ thread %d, start %d, load %d\n", + tinfo->tid, thrd, start, load); // reply pkt_hdr_to_netspec(pkt_data, &src, @@ -337,11 +355,10 @@ locore_main(void *ti) cspec.dst = &src; cspec.src = &dst; - // printf("LOAD PKT SIZE: %d\n", bufs[i]->data_len); - // we reply to load packet regardless of the - // server state - if (alloc_pkt_hdr( - options.s_mempools[tinfo->node_id], + // printf("LOAD PKT SIZE: %d\n", + // bufs[i]->data_len); we reply to load packet + // regardless of the server state + if (alloc_pkt_hdr(mempool_get(tinfo->node_id), PKT_TYPE_LOAD_RESP, &cspec, 0, &pkt_buf, &tx_data) != 0) { rte_exit(EXIT_FAILURE, @@ -352,6 +369,8 @@ locore_main(void *ti) sizeof(struct pkt_payload_load)); // queue for burst send + NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, tx_data, + "locore_main : sending packet ", tinfo->tid); tx_bufs[nb_tx++] = pkt_buf; break; } @@ -366,175 +385,67 @@ locore_main(void *ti) } // send all packets - tx_burst_all(options.s_portid, tinfo->txqid, tx_bufs, nb_tx); + tx_burst_all(options.portid, tinfo->txqid, tx_bufs, nb_tx); // we wanna check every loop not only when there are packets if (pending_probe) { - struct timespec ts { - }; + assert(options.is_probing.load()); + struct timespec ts { }; struct pkt_payload_stat *stat; - if (rte_eth_timesync_read_tx_timestamp( - options.s_portid, &ts) == 0) { - ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, - "locore_main : obtained hw tx timestamp %lu.\n", - tinfo->tid, - (ts.tv_sec * S2NS + ts.tv_nsec)); - // now we have everything we need - if (alloc_pkt_hdr( - options.s_mempools[tinfo->node_id], - PKT_TYPE_STAT, - &options.s_probe_info.cspec, 0, - &pkt_buf, &tx_data) != 0) { - rte_exit(EXIT_FAILURE, - "failed to alloc pkt_buf\n"); + if (options.s_hwtimestamp) { + if (rte_eth_timesync_read_tx_timestamp( + options.portid, &ts) == 0) { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "locore_main : obtained hw tx timestamp %lu.\n", + tinfo->tid, + (ts.tv_sec * S2NS + ts.tv_nsec)); + } else { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "locore_main : failed to obtain hw tx timestamp.\n", + tinfo->tid); + pending_probe = false; + goto end_stat; } + } + // now we have everything we need - // populate stats - stat = (struct pkt_payload_stat *) - tx_data->payload; - stat->epoch = rte_cpu_to_be_32( - options.s_probe_info.epoch); + if (alloc_pkt_hdr(mempool_get(tinfo->node_id), + PKT_TYPE_STAT, &options.s_probe_info.cspec, 0, + &pkt_buf, &tx_data) != 0) { + rte_exit(EXIT_FAILURE, + "failed to alloc pkt_buf\n"); + } + + // populate stats + stat = (struct pkt_payload_stat *)tx_data->payload; + stat->epoch = rte_cpu_to_be_32( + options.s_probe_info.epoch); + if (options.s_hwtimestamp) { stat->hw_rx = rte_cpu_to_be_64( options.s_probe_info.last_hw_rx); stat->hw_tx = rte_cpu_to_be_64( ts.tv_nsec + ts.tv_sec * S2NS); - stat->sw_rx = rte_cpu_to_be_64( - options.s_probe_info.last_sw_rx); - stat->sw_tx = rte_cpu_to_be_64( - options.s_probe_info.last_sw_tx); - - // send the packet - tx_burst_all(options.s_portid, tinfo->txqid, &pkt_buf, 1); - - // release flux - pending_probe = false; - - int expected = SERVER_STATE_PROBE; - if (!options.s_state.compare_exchange_strong( - expected, SERVER_STATE_WAIT)) { - rte_exit(EXIT_FAILURE, - "s_state changed unexpectedly!"); - } + } else { + stat->hw_rx = 0; + stat->hw_tx = 0; } + stat->sw_rx = rte_cpu_to_be_64( + options.s_probe_info.last_sw_rx); + stat->sw_tx = rte_cpu_to_be_64( + options.s_probe_info.last_sw_tx); + + // send the packet + tx_burst_all(options.portid, tinfo->txqid, &pkt_buf, 1); + + end_stat: + // release flux + pending_probe = false; + options.is_probing.store(false); } } } -static int -port_init(uint16_t portid, struct rte_mempool *mbuf_pool) -{ - struct rte_eth_dev_info dev_info { - }; - struct rte_eth_conf port_conf = port_conf_default; - struct rte_eth_txconf txconf { - }; - struct rte_eth_rxconf rxconf { - }; - - uint16_t nb_rxd = RX_RING_SIZE; - uint16_t nb_txd = TX_RING_SIZE; - - if (!rte_eth_dev_is_valid_port(portid)) { - return -1; - } - - int ret = rte_eth_dev_info_get(portid, &dev_info); - if (ret != 0) { - return ret; - } - - port_conf.rxmode.max_rx_pkt_len = mtu_to_pkt_size(options.port_mtu); - port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; - port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_NONFRAG_IPV4_UDP | - ETH_RSS_L2_PAYLOAD | ETH_RSS_NONFRAG_IPV4_TCP; - port_conf.rx_adv_conf.rss_conf.rss_key = nullptr; - port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH; - port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM; - port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM; - port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM; - port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM; - port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE; - if (options.jumbo_frame_enabled) { - port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME; - } - - /* Configure the Ethernet device. */ - ret = rte_eth_dev_configure( - portid, options.num_threads, options.num_threads, &port_conf); - if (ret != 0) - return ret; - - ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd); - if (ret != 0) - return ret; - - /* Allocate and set up 1 RX queue per thread per Ethernet port. */ - rxconf = dev_info.default_rxconf; - - if (options.jumbo_frame_enabled) { - rxconf.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME; - } - for (int i = 0; i < options.num_threads; i++) { - ret = rte_eth_rx_queue_setup(portid, i, nb_rxd, - rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool); - if (ret < 0) - return ret; - options.s_thr_info.at(i)->rxqid = i; - } - - txconf = dev_info.default_txconf; - txconf.offloads = port_conf.txmode.offloads; - /* Allocate and set up 1 TX queue per thread per Ethernet port. */ - for (int i = 0; i < options.num_threads; i++) { - ret = rte_eth_tx_queue_setup( - portid, i, nb_txd, rte_eth_dev_socket_id(portid), &txconf); - if (ret < 0) - return ret; - options.s_thr_info.at(i)->txqid = i; - } - - // set mtu - ret = rte_eth_dev_set_mtu(portid, options.port_mtu); - if (ret != 0) - return ret; - - ret = rte_eth_dev_start(portid); - if (ret < 0) - return ret; - - /* Display the port MAC address. */ - struct rte_ether_addr addr { - }; - ret = rte_eth_macaddr_get(portid, &addr); - if (ret != 0) - return ret; - - ret = rte_eth_timesync_enable(portid); - if (ret != 0) - return ret; - - /* Enable RX in promiscuous mode for the Ethernet device. */ - ret = rte_eth_promiscuous_enable(portid); - if (ret != 0) - return ret; - - for (int i = 0; i < options.num_threads; i++) { - if (rte_eth_add_tx_callback(portid, - options.s_thr_info.at(i)->txqid, tx_add_timestamp, - nullptr) == nullptr || - rte_eth_add_rx_callback(portid, - options.s_thr_info.at(i)->rxqid, rx_add_timestamp, - nullptr) == nullptr) { - return -1; - } - } - - // sync_port_clock(portid); - - return 0; -} - static void usage() { @@ -542,13 +453,15 @@ usage() "Usage:\n" " -v(vv): verbose mode\n" " -h: seek help\n" - " -A: cpu mask for worker threads\n" + " -A: cpu list for worker threads\n" " -m: enable memory load generator(MLG)\n" - " -b: MLG bytes per second\n" + " -b: MLG trunk size\n" " -x: MLG thread affinity mask\n" " -X: MLG target domain affinity mask\n" + " -S: MLG shared buffer\n" " -H: host spec\n" - " -J: enable jumbo frames\n"); + " -J: enable jumbo frames\n" + " -p: port id\n"); fflush(stdout); } @@ -560,19 +473,22 @@ dump_options() " verbosity: +%d\n" " thread count: %d\n" " ip: 0x%x\n" - " MLG: %s [bps: %ld, thread cnt: %d, domain: %ld]\n" - " jumbo frame: %d\n", - ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.num_threads, - options.s_host_spec.ip, options.mlg_enabled ? "on" : "off", - options.mlg_bps, CPU_COUNT(&options.mlg_cset), CPU_FFS(&options.mlg_dset) - 1, options.jumbo_frame_enabled); + " MLG: %s [arr_sz: %ld, thread cnt: %d, domain: %ld]\n" + " jumbo frame: %d\n" + " port id: %d\n", + ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, + options.num_threads, options.s_host_spec.ip, + options.mlg_enabled ? "on" : "off", options.mlg_arr_sz, + CPU_COUNT(&options.mlg_cset), CPU_FFS(&options.mlg_dset) - 1, + options.jumbo_frame_enabled, options.portid); } int main(int argc, char *argv[]) { - unsigned int nb_ports; - struct rte_mempool *mbuf_pool; bool has_host_spec { false }; + struct mem_conf mconf; + struct device_conf dconf; ntr_init(); @@ -590,7 +506,7 @@ main(int argc, char *argv[]) { int c; // parse arguments - while ((c = getopt(argc, argv, "hvA:H:mb:X:x:J")) != -1) { + while ((c = getopt(argc, argv, "hvA:H:mb:X:x:JSp:")) != -1) { switch (c) { case 'v': ntr_set_level(NTR_DEP_USER1, @@ -601,15 +517,16 @@ main(int argc, char *argv[]) rte_exit(EXIT_SUCCESS, "\n"); case 'A': cpulist_to_cpuset(optarg, &options.cpu_set); - options.num_threads = CPU_COUNT(&options.cpu_set); + options.num_threads = CPU_COUNT( + &options.cpu_set); if (options.num_threads == 0) { rte_exit(EXIT_FAILURE, "must run at least one thread\n"); } break; case 'H': - if (str_to_netspec( - optarg, &options.s_host_spec) != 0) { + if (str_to_netspec(optarg, + &options.s_host_spec) != 0) { rte_exit(EXIT_FAILURE, "invalid host spec\n"); } @@ -619,7 +536,8 @@ main(int argc, char *argv[]) options.mlg_enabled = true; break; case 'b': - options.mlg_bps = strtoull(optarg, nullptr, 10); + options.mlg_arr_sz = strtoull(optarg, nullptr, + 10); break; case 'X': cpulist_to_cpuset(optarg, &options.mlg_dset); @@ -631,10 +549,16 @@ main(int argc, char *argv[]) options.jumbo_frame_enabled = true; options.port_mtu = MAX_JUMBO_MTU; break; + case 'S': + options.mlg_shared_buffer = 1; + break; + case 'p': + options.portid = atoi(optarg); + break; default: usage(); - rte_exit( - EXIT_SUCCESS, "unknown argument: %c", c); + rte_exit(EXIT_SUCCESS, "unknown argument: %c", + c); } } } @@ -643,109 +567,98 @@ main(int argc, char *argv[]) rte_exit(EXIT_FAILURE, "Must specify host spec\n"); } - // init nm - if (nm_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) { - rte_exit(EXIT_FAILURE, "nm init failed!\n"); + // init libtopo + if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != + 0) { + rte_exit(EXIT_FAILURE, "libtopo init failed!\n"); + } + + // init libnms + if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) { + rte_exit(EXIT_FAILURE, "libnms init failed!\n"); } dump_options(); - // init mlg - // if (options.mlg_enabled) { - // // bool success = false; - // // options.mlg = new memload_generator(options.mlg_cmask, - // // options.mlg_dmask, options.mlg_bps, &success); - // // if (!success) { - // // rte_exit(EXIT_FAILURE, "failed to init mlg\n"); - // // } - // } - // register dynamic field - PROBE_FLAG_OFFSET = rte_mbuf_dynfield_register( + struct rte_mbuf_dynfield rte_mbuf_dynfield_probe_flag = { + .name = "rte_mbuf_dynfield_probe_valid", + .size = sizeof(bool), + .align = __alignof__(uint32_t), + .flags = 0 + }; + options.probe_state_offset = rte_mbuf_dynfield_register( &rte_mbuf_dynfield_probe_flag); - if (PROBE_FLAG_OFFSET < 0) { - rte_exit(EXIT_FAILURE, "failed to register dynamic field\n"); + if (options.probe_state_offset == -1) { + rte_exit(EXIT_FAILURE, "failed to register dynamic field: %d\n", + rte_errno); } - nb_ports = rte_eth_dev_count_avail(); - if (nb_ports == 0) { - rte_exit(EXIT_FAILURE, "number of ports must be > 0\n"); + // configure memory and port + struct port_conf pconf; + portconf_get(options.portid, &pconf); + if (!pconf.timesync) { + ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, + "main: timesync disabled. hw timestamp unavailable.\n "); } + dconf.mtu = options.port_mtu; + dconf.num_threads = options.num_threads; + dconf.portid = options.portid; + dconf.rss_hf = pconf.rss_hf; + dconf.rx_offloads = pconf.rxoffload; + dconf.tx_offloads = pconf.txoffload; + dconf.timesync = pconf.timesync; - uint16_t portid = rte_eth_find_next(0); - if (portid == RTE_MAX_ETHPORTS) { - rte_exit(EXIT_FAILURE, "cannot find an available port\n"); - } - options.s_portid = portid; + dconf.rx_fn = rx_add_timestamp; + dconf.rx_user = nullptr; + dconf.rx_ring_sz = 2048; + dconf.tx_fn = tx_add_timestamp; + dconf.tx_user = nullptr; + dconf.tx_ring_sz = 2048; - if (rte_eth_macaddr_get(portid, &options.s_host_spec.mac_addr) != 0) { + mconf.cache_size = 512; + mconf.priv_size = 0; + mconf.num_elements = (dconf.rx_ring_sz + dconf.tx_ring_sz) * + rte_lcore_count() / rte_socket_count(); + mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_JUMBO_MTU - + MAX_STANDARD_MTU; + mconf.max_pools = -1; + + dpdk_init(&dconf, &mconf); + + if (rte_eth_macaddr_get(options.portid, + &options.s_host_spec.mac_addr) != 0) { rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n", - portid); - } - - if (rte_socket_count() > (int)MAX_NODES) { - rte_exit(EXIT_FAILURE, "Too many numa nodes\n"); - } - - for (int i = 0; i < (int)rte_socket_count(); i++) { - uint32_t nodeid = i; - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, - "main: creating mempool for node %d\n", nodeid); - - // create one mbuf pool per socket - snprintf(options.mempool_name_buf, MEMPOOL_NAME_BUF_LEN, - "khat_mempool_%d", nodeid); - mbuf_pool = rte_pktmbuf_pool_create(options.mempool_name_buf, - MBUF_MAX_COUNT, MBUF_CACHE_SIZE, 0, - options.jumbo_frame_enabled ? - RTE_MBUF_DEFAULT_BUF_SIZE + (MAX_JUMBO_MTU - MAX_STANDARD_MTU) : - RTE_MBUF_DEFAULT_BUF_SIZE, nodeid); - if (mbuf_pool == nullptr) { - rte_exit(EXIT_FAILURE, "cannot create mbuf pool: %d\n", - rte_errno); - } - - options.s_mempools[nodeid] = mbuf_pool; - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, - "main: created mempool for node %d\n", nodeid); + options.portid); } // init threads uint32_t cpu_idx = CPU_FFS(&options.cpu_set); uint32_t tid = 0; - while(cpu_idx != 0) { + while (cpu_idx != 0) { uint32_t lcore_id = cpu_idx - 1; uint32_t node_id = rte_lcore_to_socket_id(lcore_id); - auto *tinfo = (struct thread_info *)nm_malloc(node_id, sizeof(struct thread_info)); - tinfo->cache_lines = nm_malloc(node_id, CACHELINE_SIZE * options.thread_cacheline_cnt); - tinfo->load_buffer = nm_malloc(node_id, THREAD_LOAD_BUFFER_SZ); + auto *tinfo = (struct thread_info *)nms_malloc(node_id, + sizeof(struct thread_info)); + tinfo->cache_lines = nms_malloc(node_id, + CACHELINE_SIZE * options.thread_cacheline_cnt); + tinfo->load_buffer = nms_malloc(node_id, + THREAD_LOAD_BUFFER_SZ); tinfo->tid = tid; tinfo->lcore_id = lcore_id; tinfo->node_id = node_id; + tinfo->rxqid = tid; + tinfo->txqid = tid; options.s_thr_info.push_back(tinfo); ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, - "main: thread %d assigned to cpu %d, node %d\n", tinfo->tid, - tinfo->lcore_id, nm_get_node_from_core(lcore_id)); - + "main: thread %d assigned to cpu %d, node %d\n", tinfo->tid, + tinfo->lcore_id, topo_core_to_numa(lcore_id)); + tid++; CPU_CLR(cpu_idx - 1, &options.cpu_set); cpu_idx = CPU_FFS(&options.cpu_set); } - if (port_init(portid, mbuf_pool) != 0) { - rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid); - } - - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, - "Configured port %d on socket %d with mac addr %x:%x:%x:%x:%x:%x\n", - portid, rte_eth_dev_socket_id(portid), - options.s_host_spec.mac_addr.addr_bytes[0], - options.s_host_spec.mac_addr.addr_bytes[1], - options.s_host_spec.mac_addr.addr_bytes[2], - options.s_host_spec.mac_addr.addr_bytes[3], - options.s_host_spec.mac_addr.addr_bytes[4], - options.s_host_spec.mac_addr.addr_bytes[5]); - sleep(INIT_DELAY); for (int i = 0; i < options.num_threads; i++) { @@ -762,19 +675,35 @@ main(int argc, char *argv[]) } } - if (options.mlg_enabled) - options.mlg->start(); + // init mlg + if (options.mlg_enabled) { + bool success = false; + memload_generator::memload_generator_options opts; + opts.chunk_size = options.mlg_arr_sz; + opts.iteration = + memload_generator::memload_generator_options::ITERATION_MAX; + opts.shared_buffer = options.mlg_shared_buffer; + opts.verbose = (ntr_get_level(NTR_DEP_USER1) - + NTR_LEVEL_WARNING) != 0; + options.mlg = new memload_generator(&options.mlg_cset, + &options.mlg_dset, &opts, &success); + if (!success) { + rte_exit(EXIT_FAILURE, "failed to init mlg\n"); + } + } + while (true) { usleep(S2US); if (options.mlg_enabled) { uint64_t bps = options.mlg->get_bps(); ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, - "main: MLG bps = %ld ~= %ldM\n", bps, bps / 1024 / 1024); + "main: MLG bps = %ld ~= %ldM\n", bps, + bps / 1024 / 1024); } } - if (options.mlg_enabled) - options.mlg->stop(); + // shouldn't get here + // clean up for (int i = 0; i < options.num_threads; i++) { struct thread_info *tinfo = options.s_thr_info.at(i); ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, @@ -785,10 +714,9 @@ main(int argc, char *argv[]) } } - // shouldn't get here - // clean up - rte_eth_dev_stop(portid); delete options.mlg; + dpdk_cleanup(&dconf); + return 0; } diff --git a/net/libnetsup/dpdk.cc b/net/libnetsup/dpdk.cc new file mode 100644 index 0000000..f2a5e92 --- /dev/null +++ b/net/libnetsup/dpdk.cc @@ -0,0 +1,207 @@ +#include "net/netsup.hh" +#include + +#include "rte_build_config.h" +#include "rte_common.h" +#include "rte_config.h" +#include "rte_ether.h" +#include "rte_lcore.h" +#include "rte_mempool.h" +#include "rte_mbuf.h" +#include "rte_errno.h" +#include "rte_ethdev.h" + +#include "ntr.h" + +static struct rte_mempool *g_mempools[MAX_NUMA_NODES] = {nullptr}; +static unsigned int g_mempool_sz = 0; + +static void +mempool_init(struct mem_conf *mconf) +{ + struct rte_mempool * mbuf_pool; + char mempool_name[64]; + + for (int i = 0; i < (int)rte_socket_count(); i++) { + uint32_t nodeid = i; + // ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + // "mempool_init: creating mempool for node %d\n", nodeid); + + // create one mbuf pool per socket + snprintf(mempool_name, sizeof(mempool_name), "net_mempool_%d", nodeid); + + mbuf_pool = rte_pktmbuf_pool_create(mempool_name, mconf->num_elements, + mconf->cache_size, mconf->priv_size, + mconf->data_room_size, nodeid); + + if (mbuf_pool == nullptr) { + rte_exit(EXIT_FAILURE, "cannot create mbuf pool: %d\n", rte_errno); + } + + g_mempools[nodeid] = mbuf_pool; + g_mempool_sz++; + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "mempool_init: created mempool for node %d\n", nodeid); + } +} + +struct rte_mempool * +mempool_get(int nodeid) +{ + if ((unsigned int)nodeid < g_mempool_sz) { + return g_mempools[nodeid]; + } + return nullptr; +} + +static void +port_init(struct device_conf *dconf) +{ + struct rte_ether_addr addr; + struct rte_eth_dev_info dev_info { + }; + struct rte_eth_conf port_conf; + struct rte_eth_txconf txconf { + }; + struct rte_eth_rxconf rxconf { + }; + int ret; + + if (rte_eth_dev_count_avail() == 0) { + rte_exit(EXIT_FAILURE, "number of ports must be > 0\n"); + } + + if (!rte_eth_dev_is_valid_port(dconf->portid)) { + rte_exit(EXIT_FAILURE, "cannot find port %d\n", dconf->portid); + } + + if ((ret = rte_eth_macaddr_get(dconf->portid, &addr)) != 0) { + rte_exit(EXIT_FAILURE, "cannot get mac address of port: %d\n", ret); + } + + ret = rte_eth_dev_info_get(dconf->portid, &dev_info); + if (ret != 0) { + rte_exit(EXIT_FAILURE, "failed to get dev info: %d", ret); + } + + memset(&port_conf, 0, sizeof(struct rte_eth_conf)); + port_conf.rxmode.mtu = dconf->mtu; + port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_RSS; + port_conf.rx_adv_conf.rss_conf.rss_key = nullptr; + port_conf.rx_adv_conf.rss_conf.rss_hf = dconf->rss_hf; + + port_conf.rxmode.offloads = dconf->rx_offloads; + port_conf.txmode.offloads = dconf->tx_offloads; + + /* Configure the Ethernet device. */ + ret = rte_eth_dev_configure( + dconf->portid, dconf->num_threads, dconf->num_threads, &port_conf); + if (ret != 0) + rte_exit(EXIT_FAILURE, "failed to configure port: %d\n", ret); + + ret = rte_eth_dev_adjust_nb_rx_tx_desc(dconf->portid, &dconf->rx_ring_sz, &dconf->tx_ring_sz); + if (ret != 0) + rte_exit(EXIT_FAILURE, "failed to set rx tx queue size: %d\n", ret); + + /* Allocate and set up 1 RX queue per thread per Ethernet port. */ + rxconf = dev_info.default_rxconf; + rxconf.offloads = port_conf.rxmode.offloads; + for (int i = 0; i < dconf->num_threads; i++) { + ret = rte_eth_rx_queue_setup(dconf->portid, i, dconf->rx_ring_sz, + rte_eth_dev_socket_id(dconf->portid), &rxconf, + mempool_get(rte_eth_dev_socket_id(dconf->portid))); + + if (ret < 0) + rte_exit(EXIT_FAILURE, "failed to setup rx queue %d: %d\n", i, ret); + } + + /* Allocate and set up 1 TX queue per thread per Ethernet port. */ + txconf = dev_info.default_txconf; + txconf.offloads = port_conf.txmode.offloads; + for (int i = 0; i < dconf->num_threads; i++) { + ret = rte_eth_tx_queue_setup( + dconf->portid, i, dconf->tx_ring_sz, rte_eth_dev_socket_id(dconf->portid), + &txconf); + if (ret < 0) + rte_exit(EXIT_FAILURE, "failed to setup tx queue %d: %d", i, ret); + } + + // set mtu + ret = rte_eth_dev_set_mtu(dconf->portid, dconf->mtu); + if (ret != 0) + rte_exit(EXIT_FAILURE, "failed to set mtu: %d\n", ret); + + ret = rte_eth_dev_start(dconf->portid); + if (ret < 0) + rte_exit(EXIT_FAILURE, "failed to start port: %d\n", ret); + + if (dconf->timesync) { + ret = rte_eth_timesync_enable(dconf->portid); + if (ret != 0) + rte_exit(EXIT_FAILURE, "failed to enable timesync: %d\n", ret); + } + + /* Enable RX in promiscuous mode for the Ethernet device. */ + ret = rte_eth_promiscuous_enable(dconf->portid); + if (ret != 0) + rte_exit(EXIT_FAILURE, "failed to enable promiscuous mode: %d\n", ret); + + for (int i = 0; i < dconf->num_threads; i++) { + if (dconf->tx_fn != nullptr) { + if (rte_eth_add_tx_callback(dconf->portid, + i, dconf->tx_fn, + dconf->tx_user) == nullptr) { + rte_exit(EXIT_FAILURE, "failed to attach callback to tx queue %d\n", i); + } + } + + if (dconf->rx_fn != nullptr) { + if (rte_eth_add_rx_callback(dconf->portid, + i, dconf->rx_fn, + dconf->rx_user) == nullptr) { + rte_exit(EXIT_FAILURE, "failed to attach callback to rx queue %d\n", i); + } + } + } + + // sync_port_clock(portid); + + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + "port_init: configured port %d on socket %d with mac addr %x:%x:%x:%x:%x:%x\n", + dconf->portid, rte_eth_dev_socket_id(dconf->portid), + addr.addr_bytes[0], + addr.addr_bytes[1], + addr.addr_bytes[2], + addr.addr_bytes[3], + addr.addr_bytes[4], + addr.addr_bytes[5]); +} + +void +dpdk_init(struct device_conf *dconf, struct mem_conf *mconf) +{ + if (rte_socket_count() > (int)MAX_NUMA_NODES) { + rte_exit(EXIT_FAILURE, "too many numa nodes\n"); + } + + // ensure 1-1 mapping + for (int i = 0; i < (int)rte_socket_count(); i++) { + if (rte_socket_id_by_idx(i) != i) { + rte_exit(EXIT_FAILURE, "socket %d has id %d instead.\n", i, rte_socket_id_by_idx(i)); + } + } + + mempool_init(mconf); + + port_init(dconf); +} + +void +dpdk_cleanup(struct device_conf * dconf) +{ + rte_eth_dev_stop(dconf->portid); + rte_eth_dev_close(dconf->portid); + + for (int i = 0; i < (int)rte_socket_count(); i++) { + rte_mempool_free(g_mempools[i]); + } +} \ No newline at end of file diff --git a/net/libnetsup/portconf.cc b/net/libnetsup/portconf.cc new file mode 100644 index 0000000..83c9480 --- /dev/null +++ b/net/libnetsup/portconf.cc @@ -0,0 +1,59 @@ +#include "rte_ethdev.h" +#include "net/netsup.hh" +#include + +static struct port_conf port_confs[] = { + { + .driver_name = "net_cxgbe", + .rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM, + .txoffload = RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM, + .rss_hf = RTE_ETH_RSS_UDP | RTE_ETH_RSS_FRAG_IPV4, + .timesync = false + }, + { + .driver_name = "net_i40e", + .rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM, + .txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM, + .rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD, + .timesync = true + }, + { + .driver_name = "net_ice", + .rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM | RTE_ETH_RX_OFFLOAD_TIMESTAMP, + .txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM, + .rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD, + .timesync = true + } +}; + +static struct port_conf default_conf = { + .driver_name = "default", + .rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM | RTE_ETH_RX_OFFLOAD_TIMESTAMP, + .txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM, + .rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD, + .timesync = true +}; + +static const int port_size = sizeof(port_confs) / sizeof(port_confs[0]); + +int +portconf_get(int portid, struct port_conf * out) +{ + struct rte_eth_dev_info dev_info {}; + + if (rte_eth_dev_info_get(portid, &dev_info) != 0) { + rte_exit(EXIT_FAILURE, "failed to obtain device info for port %d\n", portid); + } + + for(int i = 0; i < port_size; i++) { + struct port_conf * conf = &port_confs[i]; + if (strcmp(conf->driver_name, dev_info.driver_name) == 0) { + memcpy(out, conf, sizeof(struct port_conf)); + return 0; + } + } + + fprintf(stdout, "portconf_get: unable to find matching conf for port %d:%s, returning default conf.\n", portid, dev_info.driver_name); + memcpy(out, &default_conf, sizeof(struct port_conf)); + return -1; +} \ No newline at end of file diff --git a/net/rat.cc b/net/rat.cc index baa100b..4536349 100644 --- a/net/rat.cc +++ b/net/rat.cc @@ -1,4 +1,14 @@ +#include +#include +#include +#include +#include +#include +#include + #include + +#include #include #include #include @@ -10,28 +20,15 @@ #include #include -#include "gen.hh" -#include "nm.hh" #include "ntr.h" + +#include "gen.hh" +#include "net/netsup.hh" #include "net/pkt.hh" -#include "net/util.hh" +#include "nms.h" -#include -#include -#include -#include -#include -#include - -#define MBUF_MAX_COUNT (rte_lcore_count() * 4096) -constexpr static unsigned int MBUF_CACHE_SIZE = 512; -constexpr static unsigned int RX_RING_SIZE = 2048; -constexpr static unsigned int TX_RING_SIZE = 2048; constexpr static unsigned int BURST_SIZE = 32; -static const struct rte_eth_conf port_conf_default { -}; - static unsigned int epoch_mk(unsigned int id, unsigned int epoch) { @@ -60,6 +57,7 @@ struct thread_info { unsigned int lcore_id { 0 }; unsigned int rxqid { 0 }; unsigned int txqid { 0 }; + int socket_id; // this field is read by the stat collecting thread std::atomic recved_pkts { 0 }; std::atomic lost_pkts { 0 }; @@ -74,9 +72,12 @@ struct thread_info { mtx; // this lock protects data shared between worker threads, i.e.: std::list recved_epochs; - thread_info() : which_rd(), which_rng(which_rd()), which_dice(std::uniform_int_distribution(0, UINT32_MAX)) + thread_info() + : which_rd() + , which_rng(which_rd()) + , which_dice(std::uniform_int_distribution(0, UINT32_MAX)) { - which_rng.seed(nm_get_uptime_ns()); + which_rng.seed(topo_uptime_ns()); } }; @@ -94,26 +95,22 @@ struct options_t { char ld_gen[256] { "fixed:0" }; uint32_t target_qps { 0 }; uint32_t depth { 1 }; - struct net_spec server_spec { - }; + struct net_spec server_spec { }; cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 1 thread @ core 2 uint32_t pkt_loss_delay_ms { UINT32_MAX }; bool jumbo_frame_enabled { false }; int pkt_pad_sz { 0 }; int port_mtu { MAX_STANDARD_MTU }; + int portid { 0 }; // states unsigned int s_num_threads { 1 }; // 1 thread - struct rte_mempool *mbuf_pool { nullptr }; - struct net_spec s_host_spec { - }; - struct net_spec s_master_spec { - }; + struct net_spec s_host_spec { }; + struct net_spec s_master_spec { }; struct conn_spec s_master_cspec { .src = &s_host_spec, .src_port = DEFAULT_RAT_PORT, .dst = &s_master_spec, .dst_port = DEFAULT_RAT_PORT, }; - uint16_t s_portid { 0 }; std::vector s_thr_info; std::atomic s_state { STATE_RUNNING }; // default non master mode @@ -124,8 +121,8 @@ struct options_t { static struct options_t options; static inline void -calc_stats( - uint64_t now, uint32_t *qps, uint32_t *recved_pkt, uint32_t *total_loss) +calc_stats(uint64_t now, uint32_t *qps, uint32_t *recved_pkt, + uint32_t *total_loss) { uint32_t recv = 0; uint32_t loss = 0; @@ -159,8 +156,8 @@ proto_loop(struct thread_info *tinfo) ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "proto_loop : waiting for SYNC from cat\n", tinfo->id); while (options.s_state.load() == STATE_SYNC) { - const uint16_t nb_rx = rte_eth_rx_burst( - options.s_portid, tinfo->rxqid, rx_bufs, BURST_SIZE); + const uint16_t nb_rx = rte_eth_rx_burst(options.portid, + tinfo->rxqid, rx_bufs, BURST_SIZE); if (nb_rx > 0) { for (int i = 0; i < nb_rx; i++) { struct pkt_hdr *each = check_valid_packet( @@ -195,13 +192,13 @@ proto_loop(struct thread_info *tinfo) nullptr); if (alloc_pkt_hdr( - options - .mbuf_pool, + mempool_get( + tinfo + ->socket_id), PKT_TYPE_SYNC_ACK, &options .s_master_cspec, - 0, - &tx_buf, + 0, &tx_buf, &pkt_data) != 0) { rte_exit( @@ -209,7 +206,10 @@ proto_loop(struct thread_info *tinfo) "failed to alloc pkt hdr\n"); } - tx_burst_all(options.s_portid, tinfo->txqid, &tx_buf, 1); + tx_burst_all( + options.portid, + tinfo->txqid, + &tx_buf, 1); expected = STATE_SYNC_ACK; @@ -240,6 +240,7 @@ proto_loop(struct thread_info *tinfo) ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "proto_loop : ignoring invalid packet %p.\n", tinfo->id, (void *)rx_bufs[i]); + //dump_pkt(rx_bufs[i]); } rte_pktmbuf_free(rx_bufs[i]); @@ -268,16 +269,16 @@ pkt_loop(struct thread_info *tinfo) srv_cspec.src = &options.s_host_spec; srv_cspec.dst = &options.server_spec; - next_ts = nm_get_uptime_ns(); + next_ts = topo_uptime_ns(); ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "pkt_loop : entering\n", tinfo->id); while (options.s_state.load() == STATE_RUNNING) { - uint64_t now = nm_get_uptime_ns(); + uint64_t now = topo_uptime_ns(); // always pop incoming packets - const uint16_t nb_rx = rte_eth_rx_burst( - options.s_portid, tinfo->rxqid, rx_bufs, BURST_SIZE); + const uint16_t nb_rx = rte_eth_rx_burst(options.portid, + tinfo->rxqid, rx_bufs, BURST_SIZE); if (nb_rx > 0) { for (int i = 0; i < nb_rx; i++) { @@ -309,7 +310,8 @@ pkt_loop(struct thread_info *tinfo) pld_epoch->epoch); id = epoch_get_id(epoch); - // printf("Load resp size : %d\n", rx_bufs[i]->data_len); + // printf("Load resp size : %d\n", + // rx_bufs[i]->data_len); ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "pkt_loop : packet %p epoch 0x%x id %d.\n", @@ -337,7 +339,7 @@ pkt_loop(struct thread_info *tinfo) break; case PKT_TYPE_FIN: if (rte_is_same_ether_addr( - &each->eth_hdr.s_addr, + &each->eth_hdr.src_addr, &options.s_master_spec .mac_addr)) { ntr(NTR_DEP_USER1, @@ -364,11 +366,11 @@ pkt_loop(struct thread_info *tinfo) struct pkt_hdr *pkt_hdr; if (alloc_pkt_hdr( - options.mbuf_pool, + mempool_get( + tinfo->socket_id), PKT_TYPE_FIN_ACK, &options.s_master_cspec, - 0, - &tx_bufs[0], + 0, &tx_bufs[0], &pkt_hdr) != 0) { rte_exit(EXIT_FAILURE, "failed to allocate pkt hdr\n"); @@ -386,7 +388,9 @@ pkt_loop(struct thread_info *tinfo) rte_cpu_to_be_32( total_loss); - tx_burst_all(options.s_portid, tinfo->txqid, &tx_bufs[0], 1); + tx_burst_all(options.portid, + tinfo->txqid, &tx_bufs[0], + 1); options.s_state.store( STATE_FIN); @@ -487,9 +491,9 @@ pkt_loop(struct thread_info *tinfo) // change dst port for every packet for RSS srv_cspec.dst_port = dst_port_gen.next(); srv_cspec.src_port = src_port_gen.next(); - if (alloc_pkt_hdr(options.mbuf_pool, PKT_TYPE_LOAD, - &srv_cspec, options.pkt_pad_sz, &tx_bufs[total_send], - &pkt_data) != 0) { + if (alloc_pkt_hdr(mempool_get(tinfo->socket_id), + PKT_TYPE_LOAD, &srv_cspec, options.pkt_pad_sz, + &tx_bufs[total_send], &pkt_data) != 0) { rte_exit(EXIT_FAILURE, "failed to allocate pkt hdr\n"); } @@ -497,7 +501,8 @@ pkt_loop(struct thread_info *tinfo) pld_load = (struct pkt_payload_load *)pkt_data->payload; pld_load->load = rte_cpu_to_be_32( tinfo->load_gen->generate()); - pld_load->which = rte_cpu_to_be_32(tinfo->which_dice(tinfo->which_rng)); + pld_load->which = rte_cpu_to_be_32( + tinfo->which_dice(tinfo->which_rng)); unsigned int epoch = epoch_mk(tinfo->id, cur_epoch); pld_load->epoch = rte_cpu_to_be_32(epoch); cur_epoch++; @@ -514,26 +519,16 @@ pkt_loop(struct thread_info *tinfo) total_send++; } - tx_burst_all(options.s_portid, tinfo->txqid, tx_bufs, total_send); - // if (total_send > 0) { - // const uint16_t nb_tx = rte_eth_tx_burst( - // options.s_portid, tinfo->txqid, tx_bufs, - // total_send); - - // if (nb_tx != total_send) { - // rte_exit( - // EXIT_FAILURE, "failed to send packet\n"); - // } - // } + tx_burst_all(options.portid, tinfo->txqid, tx_bufs, total_send); // check rage quit only when we have sent a packet if (last_recv_ts == 0) { - last_recv_ts = nm_get_uptime_ns(); + last_recv_ts = topo_uptime_ns(); } - if (nm_get_uptime_ns() - last_recv_ts > - options.rage_quit_time * MS2NS) { + if (topo_uptime_ns() > + options.rage_quit_time * MS2NS + last_recv_ts) { rte_exit(EXIT_FAILURE, - "rat: thread %d waiting too long for resp. I QUIT!!\n", + "rat: thread %d waiting too long for resp. I F QUIT!\n", tinfo->id); } } @@ -554,16 +549,16 @@ locore_main(void *tif) uint32_t core_id = rte_lcore_id(); ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, - "locore_main : running on core %d...\n", tinfo->id, - core_id); + "locore_main : running on core %d rxqid %d txqid %d...\n", tinfo->id, + core_id, tinfo->rxqid, tinfo->txqid); - if (rte_eth_dev_socket_id(options.s_portid) > 0 && - rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) { + if (rte_eth_dev_socket_id(options.portid) > 0 && + rte_eth_dev_socket_id(options.portid) != (int)rte_socket_id()) { ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "locore_main : WARNING, port %d is on remote NUMA node to " "polling thread.\n\tPerformance will " "not be optimal.\n", - tinfo->id, options.s_portid); + tinfo->id, options.portid); } if (options.slave_mode == 1) { @@ -575,7 +570,7 @@ locore_main(void *tif) while (options.s_state.load() != STATE_RUNNING) { } // store the current timestamp - options.s_ts_begin.store(nm_get_uptime_ns()); + options.s_ts_begin.store(topo_uptime_ns()); pkt_loop(tinfo); ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main : exited\n", @@ -584,99 +579,6 @@ locore_main(void *tif) return 0; } -static int -port_init(uint16_t portid, struct rte_mempool *mbuf_pool) -{ - struct rte_eth_dev_info dev_info { - }; - struct rte_eth_conf port_conf = port_conf_default; - struct rte_eth_txconf txconf { - }; - struct rte_eth_rxconf rxconf { - }; - - uint16_t nb_rxd = RX_RING_SIZE; - uint16_t nb_txd = TX_RING_SIZE; - - if (!rte_eth_dev_is_valid_port(portid)) { - return -1; - } - - int ret = rte_eth_dev_info_get(portid, &dev_info); - if (ret != 0) { - return ret; - } - - port_conf.rxmode.max_rx_pkt_len = mtu_to_pkt_size(options.port_mtu);; - port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; - port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_NONFRAG_IPV4_UDP | - ETH_RSS_L2_PAYLOAD | ETH_RSS_NONFRAG_IPV4_TCP; - port_conf.rx_adv_conf.rss_conf.rss_key = nullptr; - port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM; - port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM; - port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH; - port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM; - port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM; - port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE; - if (options.jumbo_frame_enabled) { - port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME; - } - - /* Configure the Ethernet device. */ - ret = rte_eth_dev_configure( - portid, options.s_num_threads, options.s_num_threads, &port_conf); - if (ret != 0) - return ret; - - ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd); - if (ret != 0) - return ret; - - /* Allocate and set up 1 RX queue per thread . */ - rxconf = dev_info.default_rxconf; - - if (options.jumbo_frame_enabled) { - rxconf.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME; - } - rxconf.offloads = port_conf.rxmode.offloads; - for (uint32_t i = 0; i < options.s_num_threads; i++) { - ret = rte_eth_rx_queue_setup(portid, - options.s_thr_info.at(i)->rxqid, nb_rxd, - rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool); - if (ret < 0) - return ret; - } - - txconf = dev_info.default_txconf; - txconf.offloads = port_conf.txmode.offloads; - /* Allocate and set up 1 TX queue per Ethernet port. */ - for (uint32_t i = 0; i < options.s_num_threads; i++) { - ret = rte_eth_tx_queue_setup(portid, - options.s_thr_info.at(i)->txqid, nb_txd, - rte_eth_dev_socket_id(portid), &txconf); - if (ret < 0) - return ret; - } - - // set mtu - ret = rte_eth_dev_set_mtu(portid, options.port_mtu); - if (ret != 0) - return ret; - - ret = rte_eth_dev_start(portid); - if (ret < 0) - return ret; - - /* Display the port MAC address. */ - struct rte_ether_addr addr { - }; - ret = rte_eth_macaddr_get(portid, &addr); - - // no promiscuous mode required - - return ret; -} - static void dump_options() { @@ -694,13 +596,13 @@ dump_options() " depth = %u\n" " packet loss time threshold = %u\n" " jumbo frame = %d\n" - " packet pad size = %d\n", + " packet pad size = %d\n" + " portid = %d\n", ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.run_time, - options.s_num_threads, options.rage_quit_time, - options.slave_mode, options.ia_gen, options.ld_gen, - options.target_qps, options.s_host_spec.ip, options.depth, - options.pkt_loss_delay_ms, options.jumbo_frame_enabled, - options.pkt_pad_sz); + options.s_num_threads, options.rage_quit_time, options.slave_mode, + options.ia_gen, options.ld_gen, options.target_qps, + options.s_host_spec.ip, options.depth, options.pkt_loss_delay_ms, + options.jumbo_frame_enabled, options.pkt_pad_sz, options.portid); } static void @@ -722,14 +624,13 @@ usage() " -D: max number of packets in flight\n" " -l: packet loss time threshold\n" " -J: enable jumbo frame\n" - " -P: pad load packets to this size\n"); + " -P: pad load packets to this size\n" + " -p: portid\n"); } int main(int argc, char *argv[]) { - unsigned int nb_ports; - struct rte_mempool *mbuf_pool; struct thread_info *tinfo; bool has_host_spec = false; @@ -749,8 +650,8 @@ main(int argc, char *argv[]) { int c; // parse arguments - while ((c = getopt( - argc, argv, "vht:s:SA:i:w:r:q:H:D:l:JP:")) != -1) { + while ((c = getopt(argc, argv, + "vht:s:SA:i:w:r:q:H:D:l:JP:p:")) != -1) { switch (c) { case 'v': ntr_set_level(NTR_DEP_USER1, @@ -763,8 +664,8 @@ main(int argc, char *argv[]) options.run_time = strtol(optarg, nullptr, 10); break; case 's': - if (str_to_netspec( - optarg, &options.server_spec) != 0) { + if (str_to_netspec(optarg, + &options.server_spec) != 0) { rte_exit(EXIT_FAILURE, "invalid server net spec\n"); } @@ -776,9 +677,11 @@ main(int argc, char *argv[]) break; case 'A': cpulist_to_cpuset(optarg, &options.cpu_set); - options.s_num_threads = CPU_COUNT(&options.cpu_set); + options.s_num_threads = CPU_COUNT( + &options.cpu_set); if (options.s_num_threads == 0) { - rte_exit(EXIT_FAILURE, "invalid cpu mask %s\n", optarg); + rte_exit(EXIT_FAILURE, + "invalid cpu mask %s\n", optarg); } break; case 'i': @@ -790,17 +693,17 @@ main(int argc, char *argv[]) sizeof(options.ld_gen) - 1); break; case 'r': - options.rage_quit_time = strtol( - optarg, nullptr, 10); + options.rage_quit_time = strtol(optarg, nullptr, + 10); break; case 'q': - options.target_qps = strtol( - optarg, nullptr, 10); + options.target_qps = strtol(optarg, nullptr, + 10); break; case 'H': has_host_spec = true; - if (str_to_netspec( - optarg, &options.s_host_spec) != 0) { + if (str_to_netspec(optarg, + &options.s_host_spec) != 0) { rte_exit(EXIT_FAILURE, "invalid host net spec.\n"); } @@ -812,8 +715,8 @@ main(int argc, char *argv[]) } break; case 'l': - options.pkt_loss_delay_ms = strtol( - optarg, nullptr, 10); + options.pkt_loss_delay_ms = strtol(optarg, + nullptr, 10); if (options.pkt_loss_delay_ms == 0) { options.pkt_loss_delay_ms = UINT32_MAX; } @@ -823,75 +726,108 @@ main(int argc, char *argv[]) options.port_mtu = MAX_JUMBO_MTU; break; case 'P': - options.pkt_pad_sz = strtol( - optarg, nullptr, 10); + options.pkt_pad_sz = strtol(optarg, nullptr, + 10); + break; + case 'p': + options.portid = strtol(optarg, nullptr, 10); break; default: usage(); - rte_exit( - EXIT_FAILURE, "unknown argument: %c\n", c); + rte_exit(EXIT_FAILURE, "unknown argument: %c\n", + c); } } } - if (options.pkt_pad_sz != 0 && options.pkt_pad_sz > mtu_to_pkt_size(options.port_mtu)) { - rte_exit(EXIT_FAILURE, "pkt_pad_sz is too large for mtu %d\n", options.port_mtu); + if (options.pkt_pad_sz != 0 && + options.pkt_pad_sz > mtu_to_pkt_size(options.port_mtu)) { + rte_exit(EXIT_FAILURE, "pkt_pad_sz is too large for mtu %d\n", + options.port_mtu); } if (!has_host_spec) { rte_exit(EXIT_FAILURE, "Must specify host IP.\n"); } - // init nm - if (nm_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) { - rte_exit(EXIT_FAILURE, "nm init failed!\n"); + // init libtopo + if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != + 0) { + rte_exit(EXIT_FAILURE, "libtopo init failed!\n"); + } + + if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != + 0) { + rte_exit(EXIT_FAILURE, "libnms init failed!\n"); } dump_options(); - nb_ports = rte_eth_dev_count_avail(); - if (nb_ports == 0) { - rte_exit(EXIT_FAILURE, "number of ports must be > 0\n"); + // configure memory and port + struct port_conf pconf; + struct device_conf dconf; + struct mem_conf mconf; + portconf_get(options.portid, &pconf); + if (!pconf.timesync) { + ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, + "main: timesync disabled. hw timestamp unavailable.\n "); } + dconf.mtu = options.port_mtu; + dconf.num_threads = options.s_num_threads; + dconf.portid = options.portid; + dconf.rss_hf = pconf.rss_hf; + dconf.rx_offloads = pconf.rxoffload; + dconf.tx_offloads = pconf.txoffload; + dconf.timesync = pconf.timesync; - uint16_t portid = rte_eth_find_next(0); - if (portid == RTE_MAX_ETHPORTS) { - rte_exit(EXIT_FAILURE, "cannot find an available port\n"); - } - options.s_portid = portid; + dconf.rx_fn = nullptr; + dconf.rx_user = nullptr; + dconf.rx_ring_sz = 2048; + dconf.tx_fn = nullptr; + dconf.tx_user = nullptr; + dconf.tx_ring_sz = 2048; - if (rte_eth_macaddr_get(portid, &options.s_host_spec.mac_addr) != 0) { + mconf.cache_size = 512; + mconf.priv_size = 0; + mconf.num_elements = (dconf.rx_ring_sz + dconf.tx_ring_sz) * + rte_lcore_count() / rte_socket_count(); + mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_JUMBO_MTU - + MAX_STANDARD_MTU; + mconf.max_pools = -1; + + dpdk_init(&dconf, &mconf); + + if (rte_eth_macaddr_get(options.portid, + &options.s_host_spec.mac_addr) != 0) { rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n", - portid); + options.portid); } - // create a mbuf memory pool on the socket - mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT, - MBUF_CACHE_SIZE, 0, - options.jumbo_frame_enabled ? - RTE_MBUF_DEFAULT_BUF_SIZE + (MAX_JUMBO_MTU - MAX_STANDARD_MTU) : - RTE_MBUF_DEFAULT_BUF_SIZE, - rte_eth_dev_socket_id(options.s_portid)); - if (mbuf_pool == nullptr) { - rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n"); - } - options.mbuf_pool = mbuf_pool; + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", + options.portid, options.s_host_spec.mac_addr.addr_bytes[0], + options.s_host_spec.mac_addr.addr_bytes[1], + options.s_host_spec.mac_addr.addr_bytes[2], + options.s_host_spec.mac_addr.addr_bytes[3], + options.s_host_spec.mac_addr.addr_bytes[4], + options.s_host_spec.mac_addr.addr_bytes[5]); unsigned int cpuset_idx = CPU_FFS(&options.cpu_set); unsigned int tid = 0; - while(cpuset_idx != 0) { + while (cpuset_idx != 0) { unsigned int lcore_id = cpuset_idx - 1; tinfo = new thread_info; tinfo->ia_gen = createGenerator(options.ia_gen); tinfo->load_gen = createGenerator(options.ld_gen); if (tinfo->ia_gen == nullptr || tinfo->load_gen == nullptr) { - rte_exit( - EXIT_FAILURE, "invalid ia_gen or ld_gen string\n"); + rte_exit(EXIT_FAILURE, + "invalid ia_gen or ld_gen string\n"); } tinfo->ia_gen->set_lambda((double)options.target_qps / (double)(options.s_num_threads)); tinfo->id = tid; tinfo->lcore_id = lcore_id; + tinfo->socket_id = rte_lcore_to_socket_id(lcore_id); tinfo->rxqid = tid; tinfo->txqid = tid; options.s_thr_info.push_back(tinfo); @@ -901,19 +837,6 @@ main(int argc, char *argv[]) cpuset_idx = CPU_FFS(&options.cpu_set); } - if (port_init(portid, mbuf_pool) != 0) { - rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid); - } - - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, - "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid, - options.s_host_spec.mac_addr.addr_bytes[0], - options.s_host_spec.mac_addr.addr_bytes[1], - options.s_host_spec.mac_addr.addr_bytes[2], - options.s_host_spec.mac_addr.addr_bytes[3], - options.s_host_spec.mac_addr.addr_bytes[4], - options.s_host_spec.mac_addr.addr_bytes[5]); - sleep(INIT_DELAY); for (unsigned int i = 0; i < options.s_num_threads; i++) { @@ -958,7 +881,7 @@ main(int argc, char *argv[]) uint32_t qps; uint32_t total_recv; uint32_t total_loss; - calc_stats(nm_get_uptime_ns(), &qps, &total_recv, &total_loss); + calc_stats(topo_uptime_ns(), &qps, &total_recv, &total_loss); ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "qps = %d, recv = %d, loss = %d\n", qps, total_recv, total_loss); @@ -969,7 +892,7 @@ main(int argc, char *argv[]) } // clean up - rte_eth_dev_stop(portid); + dpdk_cleanup(&dconf); return 0; } \ No newline at end of file diff --git a/scripts/libs/libtc.py b/scripts/libs/libtc.py index 839ea08..9cf1e80 100644 --- a/scripts/libs/libtc.py +++ b/scripts/libs/libtc.py @@ -1,3 +1,4 @@ +from lib2to3.refactor import get_fixers_from_package import subprocess as sp import time import select @@ -95,12 +96,14 @@ def check_stderr(p, sel, exclude = []):# -> tuple[bool, list[str]]: good = True for e in err: + e = e.strip() if len(e) == 0: continue + good = False for exc in exclude: - if not (exc in err): - good = False + if exc in e: + good = True break return good, err @@ -124,9 +127,10 @@ def thr_check_stderr(p : sp.Popen, name: str, exclude): if not status: errthr_failed = True local_failed = True - log_print("Error detected in \"" + name + "\":\n") + log_print("Error detected in \"" + name + "\":") for e in err: - log_print(" " + e + "\n") + log_print(" \"" + e + "\"") + log_print("") time.sleep(random.uniform(0.001, 0.1)) def errthr_start(): diff --git a/scripts/run.py b/scripts/run.py index c8b3e92..f78740b 100755 --- a/scripts/run.py +++ b/scripts/run.py @@ -18,15 +18,15 @@ loadgen_load = "fixed:0" # pkt_pad jumbo_frame_threshold = 1518 pkt_pads = [ + #"9018", + #1518", "0", "256", "512", - "1024", - #"1518", + "1024" #"2048", #"4096", - #"8192", - #"9018" + #"8192" ] pkt_pads_depth = {} @@ -74,25 +74,32 @@ memgen_target = [ # paths -test_dir = "/numam.d/build" +test_dir = "/numam.d/build/bin" file_dir = os.path.dirname(os.path.realpath(__file__)) root_dir = os.path.join(file_dir,"..") sample_filename = "sample.txt" affinity = [ - #"1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47", - "49,51,53,55,57,59,61,63,65,67,69,71,73,75,77,79,81,83,85,87,89,91,93,95" + "1,3,5,7,9,11,13,15,17,19,21,23", + "65,67,69,71,73,75,77,79,81,83,85,87,89,91,93,95,97,99,101,103,105,107,109,111,113,115,117,119,121,123,125,127", + "1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63", + "1,3,5,7,9,11,13,15", + "17,19,21,23,25,27,29,31", + "33,35,37,39,41,43,45,47", + "49,51,53,55,57,59,61,63" ] -master = ["skylake2.rcs.uwaterloo.ca"] -master_spec = ["192.168.123.10@3c:15:fb:c9:f3:36"] +server = ["skylake5.rcs.uwaterloo.ca"] +server_spec = ["192.168.123.13@3c:15:fb:c9:f3:28"] master_cpumask = "2" # 1 thread -server = ["icelake2-int.rcs.uwaterloo.ca"] -server_spec = ["192.168.123.9@3c:ec:ef:61:39:f3"] +master = ["icelake2-int.rcs.uwaterloo.ca"] +master_spec = ["192.168.123.9@40:a6:b7:7c:86:10"] +#server = ["milan1-int.rcs.uwaterloo.ca"] +#server_spec = ["192.168.123.9@00:07:43:54:37:08"] -clients = ["skylake3.rcs.uwaterloo.ca", "skylake5.rcs.uwaterloo.ca", "skylake6.rcs.uwaterloo.ca"]# "skylake7.rcs.uwaterloo.ca", "skylake8.rcs.uwaterloo.ca"] -client_spec = ["192.168.123.12@3c:15:fb:c9:f3:4b", "192.168.123.13@3c:15:fb:c9:f3:28", "192.168.123.14@3c:15:fb:62:9b:2f"] #, "192.168.123.13@3c:15:fb:62:9c:be"] +clients = ["skylake2.rcs.uwaterloo.ca"]#, "skylake5.rcs.uwaterloo.ca", "skylake6.rcs.uwaterloo.ca"]# "skylake7.rcs.uwaterloo.ca", "skylake8.rcs.uwaterloo.ca"] +client_spec = ["192.168.123.12@3c:15:fb:c9:f3:36"]#, "192.168.123.13@3c:15:fb:c9:f3:28", "192.168.123.14@3c:15:fb:62:9b:2f"] #, "192.168.123.13@3c:15:fb:62:9c:be"] client_cpumask = "1,3,5,7,9,11,13,15,17,19,21,23" client_rage_quit = 1000 #1s @@ -191,7 +198,7 @@ def run_exp(affinity : str, ld : int, pkt_pad : str, aff_idx : int): p = sp[0] # launch stderr monitoring thread - exclude = ["Pseudo-terminal"] + exclude = ["Pseudo-terminal", "ice_", "i40e_"] tc.errthr_create([p], master, exclude) if not client_only: tc.errthr_create(ssrv, server, exclude) diff --git a/scripts/setup_dpdk.sh b/scripts/setup_dpdk.sh index a287dc5..4023eb6 100755 --- a/scripts/setup_dpdk.sh +++ b/scripts/setup_dpdk.sh @@ -1,7 +1,8 @@ #!/bin/sh dpdk_dir="/dpdk" +libtopo_dir="/libtopo" root="$(dirname "$0")/.." -servers="skylake3.rcs.uwaterloo.ca" # skylake5.rcs.uwaterloo.ca skylake6.rcs.uwaterloo.ca skylake7.rcs.uwaterloo.ca skylake8.rcs.uwaterloo.ca" +servers="skylake2.rcs.uwaterloo.ca skylake3.rcs.uwaterloo.ca skylake5.rcs.uwaterloo.ca skylake6.rcs.uwaterloo.ca icelake2-int.rcs.uwaterloo.ca milan1-int.rcs.uwaterloo.ca" rsync_flags="-az" ssh_args="-o StrictHostKeyChecking=no -p77" @@ -18,8 +19,8 @@ echo "USER: $user" compile() { # separate these functions because we might change kernel (reboot) without needing to recompile echo "====================$1====================" - #ssh $(echo $ssh_args $user@$1) "sudo reboot" - ssh $(echo $ssh_args $user@$1) "sudo sh -c \"rm -rf $dpdk_dir; mkdir -p $dpdk_dir; cd $dpdk_dir; git clone https://git.quacker.org/d/numam-dpdk; cd numam-dpdk; git checkout releases-13.0; meson -Denable_kmods=true build; cd build; ninja install\"" + ssh $(echo $ssh_args $user@$1) "sudo sh -c \"sudo rm -rf $libtopo_dir; sudo rm -rf /usr/local/include/libtopo; sudo rm -rf /usr/local/lib/libtopo;sudo mkdir -p $libtopo_dir; sudo chmod 777 $libtopo_dir; cd $libtopo_dir; git clone https://git.quacker.org/d/libtopo; cd libtopo; mkdir build; cd build; cmake ../; sudo make install\"" + ssh $(echo $ssh_args $user@$1) "sudo sh -c \"sudo pkg install -y meson pkgconf py38-pyelftools; sudo rm -rf $dpdk_dir; sudo mkdir -p $dpdk_dir; sudo chmod 777 $dpdk_dir; cd $dpdk_dir; git clone https://git.quacker.org/d/numam-dpdk; cd numam-dpdk; git checkout migration; CC=gcc CXX=g++ meson -Denable_kmods=true build; cd build; ninja install\"" wait echo "$1 Done." echo "" diff --git a/scripts/setup_program.sh b/scripts/setup_program.sh index 58c27d4..ac7497f 100755 --- a/scripts/setup_program.sh +++ b/scripts/setup_program.sh @@ -1,7 +1,7 @@ #!/bin/sh dpdk_dir="/numam.d" root="$(dirname "$0")/.." -servers="icelake2-int.rcs.uwaterloo.ca" # skylake7.rcs.uwaterloo.ca skylake8.rcs.uwaterloo.ca icelake2-int.rcs.uwaterloo.ca" +servers="skylake2.rcs.uwaterloo.ca skylake3.rcs.uwaterloo.ca skylake5.rcs.uwaterloo.ca skylake6.rcs.uwaterloo.ca icelake2-int.rcs.uwaterloo.ca milan1-int.rcs.uwaterloo.ca" rsync_flags="-rv -e \"ssh -p77\"" ssh_args="-o StrictHostKeyChecking=no -p77" diff --git a/test/ts.cc b/test/ts.cc deleted file mode 100644 index e961642..0000000 --- a/test/ts.cc +++ /dev/null @@ -1,84 +0,0 @@ -#include -#include "defs.hh" -#include "nm.hh" -#include - -void test(const char * case_name, struct timespec * ts) -{ - uint64_t slow; - uint64_t fast; - - slow = get_uptime(); - if (nanosleep(ts, nullptr) != 0) { - perror("nanosleep() interrupted!"); - exit(-1); - } - slow = get_uptime() - slow; - - fast = nm_get_uptime_ns(); - if (nanosleep(ts, nullptr) != 0) { - perror("nanosleep() interrupted!"); - exit(-1); - } - fast = nm_get_uptime_ns() - fast; - - printf("%s: get_uptime(): %lu, nm_get_uptime_ns(): %lu\n", case_name, slow, fast); -} - - -int main() -{ - struct timespec ts; - nm_init(0); - // 1s - ts.tv_nsec = 0; - ts.tv_sec = 1; - test("1s", &ts); - - // 100ms - ts.tv_nsec = 100000000; - ts.tv_sec = 0; - test("100ms", &ts); - - // 10ms - ts.tv_nsec = 10000000; - ts.tv_sec = 0; - test("10ms", &ts); - - // 1ms - ts.tv_nsec = 1000000; - ts.tv_sec = 0; - test("1ms", &ts); - - // 100us - ts.tv_nsec = 100000; - ts.tv_sec = 0; - test("100us", &ts); - - // 10us - ts.tv_nsec = 10000; - ts.tv_sec = 0; - test("10us", &ts); - - // 1us - ts.tv_nsec = 1000; - ts.tv_sec = 0; - test("1us", &ts); - - // 100ns - ts.tv_nsec = 100; - ts.tv_sec = 0; - test("100ns", &ts); - - // 10ns - ts.tv_nsec = 10; - ts.tv_sec = 0; - test("10ns", &ts); - - // 1ns - ts.tv_nsec = 1; - ts.tv_sec = 0; - test("1ns", &ts); - - return 0; -} \ No newline at end of file diff --git a/tests/nms_test.c b/tests/nms_test.c new file mode 100644 index 0000000..85544cd --- /dev/null +++ b/tests/nms_test.c @@ -0,0 +1,32 @@ +#include "nms.h" +#include +#include + +int main(void) +{ + void * ret; + + nms_init(1); + // duplicate init + nms_init(1); + + // 1G + ret = nms_malloc(0, 1024 * 1024 * 1024); + assert(ret != NULL); + printf("1G: %p\n", ret); + + // two 511Ms + ret = nms_malloc(0, 511 * 1024 * 1024); + assert(ret != NULL); + printf("511M: %p\n", ret); + ret = nms_malloc(0, 511 * 1024 * 1024); + assert(ret != NULL); + printf("511M: %p\n", ret); + + // another 1G + ret = nms_malloc(0, 1024 * 1024 * 1024); + assert(ret != NULL); + printf("1G: %p\n", ret); + + return 0; +} \ No newline at end of file diff --git a/util/memloadgen.cc b/util/memloadgen.cc index c17edb9..7e9c9ab 100644 --- a/util/memloadgen.cc +++ b/util/memloadgen.cc @@ -1,8 +1,9 @@ -#include "nm.hh" +#include "gen.hh" #include #include "ntr.h" #include #include +#include static void usage() @@ -12,7 +13,9 @@ usage() " -v: verbose mode\n" " -b: MLG bytes per second\n" " -x: MLG thread affinity mask\n" - " -X: MLG target domain affinity mask\n"); + " -X: MLG target domain affinity mask\n" + " -S: shared buffer\n" + " -i: iterations\n"); fflush(stdout); } @@ -21,14 +24,15 @@ int main(int argc, char * argv[]) ntr_init(); ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING); - unsigned long long mlg_arrsz = 0; - uint32_t mlg_iter = -1; - unsigned long long mlg_dmask = 0; - unsigned long long mlg_cmask = 0; + size_t arr_sz = 0; + uint32_t iter = -1; + cpuset_t threads; + int shared_buffer = 0; + cpuset_t domain_mask; { int c; // parse arguments - while ((c = getopt(argc, argv, "hb:X:x:vi:")) != -1) { + while ((c = getopt(argc, argv, "hb:X:x:vi:S")) != -1) { switch (c) { case 'v': ntr_set_level(NTR_DEP_USER1, @@ -38,18 +42,19 @@ int main(int argc, char * argv[]) usage(); exit(0); case 'b': - mlg_arrsz = strtoull(optarg, nullptr, 10); + arr_sz = strtoull(optarg, nullptr, 10); break; case 'i': - mlg_iter = strtoul(optarg, nullptr, 10); + iter = strtoul(optarg, nullptr, 10); break; case 'X': - mlg_dmask = strtoull( - optarg, nullptr, 16); + cpulist_to_cpuset(optarg, &domain_mask); break; case 'x': - mlg_cmask = strtoull( - optarg, nullptr, 16); + cpulist_to_cpuset(optarg, &threads); + break; + case 'S': + shared_buffer = 1; break; default: usage(); @@ -58,18 +63,22 @@ int main(int argc, char * argv[]) } } - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "MLG: [size: %llu, iter: %u, threads: 0x%llx, domain: 0x%llx]\n", mlg_arrsz, mlg_iter, mlg_cmask, mlg_dmask); + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "MLG: [size: %ld iter: %u, # threads: 0x%d, domain: 0x%ld]\n", arr_sz, iter, CPU_COUNT(&threads), CPU_FFS(&domain_mask) - 1); - // init nm - if (nm_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) { - fprintf(stderr, "nm init failed!\n"); + // init topo + if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) { + fprintf(stderr, "libtopo init failed!\n"); exit(1); } bool success = false; - memload_generator * mgen = new memload_generator(mlg_cmask, mlg_dmask, mlg_arrsz, mlg_iter, &success); + memload_generator::memload_generator_options opts; + opts.chunk_size = arr_sz; + opts.iteration = iter; + opts.shared_buffer = shared_buffer; + opts.verbose = 1; - mgen->start(); + auto mgen = new memload_generator(&threads, &domain_mask, &opts, &success); while(!mgen->check_done()) { usleep(10000); } @@ -78,6 +87,8 @@ int main(int argc, char * argv[]) ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: MLG bps = %ld ~= %ldM\n", bps, bps / 1024 / 1024); + delete mgen; + return 0; }