diff --git a/.arcconfig b/.arcconfig new file mode 100644 index 0000000..fd6be34 --- /dev/null +++ b/.arcconfig @@ -0,0 +1,3 @@ +{ + "phabricator.uri" : "https://review.rcs.uwaterloo.ca/" +} diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..3823fa5 --- /dev/null +++ b/.clang-format @@ -0,0 +1,194 @@ +# $FreeBSD$ +# Basic .clang-format +--- +BasedOnStyle: WebKit +AlignAfterOpenBracket: DontAlign +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left +AlignOperands: false +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: InlineOnly +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterReturnType: TopLevelDefinitions +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: MultiLine +BinPackArguments: true +BinPackParameters: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: WebKit +BreakBeforeTernaryOperators: false +# TODO: BreakStringLiterals can cause very strange formatting so turn it off? 
+BreakStringLiterals: false +# Prefer: +# some_var = function(arg1, +# arg2) +# over: +# some_var = +# function(arg1, arg2) +PenaltyBreakAssignment: 100 +# Prefer: +# some_long_function(arg1, arg2 +# arg3) +# over: +# some_long_function( +# arg1, arg2, arg3) +PenaltyBreakBeforeFirstCallParameter: 100 +CompactNamespaces: true +DerivePointerAlignment: false +DisableFormat: false +ForEachMacros: + - ARB_ARRFOREACH + - ARB_ARRFOREACH_REVWCOND + - ARB_ARRFOREACH_REVERSE + - ARB_FOREACH + - ARB_FOREACH_FROM + - ARB_FOREACH_SAFE + - ARB_FOREACH_REVERSE + - ARB_FOREACH_REVERSE_FROM + - ARB_FOREACH_REVERSE_SAFE + - CPU_FOREACH + - FOREACH_THREAD_IN_PROC + - FOREACH_PROC_IN_SYSTEM + - FOREACH_PRISON_CHILD + - FOREACH_PRISON_DESCENDANT + - FOREACH_PRISON_DESCENDANT_LOCKED + - FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL + - MNT_VNODE_FOREACH_ALL + - MNT_VNODE_FOREACH_ACTIVE + - RB_FOREACH + - RB_FOREACH_FROM + - RB_FOREACH_SAFE + - RB_FOREACH_REVERSE + - RB_FOREACH_REVERSE_FROM + - RB_FOREACH_REVERSE_SAFE + - SLIST_FOREACH + - SLIST_FOREACH_FROM + - SLIST_FOREACH_FROM_SAFE + - SLIST_FOREACH_SAFE + - SLIST_FOREACH_PREVPTR + - SPLAY_FOREACH + - LIST_FOREACH + - LIST_FOREACH_FROM + - LIST_FOREACH_FROM_SAFE + - LIST_FOREACH_SAFE + - STAILQ_FOREACH + - STAILQ_FOREACH_FROM + - STAILQ_FOREACH_FROM_SAFE + - STAILQ_FOREACH_SAFE + - TAILQ_FOREACH + - TAILQ_FOREACH_FROM + - TAILQ_FOREACH_FROM_SAFE + - TAILQ_FOREACH_REVERSE + - TAILQ_FOREACH_REVERSE_FROM + - TAILQ_FOREACH_REVERSE_FROM_SAFE + - TAILQ_FOREACH_REVERSE_SAFE + - TAILQ_FOREACH_SAFE + - VM_MAP_ENTRY_FOREACH + - VM_PAGE_DUMP_FOREACH +IndentCaseLabels: false +IndentPPDirectives: None +Language: Cpp +NamespaceIndentation: None +PointerAlignment: Right +ContinuationIndentWidth: 4 +IndentWidth: 8 +TabWidth: 8 +ColumnLimit: 80 +UseTab: Always +SpaceAfterCStyleCast: false +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^\"opt_.*\.h\"' + Priority: 1 + SortPriority: 10 + - Regex: '^' + Priority: 2 + SortPriority: 20 + - Regex: '^' + 
Priority: 2 + SortPriority: 21 + - Regex: '^' + Priority: 2 + SortPriority: 22 + - Regex: '^' + Priority: 2 + SortPriority: 23 + - Regex: '^' + Priority: 3 + SortPriority: 30 + - Regex: '^ + int main() + { + hwloc_topology_t topology; + int nbcores; + hwloc_topology_init(&topology); + hwloc_topology_load(topology); + nbcores = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE); + hwloc_topology_destroy(topology); + return 0; + } + " + ) + + TRY_COMPILE(_LINK_SUCCESS ${CMAKE_BINARY_DIR} "${_TEST_SOURCE}" + CMAKE_FLAGS + "-DINCLUDE_DIRECTORIES:STRING=${Hwloc_INCLUDE_DIR}" + CMAKE_FLAGS + "-DLINK_LIBRARIES:STRING=${Hwloc_LIBRARY}" + ) + + IF(NOT _LINK_SUCCESS) + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + message(STATUS "You are building 64bit target.") + ELSE() + message(STATUS "You are building 32bit code. If you like to build x64 use e.g. -G 'Visual Studio 12 Win64' generator." ) + ENDIF() + message(FATAL_ERROR "Library found, but linking test program failed.") + ENDIF() + + # + # Resolve version if some compiled binary found... 
+ # + find_program(HWLOC_INFO_EXECUTABLE + NAMES + hwloc-info + PATHS + ENV HWLOC_ROOT + PATH_SUFFIXES + bin + ) + + if(HWLOC_INFO_EXECUTABLE) + execute_process( + COMMAND ${HWLOC_INFO_EXECUTABLE} "--version" + OUTPUT_VARIABLE HWLOC_VERSION_LINE + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + string(REGEX MATCH "([0-9]+.[0-9]+)$" + Hwloc_VERSION "${HWLOC_VERSION_LINE}") + unset(HWLOC_VERSION_LINE) + endif() + + # + # All good + # + + set(Hwloc_LIBRARIES ${Hwloc_LIBRARY}) + set(Hwloc_INCLUDE_DIRS ${Hwloc_INCLUDE_DIR}) + + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args( + Hwloc + FOUND_VAR Hwloc_FOUND + REQUIRED_VARS Hwloc_LIBRARY Hwloc_INCLUDE_DIR Hwloc_VERSION_PARSED Hwloc_VERSION_MAJOR Hwloc_VERSION_MINOR + VERSION_VAR Hwloc_VERSION) + + mark_as_advanced( + Hwloc_INCLUDE_DIR + Hwloc_LIBRARY) + + foreach(arg ${Hwloc_INCLUDE_DIRS}) + set(Hwloc_CFLAGS "${Hwloc_CFLAGS} /I${arg}") + endforeach() + + set(Hwloc_LDFLAGS "${Hwloc_LIBRARY}") + +else() + + if(CMAKE_CROSSCOMPILING) + + find_path(Hwloc_INCLUDE_DIRS + NAMES + hwloc.h + PATHS + ENV HWLOC_ROOT + ) + + find_library(Hwloc_LIBRARIES + NAMES + hwloc + PATHS + ENV HWLOC_ROOT + ) + + if(Hwloc_INCLUDE_DIRS AND Hwloc_LIBRARIES) + message(WARNING "HWLOC library found using find_library() - cannot determine version. 
Assuming 1.7.0") + set(Hwloc_FOUND 1) + set(Hwloc_VERSION "1.7.0") + endif() + + else() # Find with pkgconfig for non-crosscompile builds + + find_package(PkgConfig) + + if(HWLOC_ROOT) + set(ENV{PKG_CONFIG_PATH} "${HWLOC_ROOT}/lib/pkgconfig") + else() + foreach(PREFIX ${CMAKE_PREFIX_PATH}) + set(PKG_CONFIG_PATH "${PKG_CONFIG_PATH}:${PREFIX}/lib/pkgconfig") + endforeach() + set(ENV{PKG_CONFIG_PATH} "${PKG_CONFIG_PATH}:$ENV{PKG_CONFIG_PATH}") + endif() + + if(hwloc_FIND_REQUIRED) + set(_hwloc_OPTS "REQUIRED") + elseif(hwloc_FIND_QUIETLY) + set(_hwloc_OPTS "QUIET") + else() + set(_hwloc_output 1) + endif() + + if(hwloc_FIND_VERSION) + if(hwloc_FIND_VERSION_EXACT) + pkg_check_modules(Hwloc ${_hwloc_OPTS} hwloc=${hwloc_FIND_VERSION}) + else() + pkg_check_modules(Hwloc ${_hwloc_OPTS} hwloc>=${hwloc_FIND_VERSION}) + endif() + else() + pkg_check_modules(Hwloc ${_hwloc_OPTS} hwloc) + endif() + + if(Hwloc_FOUND) + string(REPLACE "." ";" Hwloc_VERSION_PARSED "${Hwloc_VERSION}") + set(Hwloc_VERSION "${Hwloc_VERSION}" CACHE STRING "version of Hwloc as a list") + list(GET Hwloc_VERSION_PARSED 0 Hwloc_VERSION_MAJOR) + set(Hwloc_VERSION_MAJOR "${Hwloc_VERSION_MAJOR}" CACHE STRING "Major version of Hwloc") + list(GET Hwloc_VERSION_PARSED 1 Hwloc_VERSION_MINOR) + set(Hwloc_VERSION_MINOR "${Hwloc_VERSION_MINOR}" CACHE STRING "Minor version of Hwloc") + + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(Hwloc DEFAULT_MSG Hwloc_LIBRARIES) + + if(NOT ${Hwloc_VERSION} VERSION_LESS 1.7.0) + set(Hwloc_GL_FOUND 1) + endif() + + if(_hwloc_output) + message(STATUS + "Found hwloc ${Hwloc_VERSION} in ${Hwloc_INCLUDE_DIRS}:${Hwloc_LIBRARIES}") + endif() + endif() + + endif() # cross-compile else + +endif() \ No newline at end of file diff --git a/cat/cat.cc b/cat/cat.cc index 4407683..1ac6929 100644 --- a/cat/cat.cc +++ b/cat/cat.cc @@ -1,234 +1,629 @@ -#include -#include -#include -#include +#include #include +#include +#include #include #include -#include -#include 
-#include #include #include -#include -#include -#include -#include +#include +#include #include -#include "ntrlog.h" +#include "gen.h" +#include "nm.h" +#include "ntr.h" #include "pkt.h" -#include "rte_byteorder.h" -#include "rte_ip.h" +#include "util.h" -// init NTRLOG -NTR_DECL_IMPL; +#include +#include +#include +#include +#include -constexpr unsigned int MBUF_MAX_COUNT = 8191; -constexpr unsigned int MBUF_CACHE_SIZE = 250; -constexpr unsigned int RX_RING_SIZE = 1024; -constexpr unsigned int TX_RING_SIZE = 1024; -constexpr unsigned int RX_RING_NUM = 1; -constexpr unsigned int TX_RING_NUM = 1; -constexpr unsigned int BURST_SIZE = 32; +constexpr static unsigned int MBUF_MAX_COUNT = 65536; +constexpr static unsigned int MBUF_CACHE_SIZE = 512; +constexpr static unsigned int RX_RING_SIZE = 4096; +constexpr static unsigned int TX_RING_SIZE = 4096; +constexpr static unsigned int BURST_SIZE = 8; +constexpr static unsigned int MAX_SLAVES = 32; -static const struct rte_eth_conf port_conf_default{}; +static const struct rte_eth_conf port_conf_default { +}; -struct datapt{ - uint64_t server_proc = 0; - uint64_t rtt = 0; +struct datapt { + uint32_t epoch; + uint32_t valid; + uint64_t clt_hw_tx; + uint64_t clt_sw_tx; + uint64_t clt_hw_rx; + uint64_t clt_sw_rx; + uint64_t srv_hw_tx; + uint64_t srv_sw_tx; + uint64_t srv_hw_rx; + uint64_t srv_sw_rx; }; struct options_t { - unsigned int run_time = 5; - unsigned int warmup_time = 0; - char output[256] = "output.txt"; - struct rte_ether_addr server_mac; - // states - std::atomic s_stop {false}; - std::atomic s_record {false}; - std::vector s_stats; - struct rte_mempool * s_mbuf_pool; - uint16_t s_portid; - struct rte_ether_addr s_host_mac; + // parameters + unsigned int run_time { 5 }; + unsigned int warmup_time { 3 }; + char output[256] = "output.txt"; + char ia_gen_str[256] = "fixed"; + unsigned int target_qps { 0 }; + unsigned int master_mode { 0 }; + struct net_spec server_spec { + }; + uint64_t cpu_mask { 0x4 }; // 2nd core + 
std::vector slaves; + unsigned long rage_quit_time { (unsigned long)-1 }; + + // states + struct rte_mempool *mbuf_pool { nullptr }; + struct net_spec s_host_spec { + }; + struct conn_spec s_host_conn { + .src = &s_host_spec, .dst = &server_spec, .dst_port = POU_PORT + }; + uint16_t s_portid { 0 }; + unsigned int s_rxqid { 0 }; + unsigned int s_txqid { 0 }; + // for qps calculation + unsigned int s_total_pkts { 0 }; + std::atomic s_start_time { 0 }; + std::atomic s_end_time { 0 }; + std::atomic s_slave_qps { 0 }; + + Generator *s_iagen { nullptr }; + std::vector s_data; + struct datapt *s_last_datapt { nullptr }; + uint32_t s_epoch { 0 }; + std::atomic s_stop { false }; + std::atomic s_record { 0 }; }; -struct options_t options; +static struct options_t options; static uint16_t -rx_calc_latency(uint16_t port __rte_unused, uint16_t qidx __rte_unused, - struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused, void *_ __rte_unused) +rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused, + struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused, + void *_ __rte_unused) { - // XXX: need to get the timestamp in every loop? 
- uint64_t now = rte_rdtsc(); - struct packet_data * pkt_data; + uint64_t now = nm_tsc2ns(rte_rdtsc()); + struct pkt_hdr *pkt_data; + struct timespec ts { + }; + int ret; - for (int i = 0; i < nb_pkts; i++) { - pkt_data = check_valid_packet(pkts[i]); + for (int i = 0; i < nb_pkts; i++) { + pkt_data = check_valid_packet( + pkts[i], &options.s_host_spec.mac_addr); - if (pkt_data == NULL) { - ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "rx_calc_latency: ignoring invalid packet 0x%p.\n", (void*)pkts[i]); - continue; - } - - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "rx_calc_latency: tagged packet %p with %llu.\n", (void*)pkts[i], now); - pkt_data->clt_ts_rx = rte_cpu_to_be_64(now); - } + if (pkt_data == nullptr) { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "rx_add_timestamp: ignoring invalid packet 0x%p.\n", + (void *)pkts[i]); + continue; + } - return nb_pkts; + if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE_RESP) { + uint32_t epoch = rte_be_to_cpu_32( + ((struct pkt_payload_epoch *)pkt_data->payload) + ->epoch); + if (options.s_last_datapt != nullptr && + options.s_last_datapt->epoch == epoch) { + if ((ret = rte_eth_timesync_read_rx_timestamp( + port, &ts, pkts[i]->timesync & 0x3)) == + 0) { + // has hw rx timestamp + options.s_last_datapt->clt_hw_rx = + ts.tv_sec * S2NS + ts.tv_nsec; + options.s_last_datapt->clt_sw_rx = now; + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "rx_add_timestamp: tagged packet %p with sw: %lu hw: %lu.\n", + (void *)pkts[i], now, + options.s_last_datapt->clt_hw_rx); + } else { + rte_exit(EXIT_FAILURE, + "rx_add_timestamp: packet %p not tagged - hw ts not available - %d.\n", + (void *)pkts[i], ret); + } + } else { + ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, + "rx_add_timestamp: packet %p epoch %d != last epoch %d.\n", + (void *)pkts[i], epoch, + options.s_last_datapt->epoch); + } + } else { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "rx_add_timestamp: packet %p not tagged - type %d.\n", + (void *)pkts[i], rte_be_to_cpu_16(pkt_data->type)); + } + } + + return 
nb_pkts; } static uint16_t tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused, - struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused) + struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused) { - // XXX: need to get the timestamp in every loop? - uint64_t now = rte_rdtsc(); - struct packet_data * pkt_data; + uint64_t now = nm_tsc2ns(rte_rdtsc()); + struct pkt_hdr *pkt_data; - for (int i = 0; i < nb_pkts; i++) { - pkt_data = check_valid_packet(pkts[i]); + for (int i = 0; i < nb_pkts; i++) { + pkt_data = check_valid_packet( + pkts[i], &options.s_host_spec.mac_addr); - if (pkt_data == NULL) { - ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "tx_add_timestamp: ignoring invalid packet 0x%p.\n", (void*)pkts[i]); - continue; - } + if (pkt_data == nullptr) { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "tx_add_timestamp: ignoring invalid packet 0x%p.\n", + (void *)pkts[i]); + continue; + } - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "tx_add_timestamp: tagged packet %p with %llu.\n", (void*)pkts[i], now); - pkt_data->clt_ts_tx = rte_cpu_to_be_64(now); - } + if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE) { + uint32_t epoch = rte_be_to_cpu_32( + ((struct pkt_payload_epoch *)pkt_data->payload) + ->epoch); - return nb_pkts; + if (options.s_last_datapt == nullptr || + epoch != options.s_last_datapt->epoch) { + rte_exit(EXIT_FAILURE, + "tx_add_timestamp: packet epoch %d != last epoch %d\n", + epoch, options.s_last_datapt->epoch); + } + + options.s_last_datapt->clt_sw_tx = now; + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "tx_add_timestamp: tagged packet %p with sw: %lu.\n", + (void *)pkts[i], now); + } else { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "tx_add_timestamp: packet %p not tagged - type %d.\n", + (void *)pkts[i], pkt_data->type); + } + } + + return nb_pkts; } -#define STATE_SEND (0) -#define STATE_RECV (1) +// returns 0 on success +static void +send_all_slaves(uint16_t type) +{ + struct rte_mbuf *tx_bufs[MAX_SLAVES]; + + struct conn_spec cspec; + 
cspec.src = &options.s_host_spec; + cspec.dst_port = DEFAULT_RAT_PORT; + cspec.src_port = DEFAULT_RAT_PORT; + + // send all clients SYNC + for (unsigned int i = 0; i < options.slaves.size(); i++) { + struct pkt_hdr *hdr; + cspec.dst = options.slaves.at(i); + if (alloc_pkt_hdr(options.mbuf_pool, type, &cspec, &tx_bufs[i], + &hdr) != 0) { + rte_exit(EXIT_FAILURE, "failed to alloc packet\n"); + } + } + + if (rte_eth_tx_burst(options.s_portid, options.s_txqid, tx_bufs, + options.slaves.size()) != options.slaves.size()) { + rte_exit(EXIT_FAILURE, "failed to send some packets\n"); + } +} + +// sizeof mbuf must >= MAX_SLAVES +// this function fills up to #slave +static void +wait_for_slaves(uint16_t etype, struct rte_mbuf **out) +{ + struct rte_mbuf *tx_bufs[MAX_SLAVES]; + bool stop = false; + const uint64_t start = nm_get_uptime_ns(); + std::vector recved; + uint32_t tot = 0; + + while (!stop) { + uint64_t now = nm_get_uptime_ns(); + const uint16_t nb_rx = rte_eth_rx_burst( + options.s_portid, options.s_rxqid, tx_bufs, MAX_SLAVES); + + if (nb_rx > 0) { + for (unsigned int i = 0; i < nb_rx; i++) { + struct pkt_hdr *each = check_valid_packet( + tx_bufs[i], &options.s_host_spec.mac_addr); + uint16_t type; + if (each == nullptr) { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "wait_for_slaves: ignoring invalid packet %p.\n", + (void *)tx_bufs[i]); + goto end_loop; + } + + type = rte_be_to_cpu_16(each->type); + + if (type == etype) { + bool invalid = true; + + // check if it is from one of our + // clients + for (auto eaddr : options.slaves) { + if (rte_is_same_ether_addr( + &eaddr->mac_addr, + &each->eth_hdr + .s_addr)) { + invalid = false; + break; + } + } + + if (invalid) { + // received invalid packet from + // unregistered slave + ntr(NTR_DEP_USER1, + NTR_LEVEL_WARNING, + "wait_for_slaves: invalid packet %p from unregistered slave\n.", + tx_bufs[i]); + goto end_loop; + } + + invalid = false; + // check if we have already received the + // same packet from the mac addr + for 
(auto eaddr : recved) { + if (rte_is_same_ether_addr( + eaddr, + &each->eth_hdr + .s_addr)) { + invalid = true; + break; + } + } + + if (invalid) { + // received invalid packet from + // the same slave + ntr(NTR_DEP_USER1, + NTR_LEVEL_WARNING, + "wait_for_slaves: invalid packet %p - duplicated\n.", + tx_bufs[i]); + goto end_loop; + } + + recved.push_back(&each->eth_hdr.s_addr); + + if (recved.size() == + options.slaves.size()) { + stop = true; + } + + if (out != nullptr) { + out[tot] = tx_bufs[i]; + tot++; + // don't free this packet + continue; + } + } else { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "wait_for_slaves: ignoring invalid packet %p type %d.\n", + (void *)tx_bufs[i], type); + } + end_loop: + rte_pktmbuf_free(tx_bufs[i]); + } + } + + if (now - start > options.rage_quit_time * MS2NS) { + rte_exit( + EXIT_FAILURE, "waiting for too long. I QUIT!!"); + } + } +} + +static void +pkt_loop() +{ + struct rte_mbuf *tx_buf; + struct rte_mbuf *rx_bufs[BURST_SIZE]; + struct pkt_hdr *pkt_data; + rdport_generator port_gen(MIN_RANDOM_PORT); + + bool read_tx = true; + bool recv_stat = true; + bool recv_resp = true; + + uint64_t next_ts = nm_get_uptime_ns(); + uint64_t last_ts = next_ts + options.rage_quit_time * MS2NS; + + if (rte_eth_dev_socket_id(options.s_portid) > 0 && + rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) { + ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, + "locore_main: WARNING, port %d is on remote NUMA node to " + "polling thread.\n\tPerformance will " + "not be optimal.\n", + options.s_portid); + } + + while (!options.s_stop.load()) { + uint64_t now = nm_get_uptime_ns(); + // always pop incoming packets + const uint16_t nb_rx = rte_eth_rx_burst( + options.s_portid, options.s_rxqid, rx_bufs, BURST_SIZE); + + if (nb_rx > 0) { + for (int i = 0; i < nb_rx; i++) { + struct pkt_hdr *each = check_valid_packet( + rx_bufs[i], &options.s_host_spec.mac_addr); + + if (each == nullptr) { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "locore_main: ignoring 
invalid packet %p.\n", + (void *)rx_bufs[i]); + rte_pktmbuf_free(rx_bufs[i]); + continue; + } + + uint16_t type = rte_be_to_cpu_16(each->type); + NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, each, + "locore_main: "); + struct pkt_payload_epoch *pld_epoch; + struct pkt_payload_stat *pld_stat; + uint32_t epoch; + switch (type) { + case PKT_TYPE_PROBE_RESP: + pld_epoch = (struct pkt_payload_epoch *) + each->payload; + epoch = rte_be_to_cpu_32( + pld_epoch->epoch); + + if (options.s_last_datapt == nullptr || + epoch != + options.s_last_datapt->epoch) { + ntr(NTR_DEP_USER1, + NTR_LEVEL_WARNING, + "locore_main: packet %p epoch %d doesn't match datapt %d.\n", + (void *)rx_bufs[i], epoch, + options.s_last_datapt + ->epoch); + break; + } + + options.s_total_pkts++; + + recv_resp = true; + break; + case PKT_TYPE_STAT: + pld_stat = (struct pkt_payload_stat *) + each->payload; + epoch = rte_be_to_cpu_32( + pld_stat->epoch); + + if (options.s_last_datapt == nullptr || + epoch != + options.s_last_datapt->epoch) { + ntr(NTR_DEP_USER1, + NTR_LEVEL_WARNING, + "locore_main: packet %p epoch %d doesn't match datapt %d.\n", + (void *)rx_bufs[i], epoch, + options.s_last_datapt + ->epoch); + break; + } + + options.s_last_datapt->srv_hw_tx = + rte_be_to_cpu_64(pld_stat->hw_tx); + options.s_last_datapt->srv_hw_rx = + rte_be_to_cpu_64(pld_stat->hw_rx); + options.s_last_datapt->srv_sw_tx = + rte_be_to_cpu_64(pld_stat->sw_tx); + options.s_last_datapt->srv_sw_rx = + rte_be_to_cpu_64(pld_stat->sw_rx); + + recv_stat = true; + break; + default: + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "locore_main: ignoring packet %p with unknown type %d.\n", + (void *)rx_bufs[i], type); + } + + rte_pktmbuf_free(rx_bufs[i]); + } + } + + if (read_tx && recv_stat & recv_resp) { + // if we have all the data + + if (options.s_last_datapt != nullptr) { + // push the data to the queue if we haven't done + // so already + options.s_data.push_back(options.s_last_datapt); + + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "locore_main: 
datapt for epoch %d dump:\n" + " Valid: %d\n" + " client TX HW: %lu\n" + " client TX SW: %lu\n" + " client RX HW: %lu\n" + " client RX SW: %lu\n" + " server TX HW: %lu\n" + " server TX SW: %lu\n" + " server RX HW: %lu\n" + " server RX SW: %lu\n\n", + options.s_last_datapt->epoch, + options.s_last_datapt->valid, + options.s_last_datapt->clt_hw_tx, + options.s_last_datapt->clt_sw_tx, + options.s_last_datapt->clt_hw_rx, + options.s_last_datapt->clt_sw_rx, + options.s_last_datapt->srv_hw_tx, + options.s_last_datapt->srv_sw_tx, + options.s_last_datapt->srv_hw_rx, + options.s_last_datapt->srv_sw_rx); + options.s_last_datapt = nullptr; + } + + if (now >= next_ts) { + struct pkt_payload_epoch *pld_epoch; + uint32_t epoch; + + next_ts += (int)(options.s_iagen->generate() * + S2NS); + + options.s_host_conn.src_port = port_gen.next(); + if (alloc_pkt_hdr(options.mbuf_pool, + PKT_TYPE_PROBE, &options.s_host_conn, + &tx_buf, &pkt_data) != 0) { + rte_exit(EXIT_FAILURE, + "failed to alloc probe packet.\n"); + } + + epoch = options.s_epoch; + options.s_epoch++; + pld_epoch = (struct pkt_payload_epoch *) + pkt_data->payload; + pld_epoch->epoch = rte_cpu_to_be_32(epoch); + options.s_last_datapt = new struct datapt; + options.s_last_datapt->epoch = epoch; + options.s_last_datapt->valid = + options.s_record.load(); + + read_tx = false; + recv_resp = false; + recv_stat = false; + last_ts = now; + + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "locore_main: sending packet %p with epoch %d\n", + (void *)tx_buf, epoch); + const uint16_t nb_tx = rte_eth_tx_burst( + options.s_portid, options.s_txqid, &tx_buf, + 1); + + if (nb_tx != 1) { + rte_exit(EXIT_FAILURE, + "failed to send packet 0x%p, epoch %d\n", + (void *)tx_buf, epoch); + } + } + } + + if (!recv_stat) { + // if we haven't recevied the stats get ready to rage + // quit + if (now - last_ts > options.rage_quit_time * MS2NS) { + rte_exit(EXIT_FAILURE, + "waiting too long for resp. 
I QUIT!!\n"); + } + } + + if (!read_tx) { + struct timespec ts { + }; + if (rte_eth_timesync_read_tx_timestamp( + options.s_portid, &ts) == 0) { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "locore_main: read hw tx timestamp %lu.\n", + (ts.tv_nsec + ts.tv_sec * S2NS)); + options.s_last_datapt->clt_hw_tx = ts.tv_nsec + + ts.tv_sec * S2NS; + read_tx = true; + } + } + } +} static int -locore_main(void * _unused __rte_unused) +locore_main(void *tif __rte_unused) { - struct rte_mbuf *tx_buf; - struct rte_mbuf *rx_bufs[BURST_SIZE]; - struct packet_data *pkt_data; - uint32_t core_id = rte_lcore_id(); - uint32_t epoch = 0; - int state = STATE_SEND; + struct rte_mbuf *mbufs[MAX_SLAVES]; + uint32_t core_id = rte_lcore_id(); - // XXX: check link status instead + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d running...\n", + core_id); - sleep(1); - if (rte_eth_dev_socket_id(options.s_portid) > 0 && rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) { - ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "locore_main: WARNING, port %d is on remote NUMA node to " - "polling thread.\n\tPerformance will " - "not be optimal.\n", options.s_portid); - } - - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d running...\n", core_id); - - tx_buf = rte_pktmbuf_alloc(options.s_mbuf_pool); - - if (tx_buf == NULL) { - rte_exit(EXIT_FAILURE, "cannot allocate tx_buf\n"); - } - - pkt_data = construct_udp_pkt_hdr(tx_buf, - &options.s_host_mac, &options.server_mac, - RTE_IPV4(192, 168, 100, 150), RTE_IPV4(192, 168, 100, 151), - 1337, 1337); - if (pkt_data == NULL) { - rte_exit(EXIT_FAILURE, "cannot allocate space for packet_data in mbuf\n"); - } - pkt_data->magic = rte_cpu_to_be_32(ETHER_FRAME_MAGIC); - - while(!options.s_stop.load()) { - // always pop incoming packets - const uint16_t nb_rx = rte_eth_rx_burst(options.s_portid, 0, rx_bufs, BURST_SIZE); - - if (nb_rx != 0) { - // only process packets when we are ready to receive - for (int i = 0; i < nb_rx; i++) { - struct packet_data 
* each = check_valid_packet(rx_bufs[i]); - - if (each == NULL) { - ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: ignoring invalid packet %p.\n", (void*)rx_bufs[i]); - dump_pkt(rx_bufs[i]); - rte_pktmbuf_free(rx_bufs[i]); - continue; - } - - if (rte_be_to_cpu_32(each->epoch) == epoch && state == STATE_RECV) { - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: received packet %p for epoch %d\n", (void*)rx_bufs[i], epoch); - - if (options.s_record.load()) { - // keep statistics - struct datapt * dpt = new datapt; - dpt->rtt = rte_be_to_cpu_64(each->clt_ts_rx) - rte_be_to_cpu_64(each->clt_ts_tx); - dpt->server_proc = rte_be_to_cpu_64(each->srv_ts_tx) - rte_be_to_cpu_64(each->srv_ts_rx); - options.s_stats.push_back(dpt); - } - - // bump the epoch and stop processing other packets - state = STATE_SEND; - epoch++; - } else { - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: ignoring packet 0x%p with invalid epoch %d.\n", (void*)rx_bufs[i], epoch); - } - - rte_pktmbuf_free(rx_bufs[i]); - } - } - - if (state == STATE_SEND) { - // set new epoch - pkt_data->epoch = rte_cpu_to_be_32(epoch); - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: sending packet %p with epoch %d\n", (void*)tx_buf, epoch); - - const uint16_t nb_tx = rte_eth_tx_burst(options.s_portid, 0, &tx_buf, 1); - - if (nb_tx < 1) { - rte_exit(EXIT_FAILURE, "failed to send packet 0x%p, epoch %d\n", (void*)tx_buf, epoch); - } - state = STATE_RECV; - } + if (options.master_mode == 1) { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "locore_main: sending SYNC ...\n"); + send_all_slaves(PKT_TYPE_SYNC); + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "locore_main: waiting for SYNC_ACK ...\n"); + wait_for_slaves(PKT_TYPE_SYNC_ACK, nullptr); } - - rte_pktmbuf_free(tx_buf); - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d successfully stopped.\n", core_id); + options.s_start_time.store(nm_get_uptime_ns()); + pkt_loop(); + options.s_end_time.store(nm_get_uptime_ns()); - return 0; + if (options.master_mode == 1) { + 
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "locore_main: sending FIN ...\n"); + send_all_slaves(PKT_TYPE_FIN); + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "locore_main: waiting for FIN_ACK ...\n"); + wait_for_slaves(PKT_TYPE_FIN_ACK, mbufs); + + // aggregate slave QPS + for (unsigned int i = 0; i < options.slaves.size(); i++) { + // these packets already underwent validity check in + // wait_for_slaves + auto pkt_hdr = rte_pktmbuf_mtod( + mbufs[i], struct pkt_hdr *); + auto pld_qps = (struct pkt_payload_qps *) + pkt_hdr->payload; + uint32_t qps = rte_be_to_cpu_32(pld_qps->qps); + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "locore_main: received qps %d from client %d\n", + qps, i); + options.s_slave_qps.fetch_add(qps); + rte_pktmbuf_free(mbufs[i]); + } + } + + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: exited\n"); + + return 0; } -static int +static int port_init(uint16_t portid, struct rte_mempool *mbuf_pool) { - struct rte_eth_dev_info dev_info; - struct rte_eth_conf port_conf = port_conf_default; - struct rte_eth_txconf txconf; - struct rte_eth_rxconf rxconf; + struct rte_eth_dev_info dev_info { + }; + struct rte_eth_conf port_conf = port_conf_default; + struct rte_eth_txconf txconf { + }; + struct rte_eth_rxconf rxconf { + }; - uint16_t nb_rxd = RX_RING_SIZE; - uint16_t nb_txd = TX_RING_SIZE; - port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN; + uint16_t nb_rxd = RX_RING_SIZE; + uint16_t nb_txd = TX_RING_SIZE; + port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN; - if(!rte_eth_dev_is_valid_port(portid)) { - return -1; - } + if (!rte_eth_dev_is_valid_port(portid)) { + return -1; + } - int ret = rte_eth_dev_info_get(portid, &dev_info); - if (ret != 0) { - return ret; - } + int ret = rte_eth_dev_info_get(portid, &dev_info); + if (ret != 0) { + return ret; + } - port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN; - port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM; - port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM; - port_conf.txmode.offloads |= 
DEV_TX_OFFLOAD_UDP_CKSUM; - port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM; - port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE; - - /* Configure the Ethernet device. */ - ret = rte_eth_dev_configure(portid, RX_RING_NUM, TX_RING_NUM, &port_conf); + port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN; + port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; + port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH; + port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM; + port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM; + port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM; + port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM; + port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE; + + /* Configure the Ethernet device. */ + ret = rte_eth_dev_configure(portid, 1, 1, &port_conf); if (ret != 0) return ret; @@ -236,209 +631,310 @@ port_init(uint16_t portid, struct rte_mempool *mbuf_pool) if (ret != 0) return ret; - /* Allocate and set up 1 RX queue per Ethernet port. */ - rxconf = dev_info.default_rxconf; - rxconf.offloads = port_conf.rxmode.offloads; - for (uint32_t i = 0; i < RX_RING_NUM; i++) { - ret = rte_eth_rx_queue_setup(portid, i, nb_rxd, rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool); + /* Allocate and set up 1 RX queue per thread . */ + rxconf = dev_info.default_rxconf; + rxconf.offloads = port_conf.rxmode.offloads; + for (uint32_t i = 0; i < 1; i++) { + ret = rte_eth_rx_queue_setup(portid, options.s_rxqid, nb_rxd, + rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool); if (ret < 0) return ret; } - txconf = dev_info.default_txconf; + txconf = dev_info.default_txconf; txconf.offloads = port_conf.txmode.offloads; /* Allocate and set up 1 TX queue per Ethernet port. 
*/ - for (uint32_t i = 0; i < TX_RING_NUM; i++) { - ret = rte_eth_tx_queue_setup(portid, i, nb_txd, rte_eth_dev_socket_id(portid), &txconf); + for (uint32_t i = 0; i < 1; i++) { + ret = rte_eth_tx_queue_setup(portid, options.s_txqid, nb_txd, + rte_eth_dev_socket_id(portid), &txconf); if (ret < 0) return ret; } - ret = rte_eth_dev_start(portid); - if (ret < 0) - return ret; + ret = rte_eth_dev_start(portid); + if (ret < 0) + return ret; /* Display the port MAC address. */ - struct rte_ether_addr addr; - ret = rte_eth_macaddr_get(portid, &addr); - if (ret != 0) - return ret; - - /* Enable RX in promiscuous mode for the Ethernet device. */ - ret = rte_eth_promiscuous_enable(portid); + struct rte_ether_addr addr { + }; + ret = rte_eth_macaddr_get(portid, &addr); if (ret != 0) return ret; - rte_eth_add_tx_callback(portid, 0, tx_add_timestamp, NULL); - rte_eth_add_rx_callback(portid, 0, rx_calc_latency, NULL); + ret = rte_eth_timesync_enable(portid); + if (ret != 0) + return ret; + + /* Enable RX in promiscuous mode for the Ethernet device. 
*/ + ret = rte_eth_promiscuous_enable(portid); + if (ret != 0) + return ret; + + rte_eth_add_tx_callback( + portid, options.s_rxqid, tx_add_timestamp, nullptr); + rte_eth_add_rx_callback( + portid, options.s_txqid, rx_add_timestamp, nullptr); return 0; } -static void dump_options() +static void +dump_options() { - fprintf(stdout, "Configuration:\n" \ - " run time = %d\n" \ - " warmup time = %d\n" \ - " output file = %s\n" \ - " server MAC = %x:%x:%x:%x:%x:%x\n", - options.run_time, - options.warmup_time, - options.output, - options.server_mac.addr_bytes[0], - options.server_mac.addr_bytes[1], - options.server_mac.addr_bytes[2], - options.server_mac.addr_bytes[3], - options.server_mac.addr_bytes[4], - options.server_mac.addr_bytes[5]); + fprintf(stdout, + "Configuration:\n" + " verbosity = +%d\n" + " run time = %d\n" + " warmup time = %d\n" + " output file = %s\n" + " rage quit time = %ld\n" + " cpu mask = 0x%lx\n" + " interarrival dist = %s\n" + " target qps = %d\n" + " host IP = 0x%x\n", + ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.run_time, + options.warmup_time, options.output, options.rage_quit_time, + options.cpu_mask, options.ia_gen_str, options.target_qps, + options.s_host_spec.ip); } -static void usage() +static void +usage() { - fprintf(stdout, - "Usage:\n " \ - " -v(vv): verbose mode\n" \ - " -h: display the information\n" \ - " -o: output filename\n" \ - " -t: run time\n" \ - " -T: warmup time\n" \ - " -s: server's mac\n\n" ); + fprintf(stdout, + "Usage:\n" + " -v(vv): verbose mode\n" + " -s: server net spec\n" + " -S: slave(rat)'s net spec (also turns on master mode)\n" + " -t: run time\n" + " -T: warmup time\n" + " -h: display the information\n" + " -o: output filename\n" + " -A: affinity mask\n" + " -i: inter-arrival time distribution\n" + " -r: rage quit time (in ms)\n" + " -q: target qps\n" + " -H: host net spec\n"); } -int main(int argc, char* argv[]) +int +main(int argc, char *argv[]) { - unsigned int nb_ports; - struct rte_mempool 
*mbuf_pool, *mbuf_pool_pkt; - std::ofstream log_file; + unsigned int nb_ports; + struct rte_mempool *mbuf_pool; + std::ofstream log_file; + bool has_host_spec = false; - // init dpdk - int ret = rte_eal_init(argc, argv); - if (ret < 0) { - rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n"); - } + ntr_init(); - argc -= ret; - argv += ret; + // init dpdk + int ret = rte_eal_init(argc, argv); + if (ret < 0) { + rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n"); + } - // set warning level - ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING); - { - int c; - // parse arguments - while((c = getopt(argc, argv, "hvo:t:T:s:")) != -1) { - switch (c) { - case 'v': - ntr_set_level(NTR_DEP_USER1, ntr_get_level(NTR_DEP_USER1) + 1); - break; - case 's': - if (rte_ether_unformat_addr(optarg, &options.server_mac) == -1) { - rte_exit(EXIT_FAILURE, "cannot parse %s as mac address.\n", optarg); - } - break; - case 't': - options.run_time = atoi(optarg); - break; - case 'T': - options.warmup_time = atoi(optarg); - break; - case 'h': - usage(); - rte_exit(EXIT_SUCCESS, NULL); - break; - case 'o': - strncpy(options.output, optarg, sizeof(options.output) - 1); - break; - default: - usage(); - rte_exit(EXIT_FAILURE, "unknown argument: %c\n", c); - break; - } - } - } + argc -= ret; + argv += ret; - // open log file for writing - log_file.open(options.output, std::ofstream::out); - if (!log_file) { - rte_exit(EXIT_FAILURE, "failed to open log file %s\n", options.output); - } + // set warning level + ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING); + { + int c; + // parse arguments + struct net_spec *ns; + while ( + (c = getopt(argc, argv, "vs:S:t:T:ho:A:i:r:q:H:")) != -1) { + switch (c) { + case 'v': + ntr_set_level(NTR_DEP_USER1, + ntr_get_level(NTR_DEP_USER1) + 1); + break; + case 's': + if (str_to_netspec( + optarg, &options.server_spec) != 0) { + rte_exit(EXIT_FAILURE, + "invalid server net spec.\n"); + } + break; + case 'S': + ns = new struct net_spec; + if (str_to_netspec( + optarg, 
&options.server_spec) != 0) { + rte_exit(EXIT_FAILURE, + "invalid client net spec\n"); + } + options.slaves.push_back(ns); + options.master_mode = 1; + if (options.slaves.size() > MAX_SLAVES) { + rte_exit( + EXIT_FAILURE, "too many rats.\n"); + } + break; + case 't': + options.run_time = strtol(optarg, nullptr, 10); + break; + case 'T': + options.warmup_time = strtol( + optarg, nullptr, 10); + break; + case 'h': + usage(); + rte_exit(EXIT_SUCCESS, "\n"); + case 'o': + strncpy(options.output, optarg, + sizeof(options.output) - 1); + break; + case 'A': + options.cpu_mask = strtoull( + optarg, nullptr, 16); + break; + case 'i': + strncpy(options.ia_gen_str, optarg, + sizeof(options.ia_gen_str) - 1); + break; + case 'r': + options.rage_quit_time = strtoul( + optarg, nullptr, 10); + break; + case 'q': + options.target_qps = strtoul( + optarg, nullptr, 10); + break; + case 'H': + has_host_spec = true; + if (str_to_netspec( + optarg, &options.s_host_spec) != 0) { + rte_exit(EXIT_FAILURE, + "invalid host net spec.\n"); + } + break; + default: + usage(); + rte_exit( + EXIT_FAILURE, "unknown argument: %c\n", c); + } + } + } - nb_ports = rte_eth_dev_count_avail(); - if (nb_ports == 0) { - rte_exit(EXIT_FAILURE, "number of ports must be > 0\n"); - } + if (!has_host_spec) { + rte_exit(EXIT_FAILURE, "must specify host IP\n"); + } - // create a mbuf memory pool on the socket - mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id()); - if (mbuf_pool == nullptr) { - rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n"); - } + dump_options(); - mbuf_pool_pkt = rte_pktmbuf_pool_create("MBUF_POOL_PKT", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id()); - if (mbuf_pool_pkt == nullptr) { - rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n"); - } - options.s_mbuf_pool = mbuf_pool_pkt; + // init nm + if (nm_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) 
{ + rte_exit(EXIT_FAILURE, "nm init failed!\n"); + } - uint16_t portid = rte_eth_find_next(0); - if (portid == RTE_MAX_ETHPORTS) { - rte_exit(EXIT_FAILURE, "cannot find an available port\n"); - } - options.s_portid = portid; + // create default generator + options.s_iagen = createGenerator(options.ia_gen_str); + if (options.s_iagen == nullptr) { + rte_exit(EXIT_FAILURE, "invalid generator string %s\n", + options.ia_gen_str); + } + options.s_iagen->set_lambda((double)options.target_qps); - if (port_init(portid, mbuf_pool) != 0) { - rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid); - } + // open log file for writing + log_file.open(options.output, std::ofstream::out); + if (!log_file) { + rte_exit(EXIT_FAILURE, "failed to open log file %s\n", + options.output); + } - if (rte_eth_macaddr_get(portid, &options.s_host_mac) != 0) { - rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n", portid); - } + nb_ports = rte_eth_dev_count_avail(); + if (nb_ports == 0) { + rte_exit(EXIT_FAILURE, "number of ports must be > 0\n"); + } - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid, - options.s_host_mac.addr_bytes[0], - options.s_host_mac.addr_bytes[1], - options.s_host_mac.addr_bytes[2], - options.s_host_mac.addr_bytes[3], - options.s_host_mac.addr_bytes[4], - options.s_host_mac.addr_bytes[5]); + uint16_t portid = rte_eth_find_next(0); + if (portid == RTE_MAX_ETHPORTS) { + rte_exit(EXIT_FAILURE, "cannot find an available port\n"); + } + options.s_portid = portid; - dump_options(); + if (rte_eth_macaddr_get(portid, &options.s_host_spec.mac_addr) != 0) { + rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n", + portid); + } - uint16_t core_id = rte_get_next_lcore(0, true, false); - if (rte_eal_remote_launch(locore_main, NULL, core_id) != 0) { - rte_exit(EXIT_FAILURE, "failed to launch function on locore\n"); - } + // create a mbuf memory pool on the socket + mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", 
MBUF_MAX_COUNT, + MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, + rte_eth_dev_socket_id(options.s_portid)); + if (mbuf_pool == nullptr) { + rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n"); + } + options.mbuf_pool = mbuf_pool; - // poor man's timer - // XXX: use kqueue instead - struct timespec ts; - ts.tv_sec = 1; - ts.tv_nsec = 0; - uint32_t second = 0; - while(true) { - if (second >= options.warmup_time) { - options.s_record.store(true); - } - if (second >= options.run_time + options.warmup_time) { - options.s_stop.store(true); - break; - } - clock_nanosleep(CLOCK_REALTIME, 0, &ts, NULL); - second++; - } + if (port_init(portid, mbuf_pool) != 0) { + rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid); + } - if (rte_eal_wait_lcore(core_id) < 0) - rte_exit(EXIT_FAILURE, "failed to wait for job completion\n"); + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid, + options.s_host_spec.mac_addr.addr_bytes[0], + options.s_host_spec.mac_addr.addr_bytes[1], + options.s_host_spec.mac_addr.addr_bytes[2], + options.s_host_spec.mac_addr.addr_bytes[3], + options.s_host_spec.mac_addr.addr_bytes[4], + options.s_host_spec.mac_addr.addr_bytes[5]); + uint64_t cmask = options.cpu_mask; + const int16_t core_id = cmask_get_next_cpu(&cmask); + if (core_id == NEXT_CPU_NULL) { + rte_exit(EXIT_FAILURE, "invalid cpu mask 0x%lx\n", cmask); + } - // dump stats - for (auto it = std::begin(options.s_stats); it != std::end(options.s_stats); ++it) { - log_file << (*it)->rtt << "," << (*it)->server_proc << std::endl; - delete *it; - } - log_file.close(); + sleep(INIT_DELAY); + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + "main: launching thread on core %d\n", core_id); + if (rte_eal_remote_launch(locore_main, nullptr, core_id) != 0) { + rte_exit(EXIT_FAILURE, "failed to launch function on locore\n"); + } - // clean up - rte_eth_dev_stop(portid); - rte_eth_dev_close(portid); + // XXX: poor man's timer + uint32_t second = 0; + while (true) { + if 
(second >= options.warmup_time) { + options.s_record.store(1); + } + if (second >= options.run_time + options.warmup_time) { + options.s_stop.store(true); + break; + } + usleep(S2US); + second++; + } - return 0; + if (rte_eal_wait_lcore(core_id) < 0) + rte_exit(EXIT_FAILURE, "failed to wait for job completion\n"); + + // calculate QPS + uint32_t qps = (uint32_t)((double)options.s_total_pkts) / + (((double)(options.s_end_time.load() - + options.s_start_time.load()) / + (double)S2NS)); + qps += options.s_slave_qps.load(); + + // dump stats + for (auto it : options.s_data) { + if (it->valid) { + log_file << it->clt_sw_rx << ',' << it->clt_sw_tx << ',' + << it->clt_hw_rx << ',' << it->clt_hw_tx << ',' + << it->srv_sw_rx << ',' << it->srv_sw_tx << ',' + << it->srv_hw_rx << ',' << it->srv_hw_tx + << std::endl; + } + } + log_file.close(); + + fprintf(stdout, + "Processed %d packets in %d seconds. Total QPS (incl. slaves): %d\n", + qps, options.run_time, qps); + + // clean up + rte_eth_dev_stop(portid); + + return 0; } \ No newline at end of file diff --git a/compile_flags.txt b/compile_flags.txt index 03babec..53b2ac7 100644 --- a/compile_flags.txt +++ b/compile_flags.txt @@ -2,8 +2,12 @@ -O2 -std=c++11 -Wall +-Wextra -Werror --Wpedantic -I/usr/include/dpdk -Iinc --Wno-deprecated-declarations \ No newline at end of file +-Wno-deprecated-declarations +-Wno-packed-not-aligned +-Wno-address-of-packed-member +-Wno-zero-length-array +-Wno-gnu-zero-variadic-macro-arguments \ No newline at end of file diff --git a/inc/gen.h b/inc/gen.h new file mode 100644 index 0000000..0cfa52b --- /dev/null +++ b/inc/gen.h @@ -0,0 +1,296 @@ +// modified from mutilate +// -*- c++ -*- + +// 1. implement "fixed" generator +// 2. implement discrete generator +// 3. implement combine generator? + +#pragma once + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "util.h" + +#include +#include +#include + +#define D(fmt, ...) 
+#define DIE(fmt, ...) (void)0; + +#define FNV_64_PRIME (0x100000001b3ULL) +#define FNV1_64_INIT (0xcbf29ce484222325ULL) +static inline uint64_t +fnv_64_buf(const void *buf, size_t len) +{ + uint64_t hval = FNV1_64_INIT; + + unsigned char *bp = (unsigned char *)buf; /* start of buffer */ + unsigned char *be = bp + len; /* beyond end of buffer */ + + while (bp < be) { + hval ^= (uint64_t)*bp++; + hval *= FNV_64_PRIME; + } + + return hval; +} + +static inline uint64_t +fnv_64(uint64_t in) +{ + return fnv_64_buf(&in, sizeof(in)); +} + +// Generator syntax: +// +// \d+ == fixed +// n[ormal]:mean,sd +// e[xponential]:lambda +// p[areto]:scale,shape +// g[ev]:loc,scale,shape +// fb_value, fb_key, fb_rate + +class Generator { + public: + Generator() { } + // Generator(const Generator &g) = delete; + // virtual Generator& operator=(const Generator &g) = delete; + virtual ~Generator() { } + + virtual double generate(double U = -1.0) = 0; + virtual void set_lambda(double) { DIE("set_lambda() not implemented"); } + + protected: + std::string type; +}; + +class Fixed : public Generator { + public: + Fixed(double _value = 1.0) + : value(_value) + { + D("Fixed(%f)", value); + } + virtual double generate(double) { return value; } + virtual void set_lambda(double lambda) + { + if (lambda > 0.0) + value = 1.0 / lambda; + else + value = 0.0; + } + + private: + double value; +}; + +class Uniform : public Generator { + public: + Uniform(double _scale) + : scale(_scale) + { + D("Uniform(%f)", scale); + } + + virtual double generate(double U = -1.0) + { + if (U < 0.0) + U = drand48(); + return scale * U; + } + + virtual void set_lambda(double lambda) + { + if (lambda > 0.0) + scale = 2.0 / lambda; + else + scale = 0.0; + } + + private: + double scale; +}; + +class Normal : public Generator { + public: + Normal(double _mean = 1.0, double _sd = 1.0) + : mean(_mean) + , sd(_sd) + { + D("Normal(mean=%f, sd=%f)", mean, sd); + } + + virtual double generate(double U = -1.0) + { + if (U < 0.0) 
+ U = drand48(); + double V = U; // drand48(); + double N = sqrt(-2 * log(U)) * cos(2 * M_PI * V); + return mean + sd * N; + } + + virtual void set_lambda(double lambda) + { + if (lambda > 0.0) + mean = 1.0 / lambda; + else + mean = 0.0; + } + + private: + double mean, sd; +}; + +class Exponential : public Generator { + public: + Exponential(double _lambda = 1.0) + : lambda(_lambda) + { + D("Exponential(lambda=%f)", lambda); + } + + virtual double generate(double U = -1.0) + { + if (lambda <= 0.0) + return 0.0; + if (U < 0.0) + U = drand48(); + return -log(U) / lambda; + } + + virtual void set_lambda(double lambda) { this->lambda = lambda; } + + private: + double lambda; +}; + +class GPareto : public Generator { + public: + GPareto(double _loc = 0.0, double _scale = 1.0, double _shape = 1.0) + : loc(_loc) + , scale(_scale) + , shape(_shape) + { + assert(shape != 0.0); + D("GPareto(loc=%f, scale=%f, shape=%f)", loc, scale, shape); + } + + virtual double generate(double U = -1.0) + { + if (U < 0.0) + U = drand48(); + return loc + scale * (pow(U, -shape) - 1) / shape; + } + + virtual void set_lambda(double lambda) + { + if (lambda <= 0.0) + scale = 0.0; + else + scale = (1 - shape) / lambda - (1 - shape) * loc; + } + + private: + double loc /* mu */; + double scale /* sigma */, shape /* k */; +}; + +class GEV : public Generator { + public: + GEV(double _loc = 0.0, double _scale = 1.0, double _shape = 1.0) + : e(1.0) + , loc(_loc) + , scale(_scale) + , shape(_shape) + { + assert(shape != 0.0); + D("GEV(loc=%f, scale=%f, shape=%f)", loc, scale, shape); + } + + virtual double generate(double U = -1.0) + { + return loc + scale * (pow(e.generate(U), -shape) - 1) / shape; + } + + private: + Exponential e; + double loc /* mu */, scale /* sigma */, shape /* k */; +}; + +class Discrete : public Generator { + public: + ~Discrete() { delete def; } + Discrete(Generator *_def = NULL) + : def(_def) + { + if (def == NULL) + def = new Fixed(0.0); + } + + virtual double 
generate(double U = -1.0) + { + double Uc = U; + if (pv.size() > 0 && U < 0.0) + U = drand48(); + + double sum = 0; + + for (auto p : pv) { + sum += p.first; + if (U < sum) + return p.second; + } + + return def->generate(Uc); + } + + void add(double p, double v) + { + pv.push_back(std::pair(p, v)); + } + + private: + Generator *def; + std::vector> pv; +}; + +class KeyGenerator { + public: + KeyGenerator(Generator *_g, double _max = 10000) + : g(_g) + , max(_max) + { + } + std::string generate(uint64_t ind) + { + uint64_t h = fnv_64(ind); + double U = (double)h / (double)ULLONG_MAX; + double G = g->generate(U); + int keylen = MAX(round(G), floor(log10(max)) + 1); + char key[256]; + snprintf(key, 256, "%0*" PRIu64, keylen, ind); + + // D("%d = %s", ind, key); + return std::string(key); + } + + private: + Generator *g; + double max; +}; + +Generator *createGenerator(std::string str); +Generator *createFacebookKey(); +Generator *createFacebookValue(); +Generator *createFacebookIA(); diff --git a/inc/nm.h b/inc/nm.h new file mode 100644 index 0000000..4d24578 --- /dev/null +++ b/inc/nm.h @@ -0,0 +1,20 @@ +#pragma once + +#include +#include + +constexpr static int NM_LEVEL_NUMA = 0; +constexpr static int NM_LEVEL_CPU = 1; +constexpr static int NM_LEVEL_CORE = 2; + +std::vector *nm_get_nodes(); +std::vector *nm_get_cpus(); +std::vector *nm_get_cores(); + +// 0 on success +// -1 on error +int nm_init(int verbosity); + +uint64_t nm_tsc2ns(uint64_t tsc); + +uint64_t nm_get_uptime_ns(); diff --git a/inc/ntr.h b/inc/ntr.h new file mode 100644 index 0000000..d480544 --- /dev/null +++ b/inc/ntr.h @@ -0,0 +1,38 @@ +#pragma once + +#include +#include + +#define NTR_LEVEL_NONE (0) +#define NTR_LEVEL_ERROR (1) +#define NTR_LEVEL_WARNING (2) +#define NTR_LEVEL_INFO (3) +#define NTR_LEVEL_DEBUG (4) +#define NTR_LEVEL_DEFAULT (NTR_LEVEL_WARNING) + +#define NTR_DEP_NTR (0) +#define NTR_DEP_USER1 (1) +#define NTR_DEP_USER2 (2) +#define NTR_DEP_USER3 (3) +#define NTR_DEP_USER4 (4) 
+#define NTR_DEP_USER5 (5) +#define NTR_DEP_MAX (NTR_DEP_USER5 + 1) + +#ifdef __cplusplus +extern "C" { +#endif + +void ntr_init(); + +__attribute__((format(printf, 3, 4))) void ntr( + int dep, int level, const char *fmt, ...); + +void ntr_set_level(int dep, int level); + +void ntr_set_output(FILE *f); + +int ntr_get_level(int dep); + +#ifdef __cplusplus +} +#endif diff --git a/inc/ntrlog.h b/inc/ntrlog.h deleted file mode 100644 index b2760aa..0000000 --- a/inc/ntrlog.h +++ /dev/null @@ -1,61 +0,0 @@ -#pragma once - -#include - -#define NTR_LEVEL_NONE (0) -#define NTR_LEVEL_ERROR (1) -#define NTR_LEVEL_WARNING (2) -#define NTR_LEVEL_INFO (3) -#define NTR_LEVEL_DEBUG (4) -#define NTR_LEVEL_DEFAULT (NTR_LEVEL_WARNING) - -#define NTR_DEP_NTR (0) -#define NTR_DEP_USER1 (1) -#define NTR_DEP_USER2 (2) -#define NTR_DEP_USER3 (3) -#define NTR_DEP_USER4 (4) -#define NTR_DEP_USER5 (5) -#define NTR_DEP_MAX (NTR_DEP_USER5 + 1) - -#define NTR_DECL_IMPL \ -int ntr_log_levels[NTR_DEP_MAX] = {NTR_LEVEL_DEFAULT}; \ -FILE * ntr_out = stdout - -extern int ntr_log_levels[]; -extern FILE * ntr_out; - -static inline -void ntr(int dep, int level, const char * fmt, ...) 
-{ - va_list vl; - va_start(vl, fmt); - if (dep < NTR_DEP_MAX && level <= ntr_log_levels[dep]) { - vfprintf(ntr_out, fmt, vl); - } - va_end(vl); -} - -static inline -void ntr_set_level(int dep, int level) -{ - if (dep < NTR_DEP_MAX) { - ntr_log_levels[dep] = level; - } -} - -static inline -void ntr_set_output(FILE * f) -{ - if (f != NULL) { - ntr_out = f; - } -} - -static inline -int ntr_get_level(int dep) -{ - if (dep < NTR_DEP_MAX) { - return ntr_log_levels[dep]; - } - return 0; -} diff --git a/inc/pkt.h b/inc/pkt.h index fe4d08e..bffd501 100644 --- a/inc/pkt.h +++ b/inc/pkt.h @@ -1,16 +1,19 @@ #pragma once -#include -#include -#include #include -#include -#include -#include #include -#include +#include +#include +#include +#include #include -#include +#include +#include + +#include "nm.h" +#include "util.h" + +#include #define IP_DEFTTL 64 /* from RFC 1340. */ #define IP_VERSION 0x40 @@ -19,59 +22,247 @@ #define IP_ADDR_FMT_SIZE 15 constexpr static uint32_t ETHER_FRAME_MAGIC = 0xDCDCE5E5; +const static struct rte_ether_addr POU_MAC { + 0x01, 0x00, 0x5e, 0x00, 0x01, 0x81 +}; +const static uint32_t POU_IP = RTE_IPV4(224, 0, 1, 129); +const static uint16_t POU_PORT = 319; +/* Khat Protocol: + * khat only processes two kinds of packets - LOAD and PROBE + * rat: + * rat -> LOAD -> khat + * khat -> LOAD_RESP -> rat + * cat: + * cat -> PROBE -> khat (cat tx timestamps) + * khat -> PROBE_RESP -> cat (cat rx timestamps and khat tx/rx + * timestamps) khat -> STAT -> cat (khat sends its tx/rx timestamps) + */ -struct packet_hdr { - struct rte_ether_hdr eth_hdr; - struct rte_ipv4_hdr ipv4_hdr; - struct rte_udp_hdr udp_hdr; +/* Rat Protocol: + * cat & rat: + * 1. both launch with full parameters + * rat with slave flag + * cat with master flag + * 2. rats create threads and wait for cat's signal + * 3. cat creates threads + * 4. cat -> rats SYNC + * 5. rats -> cat SYNC_ACK and start running + * 6. cat start running after received all SYNC_ACKs + * 7. 
cat stops running, cat -> rats FIN + * 8. rats stops running, rats -> cat FIN_ACK with QPS + * 9. cat exits after receiving all FIN_ACKs and flushing statsGG + */ + +struct ptp_hdr { + uint8_t ptp_msg_type; + uint8_t ptp_ver; + uint8_t unused[34]; } __attribute__((packed)); -struct packet_data -{ - struct packet_hdr pkt_hdr; - uint32_t magic; - uint32_t epoch; - uint64_t clt_ts_tx; - uint64_t clt_ts_rx; - uint64_t srv_ts_tx; - uint64_t srv_ts_rx; +struct pkt_hdr { + struct rte_ether_hdr eth_hdr; + struct rte_ipv4_hdr ipv4_hdr; + struct rte_udp_hdr udp_hdr; + struct ptp_hdr ptp_hdr; + uint16_t type; + uint32_t magic; + char payload[0]; +} __attribute__((packed)); + +struct net_spec { + uint32_t ip; + rte_ether_addr mac_addr; }; static inline void -print_mac(struct rte_ether_addr * mac) +pkt_hdr_to_netspec(struct pkt_hdr *pkt, struct net_spec *src, + uint16_t *src_port, struct net_spec *dst, uint16_t *dst_port) { - printf("%x:%x:%x:%x:%x:%x", mac->addr_bytes[0], - mac->addr_bytes[1], - mac->addr_bytes[2], - mac->addr_bytes[3], - mac->addr_bytes[4], - mac->addr_bytes[5]); + if (src != nullptr) { + rte_ether_addr_copy(&pkt->eth_hdr.s_addr, &src->mac_addr); + src->ip = rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr); + } + + if (src_port != nullptr) { + *src_port = rte_be_to_cpu_16(pkt->udp_hdr.src_port); + } + + if (dst != nullptr) { + rte_ether_addr_copy(&pkt->eth_hdr.d_addr, &dst->mac_addr); + dst->ip = rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr); + } + + if (dst_port != nullptr) { + *dst_port = rte_be_to_cpu_16(pkt->udp_hdr.dst_port); + } +}; + +struct conn_spec { + struct net_spec *src; + uint16_t src_port; + struct net_spec *dst; + uint16_t dst_port; +}; + +// returns 0 on success +static inline int +str_to_netspec(char *str, struct net_spec *out) +{ + const char *tok = "@"; + char *token; + char *ptr; + uint32_t a, b, c, d; + + token = strtok_r(str, tok, &ptr); + + if (token == nullptr || + sscanf(token, "%d.%d.%d.%d", &a, &b, &c, &d) != 4) { + return -1; + } + + out->ip = 
RTE_IPV4(a, b, c, d); + + // mac next + token = strtok_r(nullptr, tok, &ptr); + if (token == nullptr || + rte_ether_unformat_addr(token, &out->mac_addr) != 0) { + return -1; + } + + return 0; +} + +constexpr static uint16_t PKT_TYPE_LOAD = 0; +struct pkt_payload_load { + uint32_t epoch; + uint32_t load; +}; + +constexpr static uint16_t PKT_TYPE_PROBE = 1; +constexpr static uint16_t PKT_TYPE_LOAD_RESP = 2; +constexpr static uint16_t PKT_TYPE_PROBE_RESP = 3; +struct pkt_payload_epoch { + uint32_t epoch; +}; + +constexpr static uint16_t PKT_TYPE_STAT = 4; +struct pkt_payload_stat { + uint32_t epoch; + uint64_t hw_rx; + uint64_t hw_tx; + uint64_t sw_rx; + uint64_t sw_tx; +}; + +constexpr static uint16_t PKT_TYPE_SYNC = 5; +constexpr static uint16_t PKT_TYPE_SYNC_ACK = 6; +constexpr static uint16_t PKT_TYPE_FIN = 7; +constexpr static uint16_t PKT_TYPE_FIN_ACK = 8; +struct pkt_payload_qps { + uint32_t qps; +}; + +constexpr static uint16_t NUM_PKT_TYPES = PKT_TYPE_FIN_ACK + 1; +// for fast packet verification +static const uint32_t expected_payload_size[NUM_PKT_TYPES] { + sizeof(struct pkt_payload_load), // LOAD + sizeof(struct pkt_payload_epoch), // PROBE + sizeof(struct pkt_payload_epoch), // LOAD_RESP + sizeof(struct pkt_payload_epoch), // PROBE_RESP + sizeof(struct pkt_payload_stat), // STAT + 0, // SYNC + 0, // SYNC_ACK + 0, // FIN + sizeof(struct pkt_payload_qps) // FIN_ACK +}; + +class rdport_generator { + private: + DISALLOW_EVIL_CONSTRUCTORS(rdport_generator); + constexpr static uint32_t MAX_PORT = 65535; + uint32_t min_port; + uint32_t cur; + std::random_device rd; + std::default_random_engine gen; + std::uniform_int_distribution dist; + + public: + rdport_generator(uint32_t mport) + : min_port(mport) + , cur(0) + , dist(0, MAX_PORT - min_port) + { + gen.seed(nm_get_uptime_ns()); + cur = dist(gen); + } + uint16_t next() + { + uint16_t ret = ((cur) % (MAX_PORT - min_port)) + min_port; + cur++; + return ret; + } +}; + +#define NTR_PKT(dep, level, pkt, prefix_fmt, 
...) \ + ntr(dep, level, \ + prefix_fmt \ + "src: %d.%d.%d.%d:%d@%02x:%02x:%02x:%02x:%02x:%02x dst: %d.%d.%d.%d:%d@%02x:%02x:%02x:%02x:%02x:%02x type: %d\n", \ + ##__VA_ARGS__, \ + (rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 24) & 0xff, \ + (rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 16) & 0xff, \ + (rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 8) & 0xff, \ + (rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 0) & 0xff, \ + rte_be_to_cpu_16(pkt->udp_hdr.src_port), \ + pkt->eth_hdr.s_addr.addr_bytes[0], \ + pkt->eth_hdr.s_addr.addr_bytes[1], \ + pkt->eth_hdr.s_addr.addr_bytes[2], \ + pkt->eth_hdr.s_addr.addr_bytes[3], \ + pkt->eth_hdr.s_addr.addr_bytes[4], \ + pkt->eth_hdr.s_addr.addr_bytes[5], \ + (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 24) & 0xff, \ + (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 16) & 0xff, \ + (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 8) & 0xff, \ + (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 0) & 0xff, \ + rte_be_to_cpu_16(pkt->udp_hdr.dst_port), \ + pkt->eth_hdr.d_addr.addr_bytes[0], \ + pkt->eth_hdr.d_addr.addr_bytes[1], \ + pkt->eth_hdr.d_addr.addr_bytes[2], \ + pkt->eth_hdr.d_addr.addr_bytes[3], \ + pkt->eth_hdr.d_addr.addr_bytes[4], \ + pkt->eth_hdr.d_addr.addr_bytes[5], rte_be_to_cpu_16(pkt->type)) + +static inline void +print_mac(struct rte_ether_addr *mac) +{ + printf("%x:%x:%x:%x:%x:%x", mac->addr_bytes[0], mac->addr_bytes[1], + mac->addr_bytes[2], mac->addr_bytes[3], mac->addr_bytes[4], + mac->addr_bytes[5]); } static inline void print_ipv4(uint32_t ip) { - printf("%d-%d-%d-%d", (ip >> 24) & 0xff, - (ip >> 16) & 0xff, - (ip >> 8) & 0xff, - (ip >> 0) & 0xff); + printf("%d-%d-%d-%d", (ip >> 24) & 0xff, (ip >> 16) & 0xff, + (ip >> 8) & 0xff, (ip >> 0) & 0xff); } static inline void dump_pkt(struct rte_mbuf *pkt) { - if(rte_pktmbuf_data_len(pkt) < sizeof(struct rte_ether_hdr)) { + if (rte_pktmbuf_data_len(pkt) < sizeof(struct rte_ether_hdr)) { return; } struct rte_ether_hdr _eth_hdr; - struct rte_ether_hdr * eth_hdr = (struct 
rte_ether_hdr *)rte_pktmbuf_read(pkt, 0, sizeof(struct rte_ether_hdr), &_eth_hdr); - if (eth_hdr == NULL) { + auto eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_read( + pkt, 0, sizeof(struct rte_ether_hdr), &_eth_hdr); + if (eth_hdr == nullptr) { return; } // ethernet frame - printf("Packet %p: Length 0x%x\n", (void*)pkt, rte_pktmbuf_data_len(pkt)); + printf( + "Packet %p: Length 0x%x\n", (void *)pkt, rte_pktmbuf_data_len(pkt)); printf(" Ethernet header:\n"); printf(" Src:"); print_mac(ð_hdr->s_addr); @@ -86,12 +277,13 @@ dump_pkt(struct rte_mbuf *pkt) return; } - if(rte_pktmbuf_data_len(pkt) < sizeof(struct rte_ether_hdr) + sizeof(struct rte_ipv4_hdr)) { + if (rte_pktmbuf_data_len(pkt) < + sizeof(struct rte_ether_hdr) + sizeof(struct rte_ipv4_hdr)) { return; } // dump ip header - struct rte_ipv4_hdr * ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1); + auto ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1); printf(" IPv4 header:\n"); printf(" Src:"); print_ipv4(rte_be_to_cpu_32(ipv4_hdr->src_addr)); @@ -100,76 +292,167 @@ dump_pkt(struct rte_mbuf *pkt) print_ipv4(rte_be_to_cpu_32(ipv4_hdr->dst_addr)); printf("\n"); printf(" Protocol: 0x%x\n", ipv4_hdr->next_proto_id); - } -static inline -struct packet_data * construct_udp_pkt_hdr(struct rte_mbuf * buf, - struct rte_ether_addr * src_mac, struct rte_ether_addr * dst_mac, - uint32_t src_ip, uint32_t dst_ip, uint16_t src_port, uint16_t dst_port) +static inline bool +is_l2ts_pkt(uint16_t type) { - rte_pktmbuf_reset(buf); - - struct packet_data * pkt_data = (struct packet_data *)rte_pktmbuf_append(buf, sizeof(struct packet_data)); - struct rte_ether_hdr * eth_hdr; - struct rte_ipv4_hdr * ipv4_hdr; - struct rte_udp_hdr * udp_hdr; - - if (pkt_data == NULL) - return NULL; - - // single segment - buf->nb_segs = 1; - - // construct l2 header - eth_hdr = &pkt_data->pkt_hdr.eth_hdr; - rte_ether_addr_copy(src_mac, ð_hdr->s_addr); - rte_ether_addr_copy(dst_mac, ð_hdr->d_addr); - eth_hdr->ether_type = 
rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4); - buf->l2_len = sizeof(struct rte_ether_hdr); - - // construct l3 header - ipv4_hdr = &pkt_data->pkt_hdr.ipv4_hdr; - memset(ipv4_hdr, 0, sizeof(struct rte_ipv4_hdr)); - ipv4_hdr->version_ihl = IP_VHL_DEF; - ipv4_hdr->type_of_service = 0; - ipv4_hdr->fragment_offset = 0; - ipv4_hdr->time_to_live = IP_DEFTTL; - ipv4_hdr->next_proto_id = IPPROTO_UDP; - ipv4_hdr->packet_id = 0; - ipv4_hdr->src_addr = rte_cpu_to_be_32(src_ip); - ipv4_hdr->dst_addr = rte_cpu_to_be_32(dst_ip); - ipv4_hdr->total_length = rte_cpu_to_be_16(sizeof(struct packet_data) - sizeof(struct rte_ether_hdr)); - ipv4_hdr->hdr_checksum = 0; - buf->l3_len = sizeof(struct rte_ipv4_hdr); - - // construct l4 header - udp_hdr = &pkt_data->pkt_hdr.udp_hdr; - udp_hdr->src_port = rte_cpu_to_be_16(src_port); - udp_hdr->dst_port = rte_cpu_to_be_16(dst_port); - udp_hdr->dgram_cksum = 0; /* No UDP checksum. */ - udp_hdr->dgram_len = rte_cpu_to_be_16(sizeof(struct packet_data) - - sizeof(struct rte_ether_hdr) - - sizeof(struct rte_udp_hdr)); - buf->l4_len = sizeof(struct rte_udp_hdr); - - return pkt_data; + return type == PKT_TYPE_PROBE || type == PKT_TYPE_PROBE_RESP; } -static inline -struct packet_data * check_valid_packet(struct rte_mbuf * pkt) +// fills the packet with the information except for the payload itself +static inline struct pkt_hdr * +construct_pkt_hdr( + struct rte_mbuf *buf, uint16_t type, const struct conn_spec *conn) { - struct packet_data * pkt_data = NULL; + rte_pktmbuf_reset(buf); - if (rte_pktmbuf_data_len(pkt) < sizeof(struct packet_data)) { - return NULL; - } + const uint32_t total_sz = sizeof(struct pkt_hdr) + + expected_payload_size[type]; + auto pkt_data = (struct pkt_hdr *)rte_pktmbuf_append(buf, total_sz); + struct rte_ether_hdr *eth_hdr; + struct rte_ipv4_hdr *ipv4_hdr; + struct rte_udp_hdr *udp_hdr; + bool is_ts_pkt = is_l2ts_pkt(type); - pkt_data = rte_pktmbuf_mtod(pkt, struct packet_data *); + if (pkt_data == nullptr) + return nullptr; - if 
(rte_be_to_cpu_32(pkt_data->magic) == ETHER_FRAME_MAGIC) { - return pkt_data; - } + // single segment + buf->nb_segs = 1; - return NULL; + // construct l2 header + eth_hdr = &pkt_data->eth_hdr; + rte_ether_addr_copy(&conn->src->mac_addr, ð_hdr->s_addr); + if (is_ts_pkt) { + rte_ether_addr_copy(&POU_MAC, ð_hdr->d_addr); + } else { + rte_ether_addr_copy(&conn->dst->mac_addr, ð_hdr->d_addr); + } + eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4); + buf->l2_len = sizeof(struct rte_ether_hdr); + + // construct l3 header + ipv4_hdr = &pkt_data->ipv4_hdr; + memset(ipv4_hdr, 0, sizeof(struct rte_ipv4_hdr)); + ipv4_hdr->version_ihl = IP_VHL_DEF; + ipv4_hdr->type_of_service = 0; + ipv4_hdr->fragment_offset = 0; + ipv4_hdr->time_to_live = IP_DEFTTL; + ipv4_hdr->next_proto_id = IPPROTO_UDP; + ipv4_hdr->packet_id = 0; + ipv4_hdr->src_addr = rte_cpu_to_be_32(conn->src->ip); + if (is_ts_pkt) { + ipv4_hdr->dst_addr = rte_cpu_to_be_32(POU_IP); + } else { + ipv4_hdr->dst_addr = rte_cpu_to_be_32(conn->dst->ip); + } + ipv4_hdr->total_length = rte_cpu_to_be_16( + sizeof(struct pkt_hdr) - sizeof(struct rte_ether_hdr)); + ipv4_hdr->hdr_checksum = 0; + buf->l3_len = sizeof(struct rte_ipv4_hdr); + + // construct l4 header + udp_hdr = &pkt_data->udp_hdr; + udp_hdr->src_port = rte_cpu_to_be_16(conn->src_port); + if (is_ts_pkt) { + udp_hdr->dst_port = rte_cpu_to_be_16(POU_PORT); + } else { + udp_hdr->dst_port = rte_cpu_to_be_16(conn->dst_port); + } + udp_hdr->dgram_cksum = 0; /* No UDP checksum. 
*/ + udp_hdr->dgram_len = rte_cpu_to_be_16(sizeof(struct pkt_hdr) + + expected_payload_size[type] - sizeof(struct rte_ether_hdr) - + sizeof(struct rte_udp_hdr)); + buf->l4_len = sizeof(struct rte_udp_hdr); + + if (is_ts_pkt) { + // set misc flags + buf->ol_flags |= PKT_TX_IEEE1588_TMST; + pkt_data->ptp_hdr.ptp_ver = 0x2; // VER 2 + pkt_data->ptp_hdr.ptp_msg_type = 0x0; // SYNC + } else { + pkt_data->ptp_hdr.ptp_ver = 0xff; // invalid ver + } + + pkt_data->type = rte_cpu_to_be_16(type); + pkt_data->magic = rte_cpu_to_be_32(ETHER_FRAME_MAGIC); + + return pkt_data; +} + +// returns 0 on success +static inline int +alloc_pkt_hdr(struct rte_mempool *pool, uint16_t type, + const struct conn_spec *conn, struct rte_mbuf **mbuf_out, + struct pkt_hdr **hdr_out) +{ + struct pkt_hdr *hdr; + struct rte_mbuf *pkt = rte_pktmbuf_alloc(pool); + if (pkt == nullptr) { + return -1; + } + + // printf("alloc_pkt_hdr:\n"); + // printf("from "); + // print_mac(&conn->src->mac_addr); + // printf("\nto "); + // print_mac(&conn->dst->mac_addr); + // printf("\n"); + + hdr = construct_pkt_hdr(pkt, type, conn); + if (hdr == nullptr) { + rte_pktmbuf_free(pkt); + return -1; + } + + *mbuf_out = pkt; + *hdr_out = hdr; + return 0; +} + +static inline struct pkt_hdr * +check_valid_packet(struct rte_mbuf *pkt, const struct rte_ether_addr *host_mac) +{ + struct pkt_hdr *pkt_data = nullptr; + const struct rte_ether_addr *expected_mac = nullptr; + uint16_t type; + const uint32_t data_len = rte_pktmbuf_data_len(pkt); + + if (data_len < sizeof(struct pkt_hdr)) { + return nullptr; + } + + pkt_data = rte_pktmbuf_mtod(pkt, struct pkt_hdr *); + + // check MAGIC + if (rte_be_to_cpu_32(pkt_data->magic) != ETHER_FRAME_MAGIC) { + return nullptr; + } + + type = rte_be_to_cpu_16(pkt_data->type); + // check type and payload size + if ((type >= NUM_PKT_TYPES) || + (data_len < + (sizeof(struct pkt_hdr) + + expected_payload_size[rte_be_to_cpu_16(pkt_data->type)]))) { + return nullptr; + } + + // strict dest mac filter + 
if (host_mac != nullptr) { + if (is_l2ts_pkt(type)) { + // dst mac must be the broadcast addr + expected_mac = &POU_MAC; + } else { + // dst mac must match the host mac + expected_mac = host_mac; + } + + if (!rte_is_same_ether_addr( + expected_mac, &pkt_data->eth_hdr.d_addr)) + return nullptr; + } + + return pkt_data; } diff --git a/inc/util.h b/inc/util.h new file mode 100644 index 0000000..a7679db --- /dev/null +++ b/inc/util.h @@ -0,0 +1,102 @@ +#pragma once +#include +#include + +#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \ + TypeName(const TypeName &) = delete; \ + void operator=(const TypeName &) = delete + +constexpr static unsigned long S2NS = 1000000000UL; +constexpr static unsigned long S2US = 1000000UL; +constexpr static unsigned long MS2NS = 1000000UL; +constexpr static uint16_t MIN_RANDOM_PORT = 1000; +constexpr static uint16_t DEFAULT_RAT_PORT = 1234; +constexpr static unsigned int INIT_DELAY = 2; + +constexpr static int NEXT_CPU_NULL = -1; +static inline int +cmask_get_next_cpu(uint64_t *mask) +{ + int ffs = ffsll(*mask); + *mask &= ~(1 << (ffs - 1)); + return ffs - 1; +} + +static inline int +cmask_get_num_cpus(const uint64_t mask) +{ + return _mm_popcnt_u64(mask); +} + +// constexpr static int LATENCY_MEASURE_TIMES = 10000; + +// static inline void +// sync_port_clock(uint16_t portid) +//{ +// int64_t lat = 0; +// int64_t get_time_lat; +// int64_t write_time_lat; +// struct timespec dum; +// struct timespec start; +// struct timespec end; +// +// // measure clock_gettime latency +// for(int i = 0; i < LATENCY_MEASURE_TIMES; i++) { +// // end - start ~= 2x clock_gettime's latency +// clock_gettime(CLOCK_REALTIME, &start); +// clock_gettime(CLOCK_REALTIME, &dum); +// clock_gettime(CLOCK_REALTIME, &end); +// +// if (end.tv_sec != start.tv_sec) { +// rte_exit(EXIT_FAILURE, "clock_gettime too slow\n"); +// } +// +// // shouldn't overflow +// lat += (end.tv_nsec - start.tv_nsec) / 2; +// } +// get_time_lat = lat / LATENCY_MEASURE_TIMES; +// +// // 
measure rte_eth_timesync_write_time latency +// lat = 0; +// for(int i = 0; i < LATENCY_MEASURE_TIMES; i++) { +// // end - start ~= rte_eth_timesync latency + clock_gettime's latency +// clock_gettime(CLOCK_REALTIME, &dum); +// clock_gettime(CLOCK_REALTIME, &start); +// if (rte_eth_timesync_write_time(portid, &dum) != 0) { +// rte_exit(EXIT_FAILURE, "failed to write time\n"); +// } +// clock_gettime(CLOCK_REALTIME, &end); +// +// if (end.tv_sec != start.tv_sec) { +// rte_exit(EXIT_FAILURE, "clock_gettime too slow!\n"); +// } +// +// // shouldn't overflow +// int64_t elat = (end.tv_nsec - start.tv_nsec) - get_time_lat; +// if (elat < 0) { +// rte_exit(EXIT_FAILURE, "something is wrong with lat \n"); +// } +// lat += elat; +// } +// write_time_lat = lat / LATENCY_MEASURE_TIMES; +// +// int64_t delta = (get_time_lat + write_time_lat) / 2; +// int64_t s2ns = (int64_t)S2NS; +// // sync the clock +// while (true) { +// clock_gettime(CLOCK_REALTIME, &dum); +// dum.tv_nsec += delta; +// if (dum.tv_nsec > s2ns) { +// // try again if overflow +// continue; +// } +// if (rte_eth_timesync_write_time(portid, &dum) != 0) { +// rte_exit(EXIT_FAILURE, "failed to write time\n"); +// } +// break; +// } +// rte_eth_timesync_enable(portid); +// +// printf("Sync-ed time: get lat %ld write lat %ld\n", get_time_lat, +// write_time_lat); +//} diff --git a/khat/khat.cc b/khat/khat.cc index ab509d3..e186ef7 100644 --- a/khat/khat.cc +++ b/khat/khat.cc @@ -1,222 +1,435 @@ -#include -#include #include +#include +#include #include #include -#include -#include -#include -#include -#include #include #include -#include +#include +#include +#include #include +#include "nm.h" +#include "ntr.h" #include "pkt.h" -#include "ntrlog.h" -#include "rte_arp.h" -#include "rte_mbuf_core.h" +#include "util.h" -NTR_DECL_IMPL; +#include +#include +#include -constexpr unsigned int MBUF_MAX_COUNT = 8191; -constexpr unsigned int MBUF_CACHE_SIZE = 250; -constexpr unsigned int RX_RING_SIZE = 1024; -constexpr 
unsigned int TX_RING_SIZE = 1024; -constexpr unsigned int RX_RING_NUM = 1; -constexpr unsigned int TX_RING_NUM = 1; -constexpr unsigned int BURST_SIZE = 32; +constexpr static unsigned int MBUF_MAX_COUNT = 65536; +constexpr static unsigned int MBUF_CACHE_SIZE = 512; +constexpr static unsigned int RX_RING_SIZE = 4096; +constexpr static unsigned int TX_RING_SIZE = 4096; +constexpr static unsigned int BURST_SIZE = 8; -static const struct rte_eth_conf port_conf_default{}; +static const struct rte_mbuf_dynfield rte_mbuf_dynfield_probe_flag = { + .name = "rte_mbuf_dynfield_probe_flag", + .size = sizeof(uint32_t), + .align = __alignof__(uint32_t), + .flags = 0 +}; + +static int PROBE_FLAG_OFFSET { 0 }; +static const struct rte_eth_conf port_conf_default { +}; + +// keep track of the probe state +// when a probe packet first arrives this state is set to be influx and the +// rte_mbuf's userdata is set to PROBE_MAGIC which prevents other probe packets +// to be processed when the server sends the probe stats back to user influx is +// released this is to guarantee that the server only processes one probe packet +// at the time +// XXX: also this can be attached to the mbuf itself and processed by the lcore +// thread +// I kept this global because globally there could be only one pending +// probe request and rx_add_timestamp can save their shit here too +struct thread_info { + int tid; + int rxqid; + int txqid; + int lcore_id; +}; + +// state machine: +constexpr static int SERVER_STATE_WAIT = 0; +constexpr static int SERVER_STATE_PROBE = 1; + +struct probe_state_t { + struct net_spec dst; + struct conn_spec cspec { + .dst = &dst + }; + uint32_t epoch; + uint64_t last_sw_rx; + uint64_t last_sw_tx; + uint64_t last_hw_rx; +}; struct options_t { - //states - uint16_t s_portid; - struct rte_ether_addr s_host_mac; - struct rte_mempool * s_pkt_mempool; + // config + int num_threads { 1 }; + uint64_t cpuset { 0x4 }; // 2nd core + uint64_t memmask { 0x0 }; // same socket as the NIC 
+ + // states + uint16_t s_portid { 0 }; + struct net_spec s_host_spec { + }; + struct rte_mempool *s_pkt_mempool { nullptr }; + std::atomic s_state { SERVER_STATE_WAIT }; + struct probe_state_t s_probe_info; + std::vector s_thr_info; }; struct options_t options; static uint16_t rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused, - struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused, void *_ __rte_unused) + struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused, + void *_ __rte_unused) { - uint64_t now = rte_rdtsc(); - struct packet_data * pkt_data; - for (int i = 0; i < nb_pkts; i++) { - pkt_data = check_valid_packet(pkts[i]); + uint64_t now = nm_get_uptime_ns(); + struct timespec ts { + }; + struct pkt_hdr *pkt_data; + for (int i = 0; i < nb_pkts; i++) { + pkt_data = check_valid_packet( + pkts[i], &options.s_host_spec.mac_addr); - if (pkt_data == NULL) { - ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "rx_add_timestamp: ignoring invalid packet %p.\n", (void*)pkts[i]); - continue; - } + if (pkt_data == nullptr) { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "rx_add_timestamp: ignoring invalid packet %p.\n", + (void *)pkts[i]); + continue; + } - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "rx_add_timestamp: tagged packet %p with %llu.\n", (void*)pkts[i], now); - pkt_data->srv_ts_rx = rte_cpu_to_be_64(now); - } + if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE) { + int state_wait = SERVER_STATE_WAIT; + *RTE_MBUF_DYNFIELD( + pkts[i], PROBE_FLAG_OFFSET, uint32_t *) = 0; + if (rte_eth_timesync_read_rx_timestamp( + port, &ts, pkts[i]->timesync & 0x3) == 0) { + if (options.s_state.compare_exchange_strong( + state_wait, SERVER_STATE_PROBE)) { + // mark the mbuf as probe packet being + // processed only the locore that + // receives the pkt w/ userdata != + // nullptr processes that packet + *RTE_MBUF_DYNFIELD(pkts[i], + PROBE_FLAG_OFFSET, uint32_t *) = 1; + // tag with timestamps + options.s_probe_info.last_hw_rx = + ts.tv_nsec + 
ts.tv_sec * S2NS; + options.s_probe_info.last_sw_rx = now; + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "rx_add_timestamp: tagged packet %p epoch %d with sw: %lu hw:%lu.\n", + (void *)pkts[i], + options.s_probe_info.epoch, now, + options.s_probe_info.last_hw_rx); + } else + ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, + "rx_add_timestamp: packet %p not tagged - server is processing a probe.\n", + (void *)pkts[i]); + } else + ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, + "rx_add_timestamp: packet %p not tagged - hw rx timestamp not available.\n", + (void *)pkts[i]); + } else + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "rx_add_timestamp: packet %p not tagged - type %d.\n", + (void *)pkts[i], rte_be_to_cpu_16(pkt_data->type)); + } - return nb_pkts; + return nb_pkts; } static uint16_t -tx_calc_latency(uint16_t port __rte_unused, uint16_t qidx __rte_unused, - struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused) +tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused, + struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused) { - uint64_t now = rte_rdtsc(); - struct packet_data * pkt_data; + uint64_t now = nm_get_uptime_ns(); + struct pkt_hdr *pkt_data; - for (int i = 0; i < nb_pkts; i++) { + for (int i = 0; i < nb_pkts; i++) { - pkt_data = check_valid_packet(pkts[i]); + pkt_data = check_valid_packet( + pkts[i], &options.s_host_spec.mac_addr); - if (pkt_data == NULL) { - ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "tx_calc_latency: ignoring invalid packet %p.\n", (void*)pkts[i]); - continue; - } + if (pkt_data == nullptr) { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "tx_add_timestamp: ignoring invalid packet %p.\n", + (void *)pkts[i]); + continue; + } - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "tx_calc_latency: tagged packet %p with %llu.\n", (void*)pkts[i], now); - pkt_data->srv_ts_tx = rte_cpu_to_be_64(now); - } + if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE_RESP) { + // this packet is the response to PROBE packets - return nb_pkts; + // at this time the packet 
is not sent to the NIC yet so + // the state must be waiting stats + // XXX: this should be an assert + if (options.s_state.load() != SERVER_STATE_PROBE || + *RTE_MBUF_DYNFIELD( + pkts[i], PROBE_FLAG_OFFSET, uint32_t *) != 1) { + rte_exit(EXIT_FAILURE, + "packet %p sent to NIC before sw callback\n", + (void *)pkts[i]); + } + + options.s_probe_info.last_sw_tx = now; + + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "tx_add_timestamp: tagged packet %p with sw tx %lu\n", + (void *)pkts[i], options.s_probe_info.last_sw_tx); + } else { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "tx_add_timestamp: packet %p not tagged - type %d\n", + (void *)pkts[i], pkt_data->type); + } + } + + return nb_pkts; +} + +noreturn static int +locore_main(void *ti) +{ + auto tinfo = (struct thread_info *)ti; + struct rte_mbuf *bufs[BURST_SIZE]; + // + 1 because it might involve an extra PKT_TYPE_STAT packet + // when all tx timestamps are ready + struct rte_mbuf *tx_bufs[BURST_SIZE]; + struct pkt_hdr *pkt_data; + + bool pending_probe = false; + + if (rte_eth_dev_socket_id(options.s_portid) > 0 && + rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) { + ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, + "locore_main : WARNING, port %d is on remote NUMA node to " + "polling thread.\n\tPerformance will " + "not be optimal.\n", + tinfo->tid, options.s_portid); + } + + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + "locore_main : running on locore %d with txidx %d and rxidx %d.\n", + tinfo->tid, rte_lcore_id(), tinfo->txqid, tinfo->rxqid); + + while (true) { + uint16_t nb_tx = 0; + const uint16_t nb_rx = rte_eth_rx_burst( + options.s_portid, tinfo->rxqid, bufs, BURST_SIZE); + struct rte_mbuf *pkt_buf; + struct pkt_hdr *tx_data; + + for (int i = 0; i < nb_rx; i++) { + // XXX: optimization: in rx_add_timestamp every packet + // is already validated once can just mark valid packet + // with a value so we can avoid this redundant check + pkt_data = check_valid_packet( + bufs[i], &options.s_host_spec.mac_addr); + + if 
(pkt_data == nullptr) { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "locore_main : skipping invalid packet %p.\n", + tinfo->tid, (void *)bufs[i]); + // dump_pkt(bufs[i]); + rte_pktmbuf_free(bufs[i]); + continue; + } + + NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, pkt_data, + "locore_main : ", tinfo->tid); + switch (rte_be_to_cpu_16(pkt_data->type)) { + case PKT_TYPE_PROBE: { + if (options.s_state.load() == + SERVER_STATE_PROBE && + *RTE_MBUF_DYNFIELD(bufs[i], + PROBE_FLAG_OFFSET, uint32_t *) == 1) { + // send back probe_resp pkt to probe for + // return latency + pending_probe = true; + + // book keep probe results + options.s_probe_info.epoch = + rte_be_to_cpu_32( + ((struct pkt_payload_epoch *) + pkt_data->payload) + ->epoch); + pkt_hdr_to_netspec(pkt_data, + &options.s_probe_info.dst, + &options.s_probe_info.cspec + .dst_port, + nullptr, + &options.s_probe_info.cspec + .src_port); + options.s_probe_info.cspec.src = + &options.s_host_spec; + + if (alloc_pkt_hdr(options.s_pkt_mempool, + PKT_TYPE_PROBE_RESP, + &options.s_probe_info.cspec, + &pkt_buf, &tx_data) != 0) { + rte_exit(EXIT_FAILURE, + "failed to allocate pkt\n"); + } + + rte_memcpy(tx_data->payload, + pkt_data->payload, + sizeof(struct pkt_payload_epoch)); + + *RTE_MBUF_DYNFIELD(pkt_buf, + PROBE_FLAG_OFFSET, uint32_t *) = 1; + + // queue for burst send + tx_bufs[nb_tx++] = pkt_buf; + } + break; + } + case PKT_TYPE_LOAD: { + struct conn_spec cspec; + struct net_spec src; + struct net_spec dst; + + pkt_hdr_to_netspec(pkt_data, &src, + &cspec.dst_port, &dst, &cspec.src_port); + cspec.dst = &src; + cspec.src = &dst; + + // we reply to load packet regardless of the + // server state + if (alloc_pkt_hdr(options.s_pkt_mempool, + PKT_TYPE_LOAD_RESP, &cspec, &pkt_buf, + &tx_data) != 0) { + rte_exit(EXIT_FAILURE, + "failed to allocate pkt\n"); + } + + rte_memcpy(tx_data->payload, pkt_data->payload, + sizeof(struct pkt_payload_load)); + + // queue for burst send + tx_bufs[nb_tx++] = pkt_buf; + break; + } + default: + 
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "locore_main : ignoring packet %p with unknown type %d.\n", + tinfo->tid, (void *)bufs[i], + rte_be_to_cpu_16(pkt_data->type)); + break; + } + rte_pktmbuf_free(bufs[i]); + } + + // send the packets + if (nb_tx > 0) { + const uint16_t nb_tx_succ = rte_eth_tx_burst( + options.s_portid, tinfo->txqid, tx_bufs, nb_tx); + if (nb_tx_succ < nb_tx) { + rte_exit(EXIT_FAILURE, + "failed to send some packets.\n"); + } + } + + // we wanna check every loop not only when there are packets + if (pending_probe) { + struct timespec ts { + }; + struct pkt_payload_stat *stat; + if (rte_eth_timesync_read_tx_timestamp( + options.s_portid, &ts) == 0) { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "locore_main : obtained hw tx timestamp %lu.\n", + tinfo->tid, + (ts.tv_sec * S2NS + ts.tv_nsec)); + // now we have everything we need + + if (alloc_pkt_hdr(options.s_pkt_mempool, + PKT_TYPE_STAT, + &options.s_probe_info.cspec, &pkt_buf, + &tx_data) != 0) { + rte_exit(EXIT_FAILURE, + "failed to alloc pkt_buf\n"); + } + + // populate stats + stat = (struct pkt_payload_stat *) + tx_data->payload; + stat->epoch = rte_cpu_to_be_32( + options.s_probe_info.epoch); + stat->hw_rx = rte_cpu_to_be_64( + options.s_probe_info.last_hw_rx); + stat->hw_tx = rte_cpu_to_be_64( + ts.tv_nsec + ts.tv_sec * S2NS); + stat->sw_rx = rte_cpu_to_be_64( + options.s_probe_info.last_sw_rx); + stat->sw_tx = rte_cpu_to_be_64( + options.s_probe_info.last_sw_tx); + + // send the packet + if (rte_eth_tx_burst(options.s_portid, + tinfo->txqid, &pkt_buf, 1) < 1) { + rte_exit(EXIT_FAILURE, + "failed to send some packets.\n"); + } + + // release flux + pending_probe = false; + + int expected = SERVER_STATE_PROBE; + if (!options.s_state.compare_exchange_strong( + expected, SERVER_STATE_WAIT)) { + rte_exit(EXIT_FAILURE, + "s_state changed unexpectedly!"); + } + } + } + } } static int -locore_main(void * _unused __rte_unused) -{ - struct rte_mbuf *bufs[BURST_SIZE]; - struct rte_mbuf 
*tx_bufs[BURST_SIZE]; - struct packet_data *pkt_data; - uint32_t core_id = rte_lcore_id(); - - if (rte_eth_dev_socket_id(options.s_portid) > 0 && rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) { - ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "locore_main: WARNING, port %d is on remote NUMA node to " - "polling thread.\n\tPerformance will " - "not be optimal.\n", options.s_portid); - } - - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d running.\n", core_id); - - while(true) { - uint16_t nb_tx = 0; - const uint16_t nb_rx = rte_eth_rx_burst(options.s_portid, 0, bufs, BURST_SIZE); - - if (nb_rx == 0) { - continue; - } - - for(int i = 0; i < nb_rx; i++) { - - pkt_data = check_valid_packet(bufs[i]); - - if (pkt_data == NULL) { - ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: core %d skipping invalid packet %p.\n", core_id, (void*)bufs[i]); - dump_pkt(bufs[i]); - rte_pktmbuf_free(bufs[i]); - continue; - } - - uint32_t dst_ip = rte_be_to_cpu_32(pkt_data->pkt_hdr.ipv4_hdr.dst_addr); - uint32_t src_ip = rte_be_to_cpu_32(pkt_data->pkt_hdr.ipv4_hdr.src_addr); - uint16_t src_port = rte_be_to_cpu_16(pkt_data->pkt_hdr.udp_hdr.src_port); - uint16_t dst_port = rte_be_to_cpu_16(pkt_data->pkt_hdr.udp_hdr.dst_port); - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d packet %p from %d.%d.%d.%d(%x:%x:%x:%x:%x:%x) to %d.%d.%d.%d(%x:%x:%x:%x:%x:%x), sport %d, dport %d, epoch %d\n", - core_id, - (void*)bufs[i], - (src_ip >> 24) & 0xff, - (src_ip >> 16) & 0xff, - (src_ip >> 8) & 0xff, - (src_ip >> 0) & 0xff, - pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[0], - pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[1], - pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[2], - pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[3], - pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[4], - pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[5], - (dst_ip >> 24) & 0xff, - (dst_ip >> 16) & 0xff, - (dst_ip >> 8) & 0xff, - (dst_ip >> 0) & 0xff, - pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[0], - 
pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[1], - pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[2], - pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[3], - pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[4], - pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[5], - src_port, - dst_port, - rte_be_to_cpu_32(pkt_data->epoch)); - // swap s_addr and d_addr - struct rte_mbuf * pkt_buf = rte_pktmbuf_alloc(options.s_pkt_mempool); - if (pkt_buf == NULL) { - rte_exit(EXIT_FAILURE, "locore_main: failed to allocate memory for pkt_buf"); - } - - struct packet_data * tx_data = construct_udp_pkt_hdr(pkt_buf, - &options.s_host_mac, - &pkt_data->pkt_hdr.eth_hdr.s_addr, - dst_ip, - src_ip, - dst_port, - src_port); - if (tx_data == NULL) { - rte_exit(EXIT_FAILURE, "failed to construct tx packet %p", (void*)pkt_buf); - } - // copy, endianess doesn't matter - tx_data->epoch = pkt_data->epoch; - tx_data->magic = pkt_data->magic; - tx_data->clt_ts_rx = pkt_data->clt_ts_rx; - tx_data->clt_ts_tx = pkt_data->clt_ts_tx; - tx_data->srv_ts_rx = pkt_data->srv_ts_rx; - tx_data->srv_ts_tx = pkt_data->srv_ts_tx; - // queue for burst send - tx_bufs[nb_tx++] = pkt_buf; - // free rx packet - rte_pktmbuf_free(bufs[i]); - } - - const uint16_t nb_tx_succ = rte_eth_tx_burst(options.s_portid, 0, tx_bufs, nb_tx); - // cleanup unsent packets - // don't need to free others because it's offloaded - if (nb_tx_succ < nb_tx) { - rte_exit(EXIT_FAILURE, "locore_main: failed to send some packets.\n"); - } - } - - return 0; -} - -static int port_init(uint16_t portid, struct rte_mempool *mbuf_pool) { - struct rte_eth_dev_info dev_info; - struct rte_eth_conf port_conf = port_conf_default; - struct rte_eth_txconf txconf; - struct rte_eth_rxconf rxconf; + struct rte_eth_dev_info dev_info { + }; + struct rte_eth_conf port_conf = port_conf_default; + struct rte_eth_txconf txconf { + }; + struct rte_eth_rxconf rxconf { + }; - uint16_t nb_rxd = RX_RING_SIZE; - uint16_t nb_txd = TX_RING_SIZE; + uint16_t nb_rxd = RX_RING_SIZE; + uint16_t 
nb_txd = TX_RING_SIZE; - if(!rte_eth_dev_is_valid_port(portid)) { - return -1; - } + if (!rte_eth_dev_is_valid_port(portid)) { + return -1; + } - int ret = rte_eth_dev_info_get(portid, &dev_info); - if (ret != 0) { - return ret; - } + int ret = rte_eth_dev_info_get(portid, &dev_info); + if (ret != 0) { + return ret; + } - port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN; - port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM; - port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM; - port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM; - port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM; - port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE; - - /* Configure the Ethernet device. */ - ret = rte_eth_dev_configure(portid, RX_RING_NUM, TX_RING_NUM, &port_conf); + port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN; + port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; + port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_NONFRAG_IPV4_UDP | + ETH_RSS_L2_PAYLOAD | ETH_RSS_NONFRAG_IPV4_TCP; + port_conf.rx_adv_conf.rss_conf.rss_key = nullptr; + port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH; + port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM; + port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM; + port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM; + port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM; + port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE; + + /* Configure the Ethernet device. */ + ret = rte_eth_dev_configure( + portid, options.num_threads, options.num_threads, &port_conf); if (ret != 0) return ret; @@ -224,155 +437,247 @@ port_init(uint16_t portid, struct rte_mempool *mbuf_pool) if (ret != 0) return ret; - /* Allocate and set up 1 RX queue per Ethernet port. */ - rxconf = dev_info.default_rxconf; - for (uint32_t i = 0; i < RX_RING_NUM; i++) { - ret = rte_eth_rx_queue_setup(portid, i, nb_rxd, rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool); + /* Allocate and set up 1 RX queue per thread per Ethernet port. 
*/ + rxconf = dev_info.default_rxconf; + for (int i = 0; i < options.num_threads; i++) { + ret = rte_eth_rx_queue_setup(portid, i, nb_rxd, + rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool); if (ret < 0) return ret; + options.s_thr_info.at(i)->rxqid = i; } - txconf = dev_info.default_txconf; + txconf = dev_info.default_txconf; txconf.offloads = port_conf.txmode.offloads; - /* Allocate and set up 1 TX queue per Ethernet port. */ - for (uint32_t i = 0; i < TX_RING_NUM; i++) { - ret = rte_eth_tx_queue_setup(portid, i, nb_txd, rte_eth_dev_socket_id(portid), &txconf); + /* Allocate and set up 1 TX queue per thread per Ethernet port. */ + for (int i = 0; i < options.num_threads; i++) { + ret = rte_eth_tx_queue_setup( + portid, i, nb_txd, rte_eth_dev_socket_id(portid), &txconf); if (ret < 0) return ret; + options.s_thr_info.at(i)->txqid = i; } - ret = rte_eth_dev_start(portid); - if (ret < 0) - return ret; + ret = rte_eth_dev_start(portid); + if (ret < 0) + return ret; /* Display the port MAC address. */ - struct rte_ether_addr addr; - ret = rte_eth_macaddr_get(portid, &addr); - if (ret != 0) - return ret; - - /* Enable RX in promiscuous mode for the Ethernet device. */ - ret = rte_eth_promiscuous_enable(portid); + struct rte_ether_addr addr { + }; + ret = rte_eth_macaddr_get(portid, &addr); if (ret != 0) return ret; - if (rte_eth_add_tx_callback(portid, 0, tx_calc_latency, NULL) == NULL || rte_eth_add_rx_callback(portid, 0, rx_add_timestamp, NULL) == NULL) { - return -1; - } + ret = rte_eth_timesync_enable(portid); + if (ret != 0) + return ret; + + /* Enable RX in promiscuous mode for the Ethernet device. 
*/ + ret = rte_eth_promiscuous_enable(portid); + if (ret != 0) + return ret; + + for (int i = 0; i < options.num_threads; i++) { + if (rte_eth_add_tx_callback(portid, + options.s_thr_info.at(i)->txqid, tx_add_timestamp, + nullptr) == nullptr || + rte_eth_add_rx_callback(portid, + options.s_thr_info.at(i)->rxqid, rx_add_timestamp, + nullptr) == nullptr) { + return -1; + } + } + + // sync_port_clock(portid); return 0; } -static void usage() +static void +usage() { - fprintf(stdout, - "Usage:\n" \ - " -v(vv): verbose mode\n" \ - " -h: display the information\n"); + fprintf(stdout, + "Usage:\n" + " -v(vv): verbose mode\n" + " -h: seek help\n" + " -A: cpu mask for worker threads\n" + " -M: mempool socket affinity mask\n" + " -H: host spec\n"); + fflush(stdout); } -int main(int argc, char* argv[]) +static void +dump_options() { - unsigned int nb_ports; - struct rte_mempool *mbuf_pool, *mbuf_pool_pkt; - - // init dpdk - int ret = rte_eal_init(argc, argv); - if (ret < 0) { - rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n"); - } - - argc -= ret; - argv += ret; - - // set warning level - ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING); - { - int c; - // parse arguments - while((c = getopt(argc, argv, "hv")) != -1) { - switch (c) { - case 'v': - ntr_set_level(NTR_DEP_USER1, ntr_get_level(NTR_DEP_USER1) + 1); - break; - case 'h': - usage(); - rte_exit(EXIT_SUCCESS, NULL); - break; - default: - usage(); - rte_exit(EXIT_SUCCESS, "unknown argument: %c", c); - break; - } - } - } - - // XXX: singal handler to exit - - nb_ports = rte_eth_dev_count_avail(); - if (nb_ports == 0) { - rte_exit(EXIT_FAILURE, "number of ports must be > 0\n"); - } - - // create a mbuf memory pool on the socket - mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id()); - if (mbuf_pool == nullptr) { - rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n"); - } - - // create a pkt mbuf memory pool on the socket - mbuf_pool_pkt = 
rte_pktmbuf_pool_create("MBUF_POOL_PKT", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id()); - if (mbuf_pool_pkt == nullptr) { - rte_exit(EXIT_FAILURE, "cannot create mbuf_pkt pool\n"); - } - options.s_pkt_mempool = mbuf_pool_pkt; - - - uint16_t portid = rte_eth_find_next(0); - if (portid == RTE_MAX_ETHPORTS) { - rte_exit(EXIT_FAILURE, "cannot find an available port\n"); - } - options.s_portid = portid; - - if (port_init(portid, mbuf_pool) != 0) { - rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid); - } - - if (rte_eth_macaddr_get(portid, &options.s_host_mac) != 0) { - rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n", portid); - } - - ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid, - options.s_host_mac.addr_bytes[0], - options.s_host_mac.addr_bytes[1], - options.s_host_mac.addr_bytes[2], - options.s_host_mac.addr_bytes[3], - options.s_host_mac.addr_bytes[4], - options.s_host_mac.addr_bytes[5]); - - - uint16_t lcore_id = rte_get_next_lcore(0, true, false); - - if (lcore_id == RTE_MAX_LCORE) { - rte_exit(EXIT_FAILURE, "cannot detect lcores.\n"); - } - - if (rte_eal_remote_launch(locore_main, NULL, lcore_id) != 0) { - rte_exit(EXIT_FAILURE, "failed to launch function on locore %d\n", lcore_id); - } - - // while(true) { - // struct rte_eth_stats stats; - // rte_eth_stats_get(portid, &stats); - // printf("recv: %d missed: %d err: %d\n",(uint32_t)stats.ipackets, (uint32_t)stats.imissed,(uint32_t)stats.ierrors); - // usleep(1000000); - // } - - if (rte_eal_wait_lcore(lcore_id) != 0) { - rte_exit(EXIT_FAILURE, "failed to wait for locore %d\n", lcore_id); - } - - // shouldn't get here - - return 0; + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + "main: khat configuration:\n" + " verbosity: +%d\n" + " thread count: %d\n" + " cpu mask: 0x%lx\n" + " mempool mask: 0x%lx\n" + " ip: 0x%x\n", + ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, + options.num_threads, options.cpuset, 
options.memmask, + options.s_host_spec.ip); +} + +int +main(int argc, char *argv[]) +{ + unsigned int nb_ports; + struct rte_mempool *mbuf_pool; + bool has_host_spec { false }; + + ntr_init(); + + // init dpdk + int ret = rte_eal_init(argc, argv); + if (ret < 0) { + rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n"); + } + + argc -= ret; + argv += ret; + + // set warning level + ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING); + { + int c; + // parse arguments + while ((c = getopt(argc, argv, "hvA:M:H:")) != -1) { + switch (c) { + case 'v': + ntr_set_level(NTR_DEP_USER1, + ntr_get_level(NTR_DEP_USER1) + 1); + break; + case 'h': + usage(); + rte_exit(EXIT_SUCCESS, "\n"); + case 'A': + options.cpuset = strtoull(optarg, nullptr, 16); + options.num_threads = cmask_get_num_cpus( + options.cpuset); + if (options.num_threads == 0) { + rte_exit(EXIT_FAILURE, + "must run at least one thread\n"); + } + break; + case 'M': + options.memmask = strtoull(optarg, nullptr, 16); + break; + case 'H': + if (str_to_netspec( + optarg, &options.s_host_spec) != 0) { + rte_exit(EXIT_FAILURE, + "invalid host spec\n"); + } + has_host_spec = true; + break; + default: + usage(); + rte_exit( + EXIT_SUCCESS, "unknown argument: %c", c); + } + } + } + + if (!has_host_spec) { + rte_exit(EXIT_FAILURE, "Must specify host spec\n"); + } + + dump_options(); + + // init nm + if (nm_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) { + rte_exit(EXIT_FAILURE, "nm init failed!\n"); + } + + // register dynamic field + PROBE_FLAG_OFFSET = rte_mbuf_dynfield_register( + &rte_mbuf_dynfield_probe_flag); + if (PROBE_FLAG_OFFSET < 0) { + rte_exit(EXIT_FAILURE, "failed to register dynamic field\n"); + } + + nb_ports = rte_eth_dev_count_avail(); + if (nb_ports == 0) { + rte_exit(EXIT_FAILURE, "number of ports must be > 0\n"); + } + + uint16_t portid = rte_eth_find_next(0); + if (portid == RTE_MAX_ETHPORTS) { + rte_exit(EXIT_FAILURE, "cannot find an available port\n"); + } + options.s_portid = portid; + + if 
(rte_eth_macaddr_get(portid, &options.s_host_spec.mac_addr) != 0) { + rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n", + portid); + } + + // create a mbuf memory pool on the socket + mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", + MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, + RTE_MBUF_DEFAULT_BUF_SIZE, rte_eth_dev_socket_id(portid)); + if (mbuf_pool == nullptr) { + rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n"); + } + + options.s_pkt_mempool = mbuf_pool; + + // init threads + uint64_t cpuset = options.cpuset; + for (int i = 0; i < options.num_threads; i++) { + auto *tinfo = new struct thread_info; + tinfo->tid = i; + tinfo->lcore_id = cmask_get_next_cpu(&cpuset); + options.s_thr_info.push_back(tinfo); + } + + if (port_init(portid, mbuf_pool) != 0) { + rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid); + } + + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + "Configured port %d on socket %d with mac addr %x:%x:%x:%x:%x:%x\n", + portid, rte_eth_dev_socket_id(portid), + options.s_host_spec.mac_addr.addr_bytes[0], + options.s_host_spec.mac_addr.addr_bytes[1], + options.s_host_spec.mac_addr.addr_bytes[2], + options.s_host_spec.mac_addr.addr_bytes[3], + options.s_host_spec.mac_addr.addr_bytes[4], + options.s_host_spec.mac_addr.addr_bytes[5]); + + sleep(INIT_DELAY); + + for (int i = 0; i < options.num_threads; i++) { + struct thread_info *tinfo = options.s_thr_info.at(i); + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + "main: launching thread %d on locore %d\n", tinfo->tid, + tinfo->lcore_id); + if (rte_eal_remote_launch(locore_main, + (void *)options.s_thr_info.at(i), + tinfo->lcore_id) != 0) { + rte_exit(EXIT_FAILURE, + "failed to launch function on locore %d\n", + tinfo->lcore_id); + } + } + + for (int i = 0; i < options.num_threads; i++) { + struct thread_info *tinfo = options.s_thr_info.at(i); + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + "main: waiting for locore %d...\n", tinfo->lcore_id); + if (rte_eal_wait_lcore(tinfo->lcore_id) != 0) { + rte_exit(EXIT_FAILURE, 
"failed to wait for locore %d\n", + tinfo->lcore_id); + } + } + + // shouldn't get here + // clean up + rte_eth_dev_stop(portid); + + return 0; } diff --git a/libgen/generator.cc b/libgen/generator.cc new file mode 100644 index 0000000..d892105 --- /dev/null +++ b/libgen/generator.cc @@ -0,0 +1,95 @@ +// modified from mutilate + +#include "gen.h" + +Generator * +createFacebookKey() +{ + return new GEV(30.7984, 8.20449, 0.078688); +} + +Generator * +createFacebookValue() +{ + Generator *g = new GPareto(15.0, 214.476, 0.348238); + + Discrete *d = new Discrete(g); + d->add(0.00536, 0.0); + d->add(0.00047, 1.0); + d->add(0.17820, 2.0); + d->add(0.09239, 3.0); + d->add(0.00018, 4.0); + d->add(0.02740, 5.0); + d->add(0.00065, 6.0); + d->add(0.00606, 7.0); + d->add(0.00023, 8.0); + d->add(0.00837, 9.0); + d->add(0.00837, 10.0); + d->add(0.08989, 11.0); + d->add(0.00092, 12.0); + d->add(0.00326, 13.0); + d->add(0.01980, 14.0); + + return d; +} + +Generator * +createFacebookIA() +{ + return new GPareto(0, 16.0292, 0.154971); +} + +Generator * +createGenerator(std::string str) +{ + if (!strcmp(str.c_str(), "fb_key")) + return createFacebookKey(); + else if (!strcmp(str.c_str(), "fb_value")) + return createFacebookValue(); + else if (!strcmp(str.c_str(), "fb_ia")) + return createFacebookIA(); + + char *s_copy = new char[str.length() + 1]; + strcpy(s_copy, str.c_str()); + char *saveptr = NULL; + + if (atoi(s_copy) != 0 || !strcmp(s_copy, "0")) { + double v = atof(s_copy); + delete[] s_copy; + return new Fixed(v); + } + + char *t_ptr = strtok_r(s_copy, ":", &saveptr); + char *a_ptr = strtok_r(NULL, ":", &saveptr); + + if (t_ptr == NULL) // || a_ptr == NULL) + DIE("strtok(.., \":\") failed to parse %s", str.c_str()); + + saveptr = NULL; + char *s1 = strtok_r(a_ptr, ",", &saveptr); + char *s2 = strtok_r(NULL, ",", &saveptr); + char *s3 = strtok_r(NULL, ",", &saveptr); + + double a1 = s1 ? atof(s1) : 0.0; + double a2 = s2 ? atof(s2) : 0.0; + double a3 = s3 ? 
atof(s3) : 0.0; + + delete[] s_copy; + + if (strcasestr(str.c_str(), "fixed")) + return new Fixed(a1); + else if (strcasestr(str.c_str(), "normal")) + return new Normal(a1, a2); + else if (strcasestr(str.c_str(), "exponential")) + return new Exponential(a1); + else if (strcasestr(str.c_str(), "pareto")) + return new GPareto(a1, a2, a3); + else if (strcasestr(str.c_str(), "gev")) + return new GEV(a1, a2, a3); + else if (strcasestr(str.c_str(), "uniform")) + return new Uniform(a1); + + DIE("Unable to create Generator '%s'", str.c_str()); + + return NULL; +} \ No newline at end of file diff --git a/libnm/nm.cc b/libnm/nm.cc new file mode 100644 index 0000000..f43b774 --- /dev/null +++ b/libnm/nm.cc @@ -0,0 +1,187 @@ +#include +#include + +#include +#include + +#include "nm.h" + +#include +#include + +static const char *SYSCTL_TSC = "machdep.tsc_freq"; + +static int verbose = 0; +static uint64_t sysctl_tsc_freq = 0; + +struct nm_obj { + int level; + int id; + struct nm_obj *parent; + std::vector children; +}; + +static bool +nm_obj_comparator(struct nm_obj *a, struct nm_obj *b) +{ + return a->id < b->id; +} + +static std::vector nodes; +static std::vector cores; +static std::vector cpus; + +std::vector * +nm_get_nodes() +{ + return &nodes; +} + +std::vector * +nm_get_cpus() +{ + return &cpus; +} + +std::vector * +nm_get_cores() +{ + return &cores; +} + +hwloc_obj_t +get_parent_type(hwloc_obj_t obj, hwloc_obj_type_t type) +{ + while (obj != nullptr) { + if (obj->type == type) { + break; + } + obj = obj->parent; + } + return obj; +} + +uint64_t +nm_get_uptime_ns() +{ + unsigned int dummy; + return nm_tsc2ns(__rdtscp(&dummy)); +} + +uint64_t +nm_tsc2ns(uint64_t tsc) +{ + return (uint64_t)( + (double)tsc / (double)sysctl_tsc_freq * (double)1000000000ul); +} + +// 0 on success +// -1 on error +int +nm_init(int verbosity) +{ + int ret; + size_t sz = sizeof(sysctl_tsc_freq); + verbose = verbosity; + + // init nm_tsc2ns + if ((ret = sysctlbyname( + SYSCTL_TSC, 
&sysctl_tsc_freq, &sz, nullptr, 0)) < 0) { + if (verbose) { + fprintf(stderr, + "libnm: failed to query tsc frequency via sysctl (%d)\n", + errno); + } + return ret; + } + + if (verbose) { + fprintf(stdout, "libnm: tsc frequency: %lu\n", sysctl_tsc_freq); + } + + // init numa stuff + hwloc_topology *topo; + if ((ret = hwloc_topology_init(&topo)) != 0) { + return ret; + } + + if ((ret = hwloc_topology_load(topo)) != 0) + return ret; + + // populate numa nodes + hwloc_obj_t obj = nullptr; + while (true) { + obj = hwloc_get_next_obj_by_type(topo, HWLOC_OBJ_PACKAGE, obj); + if (obj == nullptr) { + break; + } + + auto each = new struct nm_obj; + each->id = obj->logical_index; + each->level = NM_LEVEL_NUMA; + each->parent = nullptr; + nodes.push_back(each); + if (verbose) { + fprintf(stdout, "libnm: identified NUMA node %d\n", + each->id); + } + } + std::sort(nodes.begin(), nodes.end(), nm_obj_comparator); + + // populate cpus + obj = nullptr; + while (true) { + obj = hwloc_get_next_obj_by_type(topo, HWLOC_OBJ_CORE, obj); + if (obj == nullptr) { + break; + } + auto each = new struct nm_obj; + each->id = obj->logical_index; + each->level = NM_LEVEL_CPU; + hwloc_obj_t parent = get_parent_type(obj, HWLOC_OBJ_PACKAGE); + if (parent == nullptr) { + return -1; + } + + // XXX: this faults if the OS decides to be stupid + each->parent = nodes.at(parent->logical_index); + each->parent->children.push_back(each); + cpus.push_back(each); + if (verbose) { + fprintf(stdout, + "libnm: identified CPU %d on NUMA node %d\n", + each->id, each->parent->id); + } + } + std::sort(cpus.begin(), cpus.end(), nm_obj_comparator); + + // populate cores + obj = nullptr; + while (true) { + obj = hwloc_get_next_obj_by_type(topo, HWLOC_OBJ_PU, obj); + if (obj == nullptr) { + break; + } + auto each = new struct nm_obj; + each->id = obj->logical_index; + each->level = NM_LEVEL_CORE; + hwloc_obj_t parent = get_parent_type(obj, HWLOC_OBJ_CORE); + if (parent == nullptr) { + return -1; + } + + // XXX: this 
faults if the OS decides to be stupid + each->parent = cpus.at(parent->logical_index); + each->parent->children.push_back(each); + cores.push_back(each); + if (verbose) { + fprintf(stdout, + "libnm: identified core %d on CPU %d, NUMA node %d\n", + each->id, each->parent->id, + each->parent->parent->id); + } + } + std::sort(cores.begin(), cores.end(), nm_obj_comparator); + + return ret; +} \ No newline at end of file diff --git a/libntr/ntr.c b/libntr/ntr.c new file mode 100644 index 0000000..0c6d502 --- /dev/null +++ b/libntr/ntr.c @@ -0,0 +1,46 @@ +#include "ntr.h" + +static int ntr_log_levels[NTR_DEP_MAX] = { NTR_LEVEL_DEFAULT }; +static FILE *ntr_out; + +void +ntr_init() +{ + ntr_out = stdout; +} + +void +ntr(int dep, int level, const char *fmt, ...) +{ + va_list vl; + va_start(vl, fmt); + if (dep < NTR_DEP_MAX && level <= ntr_log_levels[dep]) { + vfprintf(ntr_out, fmt, vl); + } + va_end(vl); +} + +void +ntr_set_level(int dep, int level) +{ + if (dep < NTR_DEP_MAX) { + ntr_log_levels[dep] = level; + } +} + +void +ntr_set_output(FILE *f) +{ + if (f != NULL) { + ntr_out = f; + } +} + +int +ntr_get_level(int dep) +{ + if (dep < NTR_DEP_MAX) { + return ntr_log_levels[dep]; + } + return 0; +} diff --git a/rat/rat.cc b/rat/rat.cc new file mode 100644 index 0000000..b3bf754 --- /dev/null +++ b/rat/rat.cc @@ -0,0 +1,806 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gen.h" +#include "nm.h" +#include "ntr.h" +#include "pkt.h" +#include "util.h" + +#include +#include +#include + +constexpr static unsigned int MBUF_MAX_COUNT = 65536; +constexpr static unsigned int MBUF_CACHE_SIZE = 512; +constexpr static unsigned int RX_RING_SIZE = 4096; +constexpr static unsigned int TX_RING_SIZE = 4096; +constexpr static unsigned int BURST_SIZE = 8; + +static const struct rte_eth_conf port_conf_default { +}; + +static unsigned int +epoch_mk(unsigned int id, unsigned int epoch) +{ + return (id << 24) | epoch; +} + 
+static unsigned int +epoch_get_id(unsigned int epoch) +{ + return epoch >> 24; +} + +static unsigned int +epoch_get_epoch(unsigned int epoch) +{ + return epoch & 0x00FFFFFF; +} + +struct thread_info { + unsigned int id { 0 }; + unsigned int lcore_id { 0 }; + unsigned int rxqid { 0 }; + unsigned int txqid { 0 }; + std::atomic total_pkts { 0 }; + Generator *ia_gen { nullptr }; + Generator *load_gen { nullptr }; + std::atomic cur_epoch { 0 }; + std::atomic epoch_recv { true }; +}; + +constexpr static int STATE_SYNC = 0; // waiting for SYNC +constexpr static int STATE_SYNC_ACK = 1; // Waiting for sending SYNC_ACK +constexpr static int STATE_RUNNING = 2; // Running +constexpr static int STATE_FIN = 3; // FIN received + +struct options_t { + unsigned int run_time { 5 }; + // parameters + int slave_mode { 0 }; + unsigned long rage_quit_time { (unsigned long)-1 }; + char ia_gen[256] { "fixed" }; + char ld_gen[256] { "fixed:0" }; + uint32_t target_qps { 0 }; + struct net_spec server_spec { + }; + uint64_t cpu_mask { 0x4 }; // 1 thread @ core 2 + + // states + unsigned int s_num_threads { 1 }; // 1 thread + struct rte_mempool *mbuf_pool { nullptr }; + struct net_spec s_host_spec { + }; + struct net_spec s_master_spec { + }; + struct conn_spec s_master_cspec { + .src = &s_host_spec, .src_port = DEFAULT_RAT_PORT, + .dst = &s_master_spec, .dst_port = DEFAULT_RAT_PORT, + }; + uint16_t s_portid { 0 }; + std::vector s_thr_info; + std::atomic s_state { STATE_RUNNING }; // default non master mode + + // states for qps + std::atomic s_ts_begin { 0 }; +}; + +static struct options_t options; + +static inline uint32_t +calc_qps(uint64_t now) +{ + uint32_t tot = 0; + + for (auto i : options.s_thr_info) { + tot += i->total_pkts.load(); + } + + return (uint32_t)((double)tot / + ((double)(now - options.s_ts_begin.load()) / (double)S2NS)); +} + +static void +proto_loop(struct thread_info *tinfo) +{ + struct rte_mbuf *tx_buf; + struct rte_mbuf *rx_bufs[BURST_SIZE]; + struct pkt_hdr 
*pkt_data; + + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + "proto_loop : waiting for SYNC from cat\n", tinfo->id); + while (options.s_state.load() == STATE_SYNC) { + const uint16_t nb_rx = rte_eth_rx_burst( + options.s_portid, tinfo->rxqid, rx_bufs, BURST_SIZE); + if (nb_rx > 0) { + for (int i = 0; i < nb_rx; i++) { + struct pkt_hdr *each = check_valid_packet( + rx_bufs[i], &options.s_host_spec.mac_addr); + + if (each != nullptr) { + uint16_t type = rte_be_to_cpu_16( + each->type); + if (type == PKT_TYPE_SYNC) { + int expected = STATE_SYNC; + + ntr(NTR_DEP_USER1, + NTR_LEVEL_INFO, + "proto_loop : received SYNC from cat\n", + tinfo->id); + + if (!options.s_state + .compare_exchange_strong( + expected, + STATE_SYNC_ACK)) { + // someone barged in, + // listen to that guy + ntr(NTR_DEP_USER1, + NTR_LEVEL_WARNING, + "proto_loop : failed to cmpxchg sync_recv.\n", + tinfo->id); + } else { + pkt_hdr_to_netspec(each, + &options + .s_master_spec, + nullptr, nullptr, + nullptr); + + if (alloc_pkt_hdr( + options + .mbuf_pool, + PKT_TYPE_SYNC_ACK, + &options + .s_master_cspec, + &tx_buf, + &pkt_data) != + 0) { + rte_exit( + EXIT_FAILURE, + "failed to alloc pkt hdr\n"); + } + + if (rte_eth_tx_burst( + options + .s_portid, + tinfo->txqid, + &tx_buf, + 1) != 1) { + rte_exit( + EXIT_FAILURE, + "failed to send packet\n"); + } + + expected = + STATE_SYNC_ACK; + // we've done our job, + // set off the threads + if (!options.s_state + .compare_exchange_strong( + expected, + STATE_RUNNING)) { + rte_exit( + EXIT_FAILURE, + "state unexpectedly changed\n"); + } + + ntr(NTR_DEP_USER1, + NTR_LEVEL_INFO, + "proto_loop : sent SYNC_ACK to cat\n", + tinfo->id); + } + } else { + ntr(NTR_DEP_USER1, + NTR_LEVEL_DEBUG, + "proto_loop : ignoring invalid packet %p type %d.\n", + tinfo->id, + (void *)rx_bufs[i], type); + } + } else { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "proto_loop : ignoring invalid packet %p.\n", + tinfo->id, (void *)rx_bufs[i]); + } + + rte_pktmbuf_free(rx_bufs[i]); + } + } + } + + 
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "proto_loop : exiting loop...\n", tinfo->id); +} + +static void +pkt_loop(struct thread_info *tinfo) +{ + struct rte_mbuf *tx_buf; + struct rte_mbuf *rx_bufs[BURST_SIZE]; + uint64_t next_ts; + uint64_t last_ts; + struct conn_spec srv_cspec; + rdport_generator src_port_gen(MIN_RANDOM_PORT); + rdport_generator dst_port_gen(MIN_RANDOM_PORT); + + srv_cspec.src = &options.s_host_spec; + srv_cspec.dst = &options.server_spec; + + next_ts = nm_get_uptime_ns(); + last_ts = next_ts + options.rage_quit_time * MS2NS; + + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "pkt_loop : entering\n", + tinfo->id); + + while (options.s_state.load() == STATE_RUNNING) { + uint64_t now = nm_get_uptime_ns(); + // always pop incoming packets + const uint16_t nb_rx = rte_eth_rx_burst( + options.s_portid, tinfo->rxqid, rx_bufs, BURST_SIZE); + + if (nb_rx > 0) { + for (int i = 0; i < nb_rx; i++) { + struct pkt_hdr *each = check_valid_packet( + rx_bufs[i], &options.s_host_spec.mac_addr); + + if (each == nullptr) { + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "pkt_loop : ignoring invalid packet %p.\n", + tinfo->id, (void *)rx_bufs[i]); + rte_pktmbuf_free(rx_bufs[i]); + continue; + } + + uint16_t type = rte_be_to_cpu_16(each->type); + NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, each, + "locore_main : ", tinfo->id); + struct pkt_payload_epoch *pld_epoch; + uint32_t epoch; + uint32_t id; + struct thread_info *other_t; + bool bool_expected = false; + int int_expected = STATE_RUNNING; + switch (type) { + case PKT_TYPE_LOAD_RESP: + pld_epoch = (struct pkt_payload_epoch *) + each->payload; + epoch = rte_be_to_cpu_32( + pld_epoch->epoch); + id = epoch_get_id(epoch); + epoch = epoch_get_epoch(epoch); + tinfo->total_pkts.fetch_add(1); + + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "pkt_loop : packet %p epoch %d id %d.\n", + tinfo->id, (void *)rx_bufs[i], + epoch, id); + + if (id >= options.s_num_threads) { + ntr(NTR_DEP_USER1, + NTR_LEVEL_WARNING, + "pkt_loop : packet %p invalid id %d.\n", + 
tinfo->id, + (void *)rx_bufs[i], id); + break; + } + + other_t = options.s_thr_info.at(id); + if (epoch != + other_t->cur_epoch.load()) { + ntr(NTR_DEP_USER1, + NTR_LEVEL_WARNING, + "pkt_loop : packet %p invalid epoch %d != %d.\n", + tinfo->id, + (void *)rx_bufs[i], epoch, + other_t->cur_epoch.load()); + break; + } + if (!other_t->epoch_recv + .compare_exchange_strong( + bool_expected, true)) { + ntr(NTR_DEP_USER1, + NTR_LEVEL_WARNING, + "pkt_loop : failed to cmpxchg with thread %d.\n", + tinfo->id, other_t->id); + break; + } + break; + case PKT_TYPE_FIN: + if (rte_is_same_ether_addr( + &each->eth_hdr.s_addr, + &options.s_master_spec + .mac_addr)) { + ntr(NTR_DEP_USER1, + NTR_LEVEL_DEBUG, + "pkt_loop : recved FIN from cat.\n", + tinfo->id); + // master told us to stop! + if (!options.s_state + .compare_exchange_strong( + int_expected, + STATE_FIN)) { + ntr(NTR_DEP_USER1, + NTR_LEVEL_WARNING, + "pkt_loop : failed to cmpxchg state.\n", + tinfo->id); + } + + uint32_t qps = calc_qps(now); + + struct pkt_hdr *pkt_hdr; + if (alloc_pkt_hdr( + options.mbuf_pool, + PKT_TYPE_FIN_ACK, + &options.s_master_cspec, + &tx_buf, + &pkt_hdr) != 0) { + rte_exit(EXIT_FAILURE, + "failed to allocate pkt hdr\n"); + } + + auto pld_qps = + (struct pkt_payload_qps *) + pkt_hdr->payload; + pld_qps->qps = rte_cpu_to_be_32( + qps); + + const uint16_t nb_tx = + rte_eth_tx_burst( + options.s_portid, + tinfo->txqid, &tx_buf, + 1); + + if (nb_tx != 1) { + rte_exit(EXIT_FAILURE, + "failed to send packet\n"); + } + + ntr(NTR_DEP_USER1, + NTR_LEVEL_DEBUG, + "pkt_loop : sent FIN_ACK to cat. 
QPS = %d.\n", + tinfo->id, qps); + } else { + ntr(NTR_DEP_USER1, + NTR_LEVEL_WARNING, + "pkt_loop : invalid FIN packet from a different cat.\n", + tinfo->id); + } + break; + default: + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "pkt_loop: ignoring packet %p with unknown type %d.\n", + (void *)rx_bufs[i], type); + } + + rte_pktmbuf_free(rx_bufs[i]); + } + } + + if (now >= next_ts && tinfo->epoch_recv.load()) { + struct pkt_payload_load *pld_load; + struct pkt_hdr *pkt_data; + next_ts += (int)(tinfo->ia_gen->generate() * S2NS); + + // change dst port for every packet for RSS + srv_cspec.dst_port = dst_port_gen.next(); + srv_cspec.src_port = src_port_gen.next(); + if (alloc_pkt_hdr(options.mbuf_pool, PKT_TYPE_LOAD, + &srv_cspec, &tx_buf, &pkt_data) != 0) { + rte_exit(EXIT_FAILURE, + "failed to allocate pkt hdr\n"); + } + + // pre-increment the epoch + uint32_t epoch = tinfo->cur_epoch.fetch_add(1) + 1; + pld_load = (struct pkt_payload_load *)pkt_data->payload; + pld_load->load = rte_cpu_to_be_32( + tinfo->load_gen->generate()); + pld_load->epoch = rte_cpu_to_be_32( + epoch_mk(tinfo->id, epoch)); + tinfo->epoch_recv.store(false); + last_ts = now; + + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "pkt_loop : sending packet %p with epoch %d\n", + tinfo->id, (void *)tx_buf, epoch); + + const uint16_t nb_tx = rte_eth_tx_burst( + options.s_portid, tinfo->txqid, &tx_buf, 1); + + if (nb_tx != 1) { + rte_exit( + EXIT_FAILURE, "failed to send packet\n"); + } + } + + if (!tinfo->epoch_recv.load()) { + // if we haven't received the packet, get read to rage + // quit + if (now - last_ts > options.rage_quit_time * MS2NS) { + rte_exit(EXIT_FAILURE, + "waiting too long for resp. 
I QUIT!!\n"); + } + } + } + + ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, + "pkt_loop : exiting loop...\n", tinfo->id); +} + +static int +locore_main(void *tif) +{ + auto tinfo = (struct thread_info *)tif; + uint32_t core_id = rte_lcore_id(); + + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + "locore_main : running on core %d...\n", tinfo->id, + core_id); + + if (rte_eth_dev_socket_id(options.s_portid) > 0 && + rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) { + ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, + "locore_main : WARNING, port %d is on remote NUMA node to " + "polling thread.\n\tPerformance will " + "not be optimal.\n", + tinfo->id, options.s_portid); + } + + if (options.slave_mode == 1) { + // perform rat protocol + proto_loop(tinfo); + } + + // wait for the primary thread sending SYNC_ACK + while (options.s_state.load() != STATE_RUNNING) { + } + // store the current timestamp + options.s_ts_begin.store(nm_get_uptime_ns()); + pkt_loop(tinfo); + + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main : exited\n", + tinfo->id); + + return 0; +} + +static int +port_init(uint16_t portid, struct rte_mempool *mbuf_pool) +{ + struct rte_eth_dev_info dev_info { + }; + struct rte_eth_conf port_conf = port_conf_default; + struct rte_eth_txconf txconf { + }; + struct rte_eth_rxconf rxconf { + }; + + uint16_t nb_rxd = RX_RING_SIZE; + uint16_t nb_txd = TX_RING_SIZE; + + if (!rte_eth_dev_is_valid_port(portid)) { + return -1; + } + + int ret = rte_eth_dev_info_get(portid, &dev_info); + if (ret != 0) { + return ret; + } + + port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN; + port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; + port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_NONFRAG_IPV4_UDP | + ETH_RSS_L2_PAYLOAD | ETH_RSS_NONFRAG_IPV4_TCP; + port_conf.rx_adv_conf.rss_conf.rss_key = nullptr; + port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM; + port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM; + port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH; + port_conf.txmode.offloads |= 
DEV_TX_OFFLOAD_UDP_CKSUM; + port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM; + port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE; + + /* Configure the Ethernet device. */ + ret = rte_eth_dev_configure( + portid, options.s_num_threads, options.s_num_threads, &port_conf); + if (ret != 0) + return ret; + + ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd); + if (ret != 0) + return ret; + + /* Allocate and set up 1 RX queue per thread . */ + rxconf = dev_info.default_rxconf; + rxconf.offloads = port_conf.rxmode.offloads; + for (uint32_t i = 0; i < options.s_num_threads; i++) { + ret = rte_eth_rx_queue_setup(portid, + options.s_thr_info.at(i)->rxqid, nb_rxd, + rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool); + if (ret < 0) + return ret; + } + + txconf = dev_info.default_txconf; + txconf.offloads = port_conf.txmode.offloads; + /* Allocate and set up 1 TX queue per Ethernet port. */ + for (uint32_t i = 0; i < options.s_num_threads; i++) { + ret = rte_eth_tx_queue_setup(portid, + options.s_thr_info.at(i)->txqid, nb_txd, + rte_eth_dev_socket_id(portid), &txconf); + if (ret < 0) + return ret; + } + + ret = rte_eth_dev_start(portid); + if (ret < 0) + return ret; + + /* Display the port MAC address. 
*/ + struct rte_ether_addr addr { + }; + ret = rte_eth_macaddr_get(portid, &addr); + if (ret != 0) + return ret; + + // no promiscuous mode required + + return 0; +} + +static void +dump_options() +{ + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + "Configuration:\n" + " verbosity = +%d\n" + " run time = %d\n" + " num threads = %d\n" + " rage quit time = %ld\n" + " cpu mask = 0x%lx\n" + " slave mode = %d\n" + " interarrival dist = %s\n" + " load dist = %s\n" + " qps = %d\n" + " host IP = 0x%x\n", + ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.run_time, + options.s_num_threads, options.rage_quit_time, options.cpu_mask, + options.slave_mode, options.ia_gen, options.ld_gen, + options.target_qps, options.s_host_spec.ip); +} + +static void +usage() +{ + fprintf(stdout, + "Usage:\n" + " -v(vv): verbose mode\n" + " -h: display the information\n" + " -t: run time\n" + " -s: server net spec\n" + " -S: slave(rat) mode\n" + " -A: affinity mask\n" + " -i: inter-arrival time distribution\n" + " -l: load distribution\n" + " -r: rage quit time (in ms)\n" + " -q: target QPS\n" + " -H: host net spec\n"); +} + +int +main(int argc, char *argv[]) +{ + unsigned int nb_ports; + struct rte_mempool *mbuf_pool; + struct thread_info *tinfo; + bool has_host_spec = false; + + ntr_init(); + + // init dpdk + int ret = rte_eal_init(argc, argv); + if (ret < 0) { + rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n"); + } + + argc -= ret; + argv += ret; + + // set warning level + ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING); + { + int c; + // parse arguments + while ((c = getopt(argc, argv, "vht:s:SA:i:l:r:q:H:")) != -1) { + switch (c) { + case 'v': + ntr_set_level(NTR_DEP_USER1, + ntr_get_level(NTR_DEP_USER1) + 1); + break; + case 'h': + usage(); + rte_exit(EXIT_SUCCESS, "\n"); + case 't': + options.run_time = strtol(optarg, nullptr, 10); + break; + case 's': + if (str_to_netspec( + optarg, &options.server_spec) != 0) { + rte_exit(EXIT_FAILURE, + "invalid server net spec\n"); + } + break; + 
case 'S': + options.slave_mode = 1; + options.s_state = + STATE_SYNC; // set state to wait for SYNC + break; + case 'A': + options.cpu_mask = strtoull( + optarg, nullptr, 16); + options.s_num_threads = cmask_get_num_cpus( + options.cpu_mask); + if (options.s_num_threads == 0) { + rte_exit(EXIT_FAILURE, + "invalid cpu mask 0x%lx\n", + options.cpu_mask); + } + break; + case 'i': + strncpy(options.ia_gen, optarg, + sizeof(options.ia_gen) - 1); + break; + case 'l': + strncpy(options.ld_gen, optarg, + sizeof(options.ld_gen) - 1); + break; + case 'r': + options.rage_quit_time = strtol( + optarg, nullptr, 10); + break; + case 'q': + options.target_qps = strtol( + optarg, nullptr, 10); + break; + case 'H': + has_host_spec = true; + if (str_to_netspec( + optarg, &options.s_host_spec) != 0) { + rte_exit(EXIT_FAILURE, + "invalid host net spec.\n"); + } + break; + default: + usage(); + rte_exit( + EXIT_FAILURE, "unknown argument: %c\n", c); + } + } + } + + if (!has_host_spec) { + rte_exit(EXIT_FAILURE, "Must specify host IP.\n"); + } + + dump_options(); + + // init nm + if (nm_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) { + rte_exit(EXIT_FAILURE, "nm init failed!\n"); + } + + nb_ports = rte_eth_dev_count_avail(); + if (nb_ports == 0) { + rte_exit(EXIT_FAILURE, "number of ports must be > 0\n"); + } + + uint16_t portid = rte_eth_find_next(0); + if (portid == RTE_MAX_ETHPORTS) { + rte_exit(EXIT_FAILURE, "cannot find an available port\n"); + } + options.s_portid = portid; + + if (rte_eth_macaddr_get(portid, &options.s_host_spec.mac_addr) != 0) { + rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n", + portid); + } + + // create a mbuf memory pool on the socket + mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT, + MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, + rte_eth_dev_socket_id(options.s_portid)); + if (mbuf_pool == nullptr) { + rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n"); + } + options.mbuf_pool = mbuf_pool; + + uint64_t cmask 
= options.cpu_mask; + for (unsigned int i = 0; i < options.s_num_threads; i++) { + tinfo = new thread_info; + tinfo->ia_gen = createGenerator(options.ia_gen); + tinfo->load_gen = createGenerator(options.ld_gen); + if (tinfo->ia_gen == nullptr || tinfo->load_gen == nullptr) { + rte_exit( + EXIT_FAILURE, "invalid ia_gen or ld_gen string\n"); + } + tinfo->ia_gen->set_lambda((double)options.target_qps / + (double)(options.s_num_threads)); + tinfo->id = i; + tinfo->lcore_id = cmask_get_next_cpu(&cmask); + tinfo->rxqid = i; + tinfo->txqid = i; + options.s_thr_info.push_back(tinfo); + } + + if (port_init(portid, mbuf_pool) != 0) { + rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid); + } + + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid, + options.s_host_spec.mac_addr.addr_bytes[0], + options.s_host_spec.mac_addr.addr_bytes[1], + options.s_host_spec.mac_addr.addr_bytes[2], + options.s_host_spec.mac_addr.addr_bytes[3], + options.s_host_spec.mac_addr.addr_bytes[4], + options.s_host_spec.mac_addr.addr_bytes[5]); + + sleep(INIT_DELAY); + + for (unsigned int i = 0; i < options.s_num_threads; i++) { + tinfo = options.s_thr_info.at(i); + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + "main: launching thread %d on locore %d\n", tinfo->id, + tinfo->lcore_id); + if (rte_eal_remote_launch(locore_main, + (void *)options.s_thr_info.at(i), + tinfo->lcore_id) != 0) { + rte_exit(EXIT_FAILURE, + "failed to launch function on locore %d\n", + tinfo->lcore_id); + } + } + + // poor man's timer + uint32_t second = 0; + uint32_t qps = 0; + // this loop exit is signaled by SYNC_FIN in slave mode and by itself in + // non slave mode + while (options.s_state.load() != STATE_FIN) { + if (options.slave_mode != 1) { + if (second >= options.run_time) { + options.s_state.store(STATE_FIN); + qps = calc_qps(nm_get_uptime_ns()); + break; + } + usleep(1 * S2US); + second++; + } + } + + for (unsigned int i = 0; i < options.s_num_threads; i++) { + tinfo = 
options.s_thr_info.at(i); + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, + "main: waiting for locore %d...\n", tinfo->lcore_id); + if (rte_eal_wait_lcore(tinfo->lcore_id) != 0) { + rte_exit(EXIT_FAILURE, "failed to wait for locore %d\n", + tinfo->lcore_id); + } + } + + if (options.slave_mode != 1) { + ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: total QPS = %d\n", + qps); + } + + // clean up + rte_eth_dev_stop(portid); + + return 0; +} \ No newline at end of file diff --git a/scripts/compile.sh b/scripts/compile.sh new file mode 100755 index 0000000..93a3cd9 --- /dev/null +++ b/scripts/compile.sh @@ -0,0 +1,38 @@ +#!/bin/sh +test_dir="/numam.d" +root=".." +servers="skylake2.rcs.uwaterloo.ca skylake3.rcs.uwaterloo.ca skylake6.rcs.uwaterloo.ca" +rsync_flags="-vchr" +ssh_args="-o StrictHostKeyChecking=no -p77" + +user=$1 + +if [ -z $user ] +then + user=$(whoami) +fi + +echo "USER: $user" + +compile() { + # separate these functions because we might change kernel (reboot) without needing to recompile + echo "====================$1====================" + echo "Syncing directories..." + ssh $(echo $ssh_args $user@$1) "sudo mkdir -p $test_dir" + ssh $(echo $ssh_args $user@$1) "sudo chmod 777 $test_dir" + rsync $(echo $rsync_flags) -e 'ssh -p 77' $root/ $user@$1:$test_dir/ + echo "Compiling..." + ssh $(echo $ssh_args $user@$1) "sudo mkdir -p $test_dir/build; cd $test_dir/build; sudo rm -rf *; sudo cmake ../; sudo make clean all -j8" & + wait + echo "$1 Done." 
+ echo "" +} + +i=0 +for server in $servers +do + i=$(expr $i + 1) + compile "$server" & +done + +wait \ No newline at end of file diff --git a/scripts/histo.py b/scripts/histo.py new file mode 100644 index 0000000..baeb12d --- /dev/null +++ b/scripts/histo.py @@ -0,0 +1,105 @@ + +import pandas as pd +import matplotlib.pyplot as plt +import matplotlib.mlab as mlab +import numpy as np +import sys +import re +import os +import json +import getopt +import math +import concurrent.futures as CF +import libpar as par + +num_bins = 1000 +extra_pct = [] + +def saveplot(fp : str, data : [], title : str): + plt.hist(data, num_bins) + plt.xlabel("Delay") + plt.title(title) + plt.ylabel("Frequency") + f = plt.gcf() + f.set_size_inches(11.69, 8.27) + f.savefig(fp + "_" + title + "_" + ".png", dpi=160) + plt.clf() + print("Generated - " + fp + "_" + title + "_" + ".png") + +executor = CF.ProcessPoolExecutor(max_workers=int(os.cpu_count())) + +def clean_data(dat: []): + ret = [] + arr = np.array(dat) + cutoff = np.percentile(arr, 99) + for i in arr: + if i <= cutoff: + ret.append(i) + return ret + +def process_file(each_dir): + try: + print("Processing " + each_dir + " ...") + with open(each_dir, 'r') as f: + parser = par.khat_parser() + parser.parse(f.read()) + + sh = [] + ss = [] + ch = [] + cs = [] + for pt in parser.datapt: + sh.append(pt.s_htx - pt.s_hrx) + ss.append(pt.s_stx - pt.s_srx) + ch.append(pt.c_hrx - pt.c_htx) + cs.append(pt.c_srx - pt.c_stx) + + sh = clean_data(sh) + ss = clean_data(ss) + ch = clean_data(ch) + cs = clean_data(cs) + + saveplot(each_dir, sh, "server_hw_delay") + saveplot(each_dir, ss, "server_sw_delay") + saveplot(each_dir, ch, "client_hw_delay") + saveplot(each_dir, cs, "client_sw_delay") + + # output median, etc. 
+ with open(each_dir + "_" + "stats.txt", 'w') as f: + f.write("===================== SERVER HW ====================\n") + f.write(par.mutilate_data.build_mut_output(sh, [len(sh)])) + f.write("\n===================== SERVER SW ====================\n") + f.write(par.mutilate_data.build_mut_output(ss, [len(ss)])) + f.write("\n===================== CLIENT HW ====================\n") + f.write(par.mutilate_data.build_mut_output(ch, [len(ch)])) + f.write("\n===================== CLIENT SW ====================\n") + f.write(par.mutilate_data.build_mut_output(cs, [len(cs)])) + + except Exception: + print("Unexpected error:", sys.exc_info()) + +def process_dir(rootdir): + for subdir in os.listdir(rootdir): + each_dir = os.path.join(rootdir, subdir) + if os.path.isfile(each_dir): + if each_dir.endswith("sample.txt") or each_dir.endswith(".sample"): + process_file(each_dir) + else: + process_dir(each_dir) + +def main(): + datdir = None + options = getopt.getopt(sys.argv[1:], 'd:')[0] + + for opt, arg in options: + if opt in ('-d'): + datdir = arg + + if datdir == None: + raise Exception("Must specify -d parameter") + + process_dir(datdir) + executor.shutdown() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/libs/libpar.py b/scripts/libs/libpar.py new file mode 100644 index 0000000..169f5d2 --- /dev/null +++ b/scripts/libs/libpar.py @@ -0,0 +1,113 @@ +import json +import numpy as np + +class khat_parser: + class pt: + def __init__(self): + self.s_htx = 0 + self.s_hrx = 0 + self.s_stx = 0 + self.s_srx = 0 + self.c_htx = 0 + self.c_hrx = 0 + self.c_stx = 0 + self.c_srx = 0 + + def __init__(self): + self.datapt = [] + + def parse(self, output : str): + for line in output.splitlines(): + cells = line.split(',') + if len(cells) != 8: + raise Exception("Invalid line:" + line) + pt = self.pt() + pt.c_srx = int(cells[0]) + pt.c_stx = int(cells[1]) + pt.c_hrx = int(cells[2]) + pt.c_htx = int(cells[3]) + pt.s_srx = int(cells[4]) + pt.s_stx = 
# NOTE(review): this chunk of the patch was whitespace-mangled (newlines
# collapsed onto a few physical lines); below is the reconstructed,
# conventionally formatted content.  The hunk opens mid-way through a parser
# method of a class whose definition starts before this chunk; its truncated
# tail was (reconstructed — TODO confirm against the full patch):
#     ... = int(cells[5])
#     pt.s_hrx = int(cells[6])
#     pt.s_htx = int(cells[7])
#     self.datapt.append(pt)


class mutilate_data:
    """Parsed result of one mutilate benchmark run: latency stats + QPS."""

    def __init__(self):
        self.dat = {}  # latency percentile label ("avg", "99th", ...) -> value
        self.qps = 0   # aggregate throughput reported by mutilate

    def to_string(self):
        ret = "Throughput: " + str(self.qps) + "\n" + json.dumps(self.dat)
        return ret

    @staticmethod
    def parse_mut_output(output):
        """Parse mutilate's textual report into a mutilate_data.

        Handles both the 10-column "read" line (with 50th percentile) and the
        legacy 9-column format.  Raises Exception when either the "Total QPS"
        line or the "read" line cannot be parsed.
        """
        ret = mutilate_data()
        succ_qps = False
        succ_read = False
        table = [None, "avg", "std", "min", "5th", "10th", "50th", "90th", "95th", "99th"]
        table_legacy = [None, "avg", "std", "min", "5th", "10th", "90th", "95th", "99th"]
        for line in output.splitlines():
            if line.find("Total QPS") != -1:
                spl = line.split()
                if len(spl) == 7:
                    ret.qps = float(spl[3])
                    succ_qps = True
                else:
                    break
            elif line.find("read") != -1:
                spl = line.split()
                if len(spl) == 10:
                    for i in range(1, len(spl)):
                        ret.dat[table[i]] = float(spl[i])
                    succ_read = True
                elif len(spl) == 9:
                    for i in range(1, len(spl)):
                        ret.dat[table_legacy[i]] = float(spl[i])
                    succ_read = True
                else:
                    break

        if not (succ_qps and succ_read):
            raise Exception("Failed to parse data")

        return ret

    @staticmethod
    def parse_mut_sample(fn):
        """Read a two-column "<qps> <latency>" sample file; return (qps, lat) lists."""
        qps = []
        lat = []
        # fix: context manager so the handle is closed even when a malformed
        # line raises (the original leaked the file on the error path)
        with open(fn, "r") as f:
            for line in f:
                entry = line.split()
                if len(entry) != 2:
                    raise Exception("Unrecognized line: " + line)
                qps.append(float(entry[0]))
                lat.append(float(entry[1]))
        return qps, lat

    # generate mutilate output format
    @staticmethod
    def build_mut_output(lat_arr, qps_arr):
        """Render latency/qps sample arrays back into mutilate's report format."""
        output = '{0: <10}'.format('#type') + '{0: >10}'.format('avg') + '{0: >10}'.format('std') + \
            '{0: >10}'.format('min') + '{0: >10}'.format('5th') + '{0: >10}'.format('10th') + \
            '{0: >10}'.format('50th') + '{0: >10}'.format('90th') + '{0: >10}'.format('95th') + \
            '{0: >10}'.format('99th') + "\n"

        output += '{0: <10}'.format('read') + '{0: >10}'.format("{:.1f}".format(np.mean(lat_arr))) + ' ' + \
            '{0: >10}'.format("{:.1f}".format(np.std(lat_arr))) + ' ' + \
            '{0: >10}'.format("{:.1f}".format(np.min(lat_arr))) + ' ' + \
            '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 5))) + ' ' + \
            '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 10))) + ' ' + \
            '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 50))) + ' ' + \
            '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 90))) + ' ' + \
            '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 95))) + ' ' + \
            '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 99))) + ' ' + "\n"

        output += "\n" + "Total QPS = " + "{:.1f}".format(np.mean(qps_arr)) + " (0 / 0s)"

        return output


# ======================================================================
# new file: scripts/libs/libtc.py  (test-control helper library)
# ======================================================================

import subprocess as sp
import time
import select
import os
import pwd
import sys
import datetime
import random
import re
from threading import Thread

# Open log file for the current run; None until init() is called.
tc_logfile = None


def log_print(info):
    """Print to stdout and, when initialized, append to the run's log file."""
    print(info)
    if tc_logfile is not None:
        tc_logfile.write(info + "\n")
        tc_logfile.flush()


tc_output_dir = ""
tc_cur_test = ""
tc_test_id = 0


def init(odir="./results.d/"):
    """Create a timestamped output directory under odir and open log.txt in it."""
    global tc_output_dir
    tc_output_dir = odir + "_" + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    tc_output_dir = os.path.expanduser(tc_output_dir)
    os.system("mkdir -p " + tc_output_dir)
    global tc_logfile
    tc_logfile = open(tc_output_dir + "/log.txt", "w+")


def begin(name):
    """Start a named test: bump the test counter and create its output dir."""
    global tc_test_id
    global tc_cur_test
    tc_cur_test = name
    tc_test_id += 1
    os.system("mkdir -p " + get_odir())
    log_print("\n===== Test #" + str(tc_test_id) + " - " + tc_cur_test + " started =====")


def end():
    """Mark the current test as completed."""
    global tc_cur_test
    log_print("\n===== Test #" + str(tc_test_id) + " - " + tc_cur_test + " completed =====")
    tc_cur_test = None


def get_odir():
    """Output directory of the currently running test."""
    return tc_output_dir + "/" + tc_cur_test


# Scheduler selection / feature flag constants (match the kernel-side ABI).
SCHED_QUEUE = 1
SCHED_CPU = 2
SCHED_BEST = 4
SCHED_FEAT_WS = 1


def make_sched_flag(sched, args, feat=0, fargs=0):
    """Pack scheduler id, args, feature and feature-args into one 32-bit flag."""
    return (sched & 0xFF) | (args & 0xFF) << 8 | (feat & 0xFF) << 16 | (fargs & 0xFF) << 24


TUNE_RTSHARE = 2
TUNE_TFREQ = 1


def make_tune_flag(obj, val):
    """Pack a tuning object id and its value into one 32-bit flag."""
    return (obj & 0xFFFF) | (val & 0xFFFF) << 16


def get_username():
    """Local username of the invoking user."""
    return pwd.getpwuid(os.getuid())[0]


ssh_param = ""


def set_ssh_param(para):
    """Set extra ssh command-line options used by remote_exec()."""
    global ssh_param
    ssh_param = para


ssh_user = None


def set_ssh_user(user):
    """Set the remote ssh user (None = current user)."""
    global ssh_user
    ssh_user = user


def remote_exec(srv, cmd, blocking=True, check=True):
    """Run cmd on every host in srv via ssh.

    Returns the list of Popen objects.  When blocking, waits for all of them
    and (when check) raises if any exits non-zero.
    """
    sub = []
    for s in srv:
        p = sp.Popen(["ssh " + ssh_param + " " +
                      ((ssh_user + "@") if ssh_user is not None else "") +
                      s + " \"" + cmd + "\""],
                     shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
        sub.append(p)

    if blocking:
        for p in sub:
            p.wait()
            if check and p.returncode != 0:
                raise Exception("Command failed " + cmd)

    return sub


def scan_stderr(p, exclude=None):
    """Scan a process's stderr; return False on the first line that does not
    match any regex in exclude (an empty line is ignored)."""
    for err in p.stderr:
        fail = True
        err = err.decode()
        err = err.strip()

        if len(err) == 0:
            continue

        if exclude is not None:
            for exc in exclude:
                if (exc is not None) and (re.match(exc, err) is not None):
                    fail = False
                    break

        if fail:
            log_print("Error detected: " + err)
            return False

    return True


# stderr monitor threads: one per watched subprocess
errthr_objs = []
errthr_sigstop = False
errthr_failed = False


def errthr_get_failed():
    """True when any monitor thread has seen an unexcluded stderr line."""
    return errthr_failed


def thr_check_stderr(p: sp.Popen, exclude):
    """Thread body: poll p's stderr until errthr_stop() is signalled."""
    global errthr_failed
    while not errthr_sigstop:
        if not scan_stderr(p, exclude=exclude):
            errthr_failed = True
        # jittered sleep so monitors don't wake in lock-step
        time.sleep(0.5 + random.uniform(-0.1, 0.1))


def errthr_start():
    """Reset failure state and start all created monitor threads."""
    global errthr_sigstop
    global errthr_failed
    errthr_sigstop = False
    errthr_failed = False
    for thr in errthr_objs:
        thr.start()


def errthr_create(cp, exclude=None):
    """Create (but don't start) one monitor thread per process in cp."""
    global errthr_objs
    for p in cp:
        errthr_objs.append(Thread(target=thr_check_stderr, args=(p, exclude)))


def errthr_stop():
    """Signal all monitor threads to exit, join them, and clear the list."""
    global errthr_objs
    global errthr_sigstop
    errthr_sigstop = True
    for thr in errthr_objs:
        thr.join()
    errthr_objs.clear()


def parse_hostfile(fp):
    """Parse 'alias hostname' lines into a dict alias -> hostname."""
    ret = {}
    # fix: context manager instead of manual open/close
    with open(fp, "r") as fh:
        content = fh.readlines()
    content = [x.strip() for x in content]
    for line in content:
        spl = line.split(" ")
        if len(spl) >= 2:
            ret[spl[0]] = spl[1]
            log_print("Parsed: hostname \"" + spl[0] + "\" -> \"" + spl[1] + "\"")
    return ret


def process_hostnames(names, hosts):
    """Map each name through the hostfile dict; unknown names pass through."""
    ret = []
    for line in names:
        if line in hosts:
            ret.append(hosts[line])
        else:
            ret.append(line)
    return ret


def get_cpuset_core(threads):
    """cpuset(1) prefix pinning to cores 0 .. threads*2-1 (assumes 2-way SMT
    numbering — TODO confirm on target machines)."""
    ret = "cpuset -l 0-" + str(threads * 2 - 1) + " "
    return ret


# ======================================================================
# new file: scripts/run.py  (experiment driver)
# ======================================================================

import subprocess as sp
import time
import select
import os
import datetime
import pwd
import sys
import getopt
import numpy as np
import re

import libpar as par
import libtc as tc

# load-stepping parameters (currently unused by the MAX-load experiment below)
step_inc_pct = 100
init_step = 20000
start_step = 10000
term_qps = 85000000000

term_pct = 1
inc_pct = 50
server_port = 23444

# paths
test_dir = "/numam.d/build"
file_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.join(file_dir, "..")
sample_filename = "sample.txt"

# CPU affinity masks swept by the experiment
affinity = [
    "0x4",            # core 2
    "0x400",          # core 10
    "0x100000",       # core 20
    "0x1000000",      # core 24
    "0x40000000",     # core 30
    "0x10000000000",  # core 40
]

master = ["skylake3.rcs.uwaterloo.ca"]
master_mac = ["3c:15:fb:c9:f3:4b"]

server = ["skylake2.rcs.uwaterloo.ca"]
server_mac = ["3c:15:fb:c9:f3:36"]

clients = []
client_mac = []

rage_quit = 1000  # 1s
warmup = 5
duration = 25
cooldown = 0
cacheline = 0
SSH_PARAM = "-o StrictHostKeyChecking=no -p77"
SSH_USER = "oscar"

hostfile = None
lockstat = False
client_only = False


def stop_all():
    """Kill any leftover benchmark processes on clients, server and master."""
    # stop clients
    tc.log_print("Stopping clients...")
    tc.remote_exec(clients, "sudo killall -9 rat", check=False)

    if not client_only:
        # stop server
        tc.log_print("Stopping server...")
        tc.remote_exec(server, "sudo killall -9 khat", check=False)

        # stop master
        tc.log_print("Stopping master...")
        tc.remote_exec(master, "sudo killall -9 cat", check=False)


def get_client_str(clt):
    """Build the repeated '-a <client>' argument string for the master."""
    ret = " "
    for client in clt:
        ret += " -a " + client + " "
    return ret


def run_exp(sc, ld):
    """Run one experiment (affinity mask sc, load ld) and retry until the
    master exits cleanly before the warmup+duration timeout."""
    while True:
        if client_only:
            ssrv = None
        else:
            # start server
            tc.log_print("Starting server...")
            server_cmd = "sudo " + test_dir + "/khat -- -A " + sc
            tc.log_print(server_cmd)
            ssrv = tc.remote_exec(server, server_cmd, blocking=False)

        time.sleep(3)

        # start master
        tc.log_print("Starting master...")
        master_cmd = "sudo " + test_dir + "/cat -- " + \
            " -s " + server_mac[0] + \
            " -o " + test_dir + "/" + sample_filename + \
            " -t " + str(duration) + \
            " -T " + str(warmup) + \
            " -i fixed:0.001" + \
            " -r " + str(rage_quit) + \
            " -A 0x4"
        tc.log_print(master_cmd)
        # NOTE: local name shadows the subprocess-module alias 'sp' inside
        # this function only; renamed for clarity
        smst = tc.remote_exec(master, master_cmd, blocking=False)
        p = smst[0]

        # launch stderr monitoring threads (DPDK EAL chatter is expected)
        tc.errthr_create(smst, exclude=[".*EAL.*"])
        tc.errthr_create(ssrv, exclude=[".*EAL.*"])
        tc.errthr_start()
        success = False
        cur = 0
        while True:
            # either failed or timeout
            # we use failure detection to save time for long durations
            if tc.errthr_get_failed() or cur >= int(warmup + duration) + 5:
                break

            if p.poll() is not None:
                success = True
                break

            time.sleep(1)
            cur = cur + 1

        stop_all()
        tc.errthr_stop()
        print("Cooling down...")
        time.sleep(cooldown)

        if success:
            return


def keep_results():
    """Copy the master's sample file into the test output dir and log its size."""
    scpcmd = "scp -P77 oscar@" + master[0] + ":" + test_dir + "/" + \
        sample_filename + " " + tc.get_odir() + "/sample.txt"
    tc.log_print(scpcmd)
    sp.check_call(scpcmd, shell=True)

    with open(tc.get_odir() + "/sample.txt", 'r') as f:
        tc.log_print("Total requests: " + str(len(f.readlines())))

    return


def main():
    """Parse options, resolve hostnames, then sweep all affinity masks at MAX load.

    Options: -h <hostfile>, -s (stop everything and exit), -c (client only).
    """
    global hostfile
    global server
    global master
    global clients
    global client_only

    tc.set_ssh_param(SSH_PARAM)
    tc.set_ssh_user(SSH_USER)

    options = getopt.getopt(sys.argv[1:], 'h:sldcp')[0]
    for opt, arg in options:
        if opt in ('-h'):
            hostfile = arg
        elif opt in ('-s'):
            stop_all()
            return
        elif opt in ('-c'):
            client_only = True

    tc.init("~/results.d/numam/")

    tc.log_print("Configuration:\n" +
                 "Hostfile: " + ("None" if hostfile is None else hostfile) + "\n"
                 "Client only: " + str(client_only) + "\n")

    if hostfile is not None:
        hosts = tc.parse_hostfile(hostfile)
        server = tc.process_hostnames(server, hosts)
        clients = tc.process_hostnames(clients, hosts)
        master = tc.process_hostnames(master, hosts)

    stop_all()

    for i in range(0, len(affinity)):
        eaff = affinity[i]

        tc.begin(eaff)

        tc.log_print("============ Affinity: " + str(eaff) + " Load: MAX" + " ============")
        run_exp(eaff, 0)
        keep_results()
        stop_all()
        # (an adaptive load-stepping loop using step_inc_pct/init_step/term_qps
        #  existed here in commented-out form; removed as dead code)

        tc.end()

    stop_all()


# fix: guard the entry point so importing this module doesn't launch ssh jobs
if __name__ == "__main__":
    main()