change run.py defaults

+histogram
fix redundant global
2021-02-02 00:46:18 -05:00 · 2021-02-02 00:43:45 -05:00 · 2021-02-01 23:49:30 -05:00 · 2021-02-01 23:48:14 -05:00 · 2021-01-31 02:50:58 -05:00 · 2021-01-28 05:24:59 -05:00
23 changed files with 2959 additions and 390 deletions
--- a/.arcconfig
+++ b/.arcconfig
@ -0,0 +1,3 @@
+{
+  "phabricator.uri" : "https://review.rcs.uwaterloo.ca/"
+}
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,271 @@
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+################ C STUFF ##########################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+
+# Prerequisites
+*.d
+
+# Object files
+*.o
+*.ko
+*.obj
+*.elf
+
+# Linker output
+*.ilk
+*.map
+*.exp
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Libraries
+*.lib
+*.a
+*.la
+*.lo
+
+# Shared objects (inc. Windows DLLs)
+*.dll
+*.so
+*.so.*
+*.dylib
+
+# Executables
+*.exe
+*.out
+*.app
+*.i*86
+*.x86_64
+*.hex
+
+# Debug files
+*.dSYM/
+*.su
+*.idb
+*.pdb
+
+# Kernel Module Compile Results
+*.mod*
+*.cmd
+.tmp_versions/
+modules.order
+Module.symvers
+Mkfile.old
+dkms.conf
+
+
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+################ PYTHON STUFF ##########################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+################ C++ STUFF ##########################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+# Prerequisites
+*.d
+
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+*.smod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
--- a/.pyenv
+++ b/.pyenv
@ -0,0 +1 @@
+PYTHONPATH="./scripts/libs"
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -10,24 +10,49 @@ project(khat)

 list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}")
 find_package(dpdk REQUIRED)
+find_package(Hwloc REQUIRED)

 set(CC_FLAGS -O2 -g -Wall -Wextra -Werror -std=c++11 
        -Wno-deprecated-declarations 
        -Wno-packed-not-aligned
        -Wno-address-of-packed-member
-        -msse4)
+        -Wno-zero-length-array
+        -Wno-gnu-zero-variadic-macro-arguments
+        -msse4
+        -mavx)
+

 include_directories(${CMAKE_SOURCE_DIR}/inc)
 include_directories(${dpdk_INCLUDE_DIRS})
+include_directories(${Hwloc_INCLUDE_DIRS})
+
+set(LIBNM_CC_FLAGS -O2 -g -Wall -Wextra -Werror -std=c++11)
+set(LIBNTR_C_FLAGS -O2 -g -Wall -Wextra -Werror -std=c11)
+set(LIBGEN_CC_FLAGS -O2 -g -Wall -Wextra -Werror -std=c++11)
+
+set(KHAT_LINKLIBS pthread nm ntr)
+set(CAT_LINKLIBS pthread nm ntr gen)
+set(RAT_LINKLIBS pthread nm ntr gen)
+
+add_library(nm libnm/nm.cc)
+target_link_libraries(nm ${Hwloc_LIBRARIES})
+target_compile_options(nm PRIVATE ${LIBNM_CC_FLAGS})
+
+add_library(ntr libntr/ntr.c)
+target_compile_options(ntr PRIVATE ${LIBNTR_C_FLAGS})
+
+add_library(gen libgen/generator.cc)
+target_link_libraries(gen ${Hwloc_LIBRARIES})
+target_compile_options(gen PRIVATE ${LIBGEN_CC_FLAGS})

 add_executable(khat khat/khat.cc)
-add_executable(cat cat/cat.cc)
-
-set(LINK_LIBS ${dpdk_LIBRARIES} pthread)
-
-target_link_libraries(khat ${LINK_LIBS})
+target_link_libraries(khat ${dpdk_LIBRARIES} ${KHAT_LINKLIBS})
 target_compile_options(khat PRIVATE ${CC_FLAGS})

-target_link_libraries(cat ${LINK_LIBS})
+add_executable(cat cat/cat.cc)
+target_link_libraries(cat ${dpdk_LIBRARIES} ${CAT_LINKLIBS})
 target_compile_options(cat PRIVATE ${CC_FLAGS})

+add_executable(rat rat/rat.cc)
+target_link_libraries(rat ${dpdk_LIBRARIES} ${RAT_LINKLIBS})
+target_compile_options(rat PRIVATE ${CC_FLAGS})
--- a/FindHwloc.cmake
+++ b/FindHwloc.cmake
@ -0,0 +1,213 @@
+#.rst:
+# FindHwloc
+# ----------
+#
+# Try to find Portable Hardware Locality (hwloc) libraries.
+# http://www.open-mpi.org/software/hwloc
+#
+# You may declare HWLOC_ROOT environment variable to tell where
+# your hwloc library is installed. 
+#
+# Once done this will define::
+#
+#   Hwloc_FOUND            - True if hwloc was found
+#   Hwloc_INCLUDE_DIRS     - include directories for hwloc
+#   Hwloc_LIBRARIES        - link against these libraries to use hwloc
+#   Hwloc_VERSION          - version
+#   Hwloc_CFLAGS           - include directories as compiler flags
+#   Hwloc_LDLFAGS          - link paths and libs as compiler flags
+#
+
+#=============================================================================
+# Copyright 2014 Mikael Lepistö
+#
+# Distributed under the OSI-approved BSD License (the "License");
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+
+if(WIN32)
+  find_path(Hwloc_INCLUDE_DIR
+    NAMES
+      hwloc.h
+    PATHS
+      ENV "PROGRAMFILES(X86)"
+      ENV HWLOC_ROOT
+    PATH_SUFFIXES
+      include
+  )
+
+  find_library(Hwloc_LIBRARY
+    NAMES 
+      libhwloc.lib
+    PATHS
+      ENV "PROGRAMFILES(X86)"
+      ENV HWLOC_ROOT
+    PATH_SUFFIXES
+      lib
+  )
+
+  #
+  # Check if the found library can be used to linking 
+  #
+  SET (_TEST_SOURCE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/linktest.c")
+  FILE (WRITE "${_TEST_SOURCE}"
+    "
+    #include <hwloc.h>
+    int main()
+    { 
+      hwloc_topology_t topology;
+      int nbcores;
+      hwloc_topology_init(&topology);
+      hwloc_topology_load(topology);
+      nbcores = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE);
+      hwloc_topology_destroy(topology);
+      return 0;
+    }
+    "
+  )
+
+  TRY_COMPILE(_LINK_SUCCESS ${CMAKE_BINARY_DIR} "${_TEST_SOURCE}"
+    CMAKE_FLAGS
+    "-DINCLUDE_DIRECTORIES:STRING=${Hwloc_INCLUDE_DIR}"
+    CMAKE_FLAGS
+    "-DLINK_LIBRARIES:STRING=${Hwloc_LIBRARY}"
+  )
+
+  IF(NOT _LINK_SUCCESS)
+    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+      message(STATUS "You are building 64bit target.")
+    ELSE()
+      message(STATUS "You are building 32bit code. If you like to build x64 use e.g. -G 'Visual Studio 12 Win64' generator." )
+    ENDIF()
+    message(FATAL_ERROR "Library found, but linking test program failed.")
+  ENDIF()
+
+  #
+  # Resolve version if some compiled binary found...
+  #
+  find_program(HWLOC_INFO_EXECUTABLE
+    NAMES 
+      hwloc-info
+    PATHS
+      ENV HWLOC_ROOT 
+    PATH_SUFFIXES
+      bin
+  )
+  
+  if(HWLOC_INFO_EXECUTABLE)
+    execute_process(
+      COMMAND ${HWLOC_INFO_EXECUTABLE} "--version" 
+      OUTPUT_VARIABLE HWLOC_VERSION_LINE 
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    string(REGEX MATCH "([0-9]+.[0-9]+)$" 
+      Hwloc_VERSION "${HWLOC_VERSION_LINE}")
+    unset(HWLOC_VERSION_LINE)
+  endif()
+  
+  #
+  # All good
+  #
+
+  set(Hwloc_LIBRARIES ${Hwloc_LIBRARY})
+  set(Hwloc_INCLUDE_DIRS ${Hwloc_INCLUDE_DIR})
+
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(
+    Hwloc
+    FOUND_VAR Hwloc_FOUND
+    REQUIRED_VARS Hwloc_LIBRARY Hwloc_INCLUDE_DIR Hwloc_VERSION_PARSED Hwloc_VERSION_MAJOR Hwloc_VERSION_MINOR
+    VERSION_VAR Hwloc_VERSION)
+
+  mark_as_advanced(
+    Hwloc_INCLUDE_DIR
+    Hwloc_LIBRARY)
+
+  foreach(arg ${Hwloc_INCLUDE_DIRS})
+    set(Hwloc_CFLAGS "${Hwloc_CFLAGS} /I${arg}")
+  endforeach()
+
+  set(Hwloc_LDFLAGS "${Hwloc_LIBRARY}")
+
+else()
+
+  if(CMAKE_CROSSCOMPILING)
+
+  find_path(Hwloc_INCLUDE_DIRS
+    NAMES
+      hwloc.h
+    PATHS
+      ENV HWLOC_ROOT
+  )
+
+  find_library(Hwloc_LIBRARIES
+    NAMES
+      hwloc
+    PATHS
+      ENV HWLOC_ROOT
+  )
+
+  if(Hwloc_INCLUDE_DIRS AND Hwloc_LIBRARIES)
+    message(WARNING "HWLOC library found using find_library() - cannot determine version. Assuming 1.7.0")
+    set(Hwloc_FOUND 1)
+    set(Hwloc_VERSION "1.7.0")
+  endif()
+
+  else() # Find with pkgconfig for non-crosscompile builds
+
+  find_package(PkgConfig)
+
+  if(HWLOC_ROOT)
+    set(ENV{PKG_CONFIG_PATH} "${HWLOC_ROOT}/lib/pkgconfig")
+  else()
+    foreach(PREFIX ${CMAKE_PREFIX_PATH})
+      set(PKG_CONFIG_PATH "${PKG_CONFIG_PATH}:${PREFIX}/lib/pkgconfig")
+    endforeach()
+    set(ENV{PKG_CONFIG_PATH} "${PKG_CONFIG_PATH}:$ENV{PKG_CONFIG_PATH}")
+  endif()
+
+  if(hwloc_FIND_REQUIRED)
+    set(_hwloc_OPTS "REQUIRED")
+  elseif(hwloc_FIND_QUIETLY)
+    set(_hwloc_OPTS "QUIET")
+  else()
+    set(_hwloc_output 1)
+  endif()
+
+  if(hwloc_FIND_VERSION)
+    if(hwloc_FIND_VERSION_EXACT)
+      pkg_check_modules(Hwloc ${_hwloc_OPTS} hwloc=${hwloc_FIND_VERSION})
+    else()
+      pkg_check_modules(Hwloc ${_hwloc_OPTS} hwloc>=${hwloc_FIND_VERSION})
+    endif()
+  else()
+    pkg_check_modules(Hwloc ${_hwloc_OPTS} hwloc)
+  endif()
+
+  if(Hwloc_FOUND)
+    string(REPLACE "." ";" Hwloc_VERSION_PARSED "${Hwloc_VERSION}")
+    set(Hwloc_VERSION "${Hwloc_VERSION}" CACHE STRING "version of Hwloc as a list")
+    list(GET Hwloc_VERSION_PARSED 0 Hwloc_VERSION_MAJOR)
+    set(Hwloc_VERSION_MAJOR "${Hwloc_VERSION_MAJOR}" CACHE STRING "Major version of Hwloc")
+    list(GET Hwloc_VERSION_PARSED 1 Hwloc_VERSION_MINOR)
+    set(Hwloc_VERSION_MINOR "${Hwloc_VERSION_MINOR}" CACHE STRING "Minor version of Hwloc")
+
+    include(FindPackageHandleStandardArgs)
+    find_package_handle_standard_args(Hwloc DEFAULT_MSG Hwloc_LIBRARIES)
+
+    if(NOT ${Hwloc_VERSION} VERSION_LESS 1.7.0)
+      set(Hwloc_GL_FOUND 1)
+    endif()
+
+    if(_hwloc_output)
+      message(STATUS
+        "Found hwloc ${Hwloc_VERSION} in ${Hwloc_INCLUDE_DIRS}:${Hwloc_LIBRARIES}")
+    endif()
+  endif()
+
+  endif() # cross-compile else
+
+endif()
--- a/cat/cat.cc
+++ b/cat/cat.cc
@ -11,68 +11,103 @@
 #include <rte_ether.h>
 #include <rte_launch.h>
 #include <rte_log.h>
+#include <rte_byteorder.h>
+#include <rte_ip.h>
 #include <atomic>
 #include <vector>
 #include <fstream>
 #include <unistd.h>

-#include "ntrlog.h"
+#include "nm.h"
+#include "gen.h"
+#include "ntr.h"
 #include "pkt.h"
-#include "rte_byteorder.h"
-#include "rte_ip.h"
+#include "util.h"

-// init NTRLOG
-NTR_DECL_IMPL;
-
-constexpr unsigned int MBUF_MAX_COUNT = 8191;
-constexpr unsigned int MBUF_CACHE_SIZE = 250;
-constexpr unsigned int RX_RING_SIZE = 1024;
-constexpr unsigned int TX_RING_SIZE = 1024;
-constexpr unsigned int RX_RING_NUM = 1;
-constexpr unsigned int TX_RING_NUM = 1;
-constexpr unsigned int BURST_SIZE = 32;
+constexpr static unsigned int MBUF_MAX_COUNT = 16384;
+constexpr static unsigned int MBUF_CACHE_SIZE = 512;
+constexpr static unsigned int RX_RING_SIZE = 4096;
+constexpr static unsigned int TX_RING_SIZE = 4096;
+constexpr static unsigned int BURST_SIZE = 32;

 static const struct rte_eth_conf port_conf_default{};

-struct datapt{
-    uint64_t server_proc = 0;
-    uint64_t rtt = 0;
+struct datapt {
+    uint32_t epoch;
+    uint32_t valid;
+    uint64_t clt_hw_tx;
+    uint64_t clt_sw_tx;
+    uint64_t clt_hw_rx;
+    uint64_t clt_sw_rx;
+    uint64_t srv_hw_tx;
+    uint64_t srv_sw_tx;
+    uint64_t srv_hw_rx;
+    uint64_t srv_sw_rx;
 };

 struct options_t {
-    unsigned int run_time = 5;
-    unsigned int warmup_time = 0;
+    // parameters
+    unsigned int run_time{5};
+    unsigned int warmup_time{3};
    char output[256] = "output.txt";
+    char ia_gen_str[256] = "fixed:0.01";
    struct rte_ether_addr server_mac;
+    uint64_t cpu_mask{0x2}; // 2nd  core
+    std::vector<struct rte_ether_addr *> slaves;
+    unsigned long rage_quit_time = (unsigned long)-1;
+    unsigned long last_sent_ts = 0;
+
    // states
-    std::atomic<bool> s_stop {false};
-    std::atomic<bool> s_record {false};
-    std::vector<struct datapt *> s_stats;
-    struct rte_mempool * s_mbuf_pool;
-    uint16_t s_portid;
+    struct rte_mempool * mbuf_pool;
    struct rte_ether_addr s_host_mac;
+    uint16_t s_portid;
+    unsigned int s_rxqid;
+    unsigned int s_txqid;
+    unsigned int s_total_pkts{0};
+    Generator * s_iagen{nullptr};
+    std::vector<struct datapt *> s_data;
+    struct datapt * s_last_datapt{nullptr};
+    uint32_t s_epoch;
+    std::atomic<bool> s_stop {false};
+    std::atomic<uint32_t> s_record {0};
 };

-struct options_t options;
+static struct options_t options; 

 static uint16_t
-rx_calc_latency(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
+rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
        struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused, void *_ __rte_unused)
 {
-    // XXX: need to get the timestamp in every loop?
    uint64_t now = rte_rdtsc();
-    struct packet_data * pkt_data;
+    struct pkt_hdr * pkt_data;
+    struct timespec ts;
+    int ret;

    for (int i = 0; i < nb_pkts; i++) {
        pkt_data = check_valid_packet(pkts[i]);

        if (pkt_data == NULL) {
-            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "rx_calc_latency: ignoring invalid packet 0x%p.\n", (void*)pkts[i]);
+            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "rx_add_timestamp: ignoring invalid packet 0x%p.\n", (void*)pkts[i]);
            continue;  
        }

-        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "rx_calc_latency: tagged packet %p with %llu.\n", (void*)pkts[i], now); 
-        pkt_data->clt_ts_rx = rte_cpu_to_be_64(now);
+        if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE_RESP) {
+            uint32_t epoch = rte_be_to_cpu_32(((struct pkt_payload_epoch *)pkt_data->payload)->epoch);
+            if (options.s_last_datapt != nullptr && options.s_last_datapt->epoch == epoch) {           
+                if ((ret = rte_eth_timesync_read_rx_timestamp(port, &ts, pkts[i]->timesync & 0x3)) == 0) {
+                    // has hw rx timestamp
+                    options.s_last_datapt->clt_hw_rx = ts.tv_sec * S2NS + ts.tv_nsec;
+                    options.s_last_datapt->clt_sw_rx = now;
+                    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "rx_add_timestamp: tagged packet %p with sw: %llu hw: %llu.\n", (void*)pkts[i], now, options.s_last_datapt->clt_hw_rx); 
+                } else {
+                    rte_exit(EXIT_FAILURE, "rx_add_timestamp: packet %p not tagged - hw ts not available - %d.\n", (void*)pkts[i], ret); 
+                }
+            } else {
+                ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "rx_add_timestamp: packet %p epoch %d != last epoch %d.\n", (void*)pkts[i], epoch, options.s_last_datapt->epoch); 
+            }
+        } else {
+            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "rx_add_timestamp: packet %p not tagged - type %d.\n", (void*)pkts[i], rte_be_to_cpu_16(pkt_data->type));
+        }
    }

    return nb_pkts;
@ -82,9 +117,8 @@ static uint16_t
 tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
 		struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
 {
-    // XXX: need to get the timestamp in every loop?
    uint64_t now = rte_rdtsc();
-    struct packet_data * pkt_data;
+    struct pkt_hdr * pkt_data;

    for (int i = 0; i < nb_pkts; i++) {
        pkt_data = check_valid_packet(pkts[i]);
@ -94,26 +128,37 @@ tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
            continue;
        }

-        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "tx_add_timestamp: tagged packet %p with %llu.\n", (void*)pkts[i], now); 
-        pkt_data->clt_ts_tx = rte_cpu_to_be_64(now);
+        if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE) {
+            uint32_t epoch = rte_be_to_cpu_32(((struct pkt_payload_epoch *)pkt_data->payload)->epoch);
+
+            if (options.s_last_datapt == nullptr || epoch != options.s_last_datapt->epoch) {
+                rte_exit(EXIT_FAILURE, "tx_add_timestamp: packet epoch %d != last epoch %d\n", epoch, options.s_last_datapt->epoch);
+            }
+
+            options.s_last_datapt->clt_sw_tx = now;
+            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "tx_add_timestamp: tagged packet %p with sw: %llu.\n", (void*)pkts[i], now); 
+        } else {
+            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "tx_add_timestamp: packet %p not tagged - type %d.\n", (void*)pkts[i], pkt_data->type);
+        }
    }

    return nb_pkts;
 }

-#define STATE_SEND (0)
-#define STATE_RECV (1)
-
 static int
-locore_main(void * _unused __rte_unused)
+locore_main(void * tif __rte_unused)
 {
    struct rte_mbuf *tx_buf;
    struct rte_mbuf *rx_bufs[BURST_SIZE];
-    struct packet_data *pkt_data;
+    struct pkt_hdr *pkt_data;
    uint32_t core_id = rte_lcore_id();
-    uint32_t epoch = 0;
-    int state = STATE_SEND;
+    int32_t ret; 

+    bool read_tx = true;
+    bool recv_stat = true;
+    bool recv_resp = true;
+
+    uint64_t next_ts;
    // XXX: check link status instead

    sleep(1);
@ -125,70 +170,154 @@ locore_main(void * _unused __rte_unused)

 	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d running...\n", core_id);

-    tx_buf = rte_pktmbuf_alloc(options.s_mbuf_pool);
+    next_ts = get_time_us();
+
+    while(!options.s_stop.load()) {
+        uint64_t now = get_time_us();
+        // always pop incoming packets
+        const uint16_t nb_rx = rte_eth_rx_burst(options.s_portid, 0, rx_bufs, BURST_SIZE);
+
+        if (nb_rx > 0) {
+            for (int i = 0; i < nb_rx; i++) {
+                struct pkt_hdr * each = check_valid_packet(rx_bufs[i]);
+                
+                if (each == NULL) {
+                    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: ignoring invalid packet %p.\n", (void*)rx_bufs[i]);
+                    rte_pktmbuf_free(rx_bufs[i]);
+                    continue;
+                }
+
+                uint16_t type = rte_be_to_cpu_16(each->type);
+                ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: received packet %p type %d.\n", (void*)rx_bufs[i], type);
+                switch (type) {
+                    struct pkt_payload_epoch * pld_epoch;
+                    struct pkt_payload_stat * pld_stat;
+                    uint32_t epoch;
+
+                    case PKT_TYPE_PROBE_RESP:
+                        pld_epoch = (struct pkt_payload_epoch *)each->payload;
+                        epoch = rte_be_to_cpu_32(pld_epoch->epoch);
+
+                        if (options.s_last_datapt == nullptr || epoch != options.s_last_datapt->epoch) {
+                            ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "locore_main: packet %p epoch %d doesn't match datapt %d.\n", (void*)rx_bufs[i], epoch, options.s_last_datapt->epoch);
+                            break;
+                        }
+
+                        options.s_total_pkts++;
+
+                        recv_resp = true;
+                        break;
+                    case PKT_TYPE_STAT:
+                        pld_stat = (struct pkt_payload_stat *)each->payload;
+                        epoch = rte_be_to_cpu_32(pld_stat->epoch);
+
+                        if (options.s_last_datapt == nullptr || epoch != options.s_last_datapt->epoch) {
+                            ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "locore_main: packet %p epoch %d doesn't match datapt %d.\n", (void*)rx_bufs[i], epoch, options.s_last_datapt->epoch);
+                            break;
+                        }
+
+                        options.s_last_datapt->srv_hw_tx = rte_be_to_cpu_64(pld_stat->hw_tx);
+                        options.s_last_datapt->srv_hw_rx = rte_be_to_cpu_64(pld_stat->hw_rx);
+                        options.s_last_datapt->srv_sw_tx = rte_be_to_cpu_64(pld_stat->sw_tx);
+                        options.s_last_datapt->srv_sw_rx = rte_be_to_cpu_64(pld_stat->sw_rx);
+
+                        recv_stat = true;
+                        break;
+                    default:
+                        ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "locore_main: ignoring packet %p with unknown type %d.\n", (void*)rx_bufs[i], type);
+                        rte_pktmbuf_free(rx_bufs[i]);
+                        continue;
+                }
+
+                rte_pktmbuf_free(rx_bufs[i]);
+            }
+        }
+
+        if (read_tx && recv_stat & recv_resp) {
+            // if we have all the data
+
+            if (options.s_last_datapt != nullptr) {
+                // push the data to the queue if we haven't done so already
+                options.s_data.push_back(options.s_last_datapt);
+
+                ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: datapt for epoch %d dump:\n" \
+                                                    "                    Valid: %d\n"
+                                                    "                    client TX HW: %llu\n" \
+                                                    "                    client TX SW: %llu\n" \
+                                                    "                    client RX HW: %llu\n" \
+                                                    "                    client RX SW: %llu\n" \
+                                                    "                    server TX HW: %llu\n" \
+                                                    "                    server TX SW: %llu\n" \
+                                                    "                    server RX HW: %llu\n" \
+                                                    "                    server RX SW: %llu\n\n",
+                                                    options.s_last_datapt->epoch,
+                                                    options.s_last_datapt->valid,
+                                                    options.s_last_datapt->clt_hw_tx,
+                                                    options.s_last_datapt->clt_sw_tx,
+                                                    options.s_last_datapt->clt_hw_rx,
+                                                    options.s_last_datapt->clt_sw_rx,
+                                                    options.s_last_datapt->srv_hw_tx,
+                                                    options.s_last_datapt->srv_sw_tx,
+                                                    options.s_last_datapt->srv_hw_rx,
+                                                    options.s_last_datapt->srv_sw_rx);
+                options.s_last_datapt = nullptr;
+            }
+
+            if (now >= next_ts) {
+                struct pkt_payload_epoch * pld_epoch;
+                uint32_t epoch;
+
+                next_ts += (int)(options.s_iagen->generate() * 1000000.0);
+
+                // generate the packet
+                tx_buf = rte_pktmbuf_alloc(options.mbuf_pool);

                if (tx_buf == NULL) {
                    rte_exit(EXIT_FAILURE, "cannot allocate tx_buf\n");
                }

-    pkt_data = construct_udp_pkt_hdr(tx_buf, 
-                                    &options.s_host_mac, &options.server_mac, 
-                                    RTE_IPV4(192, 168, 100, 150), RTE_IPV4(192, 168, 100, 151), 
-                                    1337, 1337);
+                pkt_data = construct_pkt_hdr(tx_buf, PKT_TYPE_PROBE,
+                                                &options.s_host_mac, &options.server_mac);
                if (pkt_data == NULL) {
                    rte_exit(EXIT_FAILURE, "cannot allocate space for packet_data in mbuf\n");
                }
-    pkt_data->magic = rte_cpu_to_be_32(ETHER_FRAME_MAGIC);

-    while(!options.s_stop.load()) {
-        // always pop incoming packets
-        const uint16_t nb_rx = rte_eth_rx_burst(options.s_portid, 0, rx_bufs, BURST_SIZE);
+                epoch = options.s_epoch;
+                options.s_epoch++;
+                pld_epoch = (struct pkt_payload_epoch *)pkt_data->payload;
+                pld_epoch->epoch = rte_cpu_to_be_32(epoch);
+                options.s_last_datapt = new struct datapt;
+                options.s_last_datapt->epoch = epoch;
+                options.s_last_datapt->valid = options.s_record.load();

-        if (nb_rx != 0) {
-            // only process packets when we are ready to receive
-            for (int i = 0; i < nb_rx; i++) {
-                struct packet_data * each = check_valid_packet(rx_bufs[i]);
+                read_tx = false;
+                recv_resp = false;
+                recv_stat = false;
+                options.last_sent_ts = get_time_us();

-                if (each == NULL) {
-                    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: ignoring invalid packet %p.\n", (void*)rx_bufs[i]);
-                    dump_pkt(rx_bufs[i]);
-                    rte_pktmbuf_free(rx_bufs[i]);
-                    continue;
-                }
+                ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: sending packet %p with epoch %d\n", (void*)tx_buf, epoch);
+                const uint16_t nb_tx = rte_eth_tx_burst(options.s_portid, options.s_txqid, &tx_buf, 1);
                
-                if (rte_be_to_cpu_32(each->epoch) == epoch && state == STATE_RECV) {
-                   ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: received packet %p for epoch %d\n", (void*)rx_bufs[i], epoch);
-
-                    if (options.s_record.load()) {
-                        // keep statistics
-                        struct datapt * dpt = new datapt;
-                        dpt->rtt = rte_be_to_cpu_64(each->clt_ts_rx) - rte_be_to_cpu_64(each->clt_ts_tx);
-                        dpt->server_proc = rte_be_to_cpu_64(each->srv_ts_tx) - rte_be_to_cpu_64(each->srv_ts_rx);
-                        options.s_stats.push_back(dpt);
-                    }
-
-                    // bump the epoch and stop processing other packets
-                    state = STATE_SEND;
-                    epoch++;
-                } else {
-                    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: ignoring packet 0x%p with invalid epoch %d.\n", (void*)rx_bufs[i], epoch);
-                }
-
-                rte_pktmbuf_free(rx_bufs[i]);
-            }
-        }
-
-        if (state == STATE_SEND) {
-            // set new epoch
-            pkt_data->epoch = rte_cpu_to_be_32(epoch);
-            ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: sending packet %p with epoch %d\n", (void*)tx_buf, epoch);
-
-            const uint16_t nb_tx = rte_eth_tx_burst(options.s_portid, 0, &tx_buf, 1);
-
-            if (nb_tx < 1) {
+                if (nb_tx != 1) {
                    rte_exit(EXIT_FAILURE, "failed to send packet 0x%p, epoch %d\n", (void*)tx_buf, epoch);
                }
-            state = STATE_RECV;
+            }
+        }
+
+        if (!recv_stat) {
+            // if we haven't recevied the stats get ready to rage quit
+            if(get_time_us() - options.last_sent_ts > options.rage_quit_time * 1000) {
+                rte_exit(EXIT_FAILURE, "waiting too long for resp. I QUIT!!\n");
+            }
+        }
+
+        if (!read_tx) {
+            struct timespec ts;
+            if ((ret = rte_eth_timesync_read_tx_timestamp(options.s_portid, &ts)) == 0) {
+                ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: read hw tx timestamp %lld.\n", ts.tv_nsec + ts.tv_sec * S2NS);
+                options.s_last_datapt->clt_hw_tx = ts.tv_nsec + ts.tv_sec * S2NS;
+                read_tx = true;
+            }
        }
 	}
    
@ -228,7 +357,7 @@ port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;
    
    /* Configure the Ethernet device. */
-    ret = rte_eth_dev_configure(portid, RX_RING_NUM, TX_RING_NUM, &port_conf);
+    ret = rte_eth_dev_configure(portid, 1, 1, &port_conf);
 	if (ret != 0)
 		return ret;

@ -236,10 +365,10 @@ port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
 	if (ret != 0)
 		return ret;

-	/* Allocate and set up 1 RX queue per Ethernet port. */
+	/* Allocate and set up 1 RX queue per thread . */
    rxconf = dev_info.default_rxconf;
    rxconf.offloads = port_conf.rxmode.offloads;
-	for (uint32_t i = 0; i < RX_RING_NUM; i++) {
+	for (uint32_t i = 0; i < 1; i++) {
 		ret = rte_eth_rx_queue_setup(portid, i, nb_rxd, rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool);
 		if (ret < 0)
 			return ret;
@ -248,7 +377,7 @@ port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
    txconf = dev_info.default_txconf;
 	txconf.offloads = port_conf.txmode.offloads;
 	/* Allocate and set up 1 TX queue per Ethernet port. */
-	for (uint32_t i = 0; i < TX_RING_NUM; i++) {
+	for (uint32_t i = 0; i < 1; i++) {
 		ret = rte_eth_tx_queue_setup(portid, i, nb_txd, rte_eth_dev_socket_id(portid), &txconf);
 		if (ret < 0)
 			return ret;
@ -264,13 +393,17 @@ port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
    if (ret != 0)
        return ret;

+    ret = rte_eth_timesync_enable(portid);
+    if (ret != 0)
+        return ret;
+
    /* Enable RX in promiscuous mode for the Ethernet device. */
    ret = rte_eth_promiscuous_enable(portid);
 	if (ret != 0)
 		return ret;

    rte_eth_add_tx_callback(portid, 0, tx_add_timestamp, NULL);
-    rte_eth_add_rx_callback(portid, 0, rx_calc_latency, NULL);
+    rte_eth_add_rx_callback(portid, 0, rx_add_timestamp, NULL);

 	return 0;
 }
@ -281,10 +414,12 @@ static void dump_options()
            "    run time = %d\n" \
            "    warmup time = %d\n" \
            "    output file = %s\n" \
-            "    server MAC = %x:%x:%x:%x:%x:%x\n",
+            "    rage quit time = %ld\n"\
+            "    host MAC = %x:%x:%x:%x:%x:%x\n",
            options.run_time,
            options.warmup_time,
            options.output,
+            options.rage_quit_time,
            options.server_mac.addr_bytes[0],
            options.server_mac.addr_bytes[1],
            options.server_mac.addr_bytes[2],
@ -298,19 +433,31 @@ static void usage()
    fprintf(stdout, 
            "Usage:\n " \
            "    -v(vv): verbose mode\n" \
-            "    -h: display the information\n" \
-            "    -o: output filename\n" \
+            "    -s: server's mac\n" \
+            "    -S: slave(rat)'s mac\n" \
            "    -t: run time\n" \
            "    -T: warmup time\n" \
-            "    -s: server's mac\n\n" );
+            "    -h: display the information\n" \
+            "    -o: output filename\n" \
+            "    -A: affinity mask\n" \
+            "    -i: inter-arrival time distribution\n" \
+            "    -r: rage quit time (in ms)\n");
+    fflush(stdout);
 }

 int main(int argc, char* argv[])
 {
    unsigned int nb_ports;
-    struct rte_mempool *mbuf_pool, *mbuf_pool_pkt;
+    struct rte_mempool *mbuf_pool;
    std::ofstream log_file;

+    ntr_init();
+    if (nm_init() != 0)
+        rte_exit(EXIT_FAILURE, "failed to init libnm\n");
+    
+    // create default generator
+    options.s_iagen = createGenerator(options.ia_gen_str);
+
    // init dpdk
    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
@ -325,8 +472,9 @@ int main(int argc, char* argv[])
    {
        int c;
        // parse arguments
-        while((c = getopt(argc, argv, "hvo:t:T:s:")) != -1) {
+        while((c = getopt(argc, argv, "vs:S:t:T:ho:A:i:r:")) != -1) {
            switch (c) {
+                struct rte_ether_addr * addr;
                case 'v':
                    ntr_set_level(NTR_DEP_USER1, ntr_get_level(NTR_DEP_USER1) + 1);
                    break;
@ -335,6 +483,13 @@ int main(int argc, char* argv[])
                        rte_exit(EXIT_FAILURE, "cannot parse %s as mac address.\n", optarg);
                    }
                    break;
+                case 'S':
+                    addr = new struct rte_ether_addr;
+                    if (rte_ether_unformat_addr(optarg, addr) == -1) {
+                        rte_exit(EXIT_FAILURE, "cannot parse %s as mac address.\n", optarg);
+                    }
+                    options.slaves.push_back(addr);
+                    break;
                case 't':
                    options.run_time = atoi(optarg);
                    break;
@ -343,11 +498,26 @@ int main(int argc, char* argv[])
                    break;
                case 'h':
                    usage();
-                    rte_exit(EXIT_SUCCESS, NULL);
-                    break;
+                    rte_exit(EXIT_SUCCESS, "\n");
                case 'o':
                    strncpy(options.output, optarg, sizeof(options.output) - 1);
                    break;
+                case 'A':
+                    options.cpu_mask = strtoull(optarg, nullptr, 16);
+                    break;
+                case 'i':
+                    strncpy(options.ia_gen_str, optarg, sizeof(options.ia_gen_str) - 1);
+                    if (options.s_iagen != nullptr) {
+                        delete options.s_iagen;
+                    }
+                    options.s_iagen = createGenerator(options.ia_gen_str);
+                    if (options.s_iagen == nullptr) {
+                        rte_exit(EXIT_FAILURE, "invalid generator string %s\n", options.ia_gen_str);
+                    }
+                    break;
+                case 'r':
+                    options.rage_quit_time = atoi(optarg);
+                    break;
                default:
                    usage();
                    rte_exit(EXIT_FAILURE, "unknown argument: %c\n", c);
@ -367,24 +537,19 @@ int main(int argc, char* argv[])
        rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
    }

-    // create a mbuf memory pool on the socket
-    mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
-    if (mbuf_pool == nullptr) {
-        rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
-    }
-
-    mbuf_pool_pkt = rte_pktmbuf_pool_create("MBUF_POOL_PKT", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
-    if (mbuf_pool_pkt == nullptr) {
-        rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
-    }
-    options.s_mbuf_pool = mbuf_pool_pkt;
-
    uint16_t portid = rte_eth_find_next(0);
    if (portid == RTE_MAX_ETHPORTS) {
        rte_exit(EXIT_FAILURE, "cannot find an available port\n");
    }
    options.s_portid = portid;

+    // create a mbuf memory pool on the socket
+    mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_eth_dev_socket_id(options.s_portid));
+    if (mbuf_pool == nullptr) {
+        rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
+    }
+    options.mbuf_pool = mbuf_pool;
+
    if (port_init(portid, mbuf_pool) != 0) {
        rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid);
    }
@ -403,39 +568,51 @@ int main(int argc, char* argv[])

    dump_options();

-    uint16_t core_id = rte_get_next_lcore(0, true, false);
-    if (rte_eal_remote_launch(locore_main, NULL, core_id) != 0) {
+    sleep(1);
+
+    uint64_t cmask = options.cpu_mask;
+    const int16_t core_id = cmask_get_next_cpu(&cmask);
+    if (core_id == NEXT_CPU_NULL) {
+        rte_exit(EXIT_FAILURE, "invalid cpu mask 0x%lx\n", cmask);
+    }
+    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: launching thread on core %d\n", core_id);
+    if (rte_eal_remote_launch(locore_main, nullptr, core_id) != 0) {
        rte_exit(EXIT_FAILURE, "failed to launch function on locore\n");
    }

-    // poor man's timer
-    // XXX: use kqueue instead
-    struct timespec ts;
-    ts.tv_sec = 1;
-    ts.tv_nsec = 0;
+    // XXX: poor man's timer
    uint32_t second = 0;
    while(true) {
        if (second >= options.warmup_time) {
-            options.s_record.store(true);
+            options.s_record.store(1);
        }
        if (second >= options.run_time + options.warmup_time) {
            options.s_stop.store(true);
            break;
        }
-        clock_nanosleep(CLOCK_REALTIME, 0, &ts, NULL);
+        usleep(S2US);
        second++;
    }

    if (rte_eal_wait_lcore(core_id) < 0)
        rte_exit(EXIT_FAILURE, "failed to wait for job completion\n");

+
+    uint32_t qps = 0;
    // dump stats
-    for (auto it = std::begin(options.s_stats); it != std::end(options.s_stats); ++it) {
-        log_file << (*it)->rtt << "," << (*it)->server_proc << std::endl;
-        delete *it;
+    for (auto it : options.s_data) {
+        if (it->valid) {
+            qps++;
+            log_file << it->clt_sw_rx << ',' << it->clt_sw_tx << ','
+                        << it->clt_hw_rx << ',' << it->clt_hw_tx << ','
+                        << it->srv_sw_rx << ',' << it->srv_sw_tx << ','
+                        << it->srv_hw_rx << ',' << it->srv_hw_tx << std::endl;
+        }
    }
    log_file.close();

+    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "Processed %d packets in %d seconds, QPS: %d\n", qps, options.run_time, qps);
+
    // clean up
    rte_eth_dev_stop(portid);
    rte_eth_dev_close(portid);
--- a/compile_flags.txt
+++ b/compile_flags.txt
@ -2,8 +2,12 @@
 -O2
 -std=c++11
 -Wall
+-Wextra
 -Werror
-Wpedantic
 -I/usr/include/dpdk
 -Iinc
 -Wno-deprecated-declarations 
+-Wno-packed-not-aligned
+-Wno-address-of-packed-member
+-Wno-zero-length-array
+-Wno-gnu-zero-variadic-macro-arguments
--- a/inc/gen.h
+++ b/inc/gen.h
@ -0,0 +1,234 @@
+// modified from mutilate
+// -*- c++ -*-
+
+// 1. implement "fixed" generator
+// 2. implement discrete generator
+// 3. implement combine generator? 
+
+#pragma once
+
+#include <netinet/in.h>
+
+#include <string>
+#include <vector>
+#include <utility>
+
+#include <assert.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/param.h>
+
+#include "util.h"
+
+#define D(fmt, ...) 
+#define DIE(fmt, ...) (void)0;
+
+#define FNV_64_PRIME (0x100000001b3ULL)
+#define FNV1_64_INIT (0xcbf29ce484222325ULL)
+static inline uint64_t fnv_64_buf(const void* buf, size_t len) {
+    uint64_t hval = FNV1_64_INIT;
+
+    unsigned char *bp = (unsigned char *)buf;   /* start of buffer */
+    unsigned char *be = bp + len;               /* beyond end of buffer */
+
+    while (bp < be) {
+    hval ^= (uint64_t)*bp++;
+    hval *= FNV_64_PRIME;
+    }
+
+    return hval;
+}
+	
+static inline uint64_t fnv_64(uint64_t in) { return fnv_64_buf(&in, sizeof(in)); }
+
+
+// Generator syntax:
+//
+// \d+ == fixed
+// n[ormal]:mean,sd
+// e[xponential]:lambda
+// p[areto]:scale,shape
+// g[ev]:loc,scale,shape
+// fb_value, fb_key, fb_rate
+
+class Generator {
+public:
+  Generator() {}
+  //  Generator(const Generator &g) = delete;
+  //  virtual Generator& operator=(const Generator &g) = delete;
+  virtual ~Generator() {}
+
+  virtual double generate(double U = -1.0) = 0;
+  virtual void set_lambda(double) {DIE("set_lambda() not implemented");}
+protected:
+  std::string type;
+};
+
+class Fixed : public Generator {
+public:
+  Fixed(double _value = 1.0) : value(_value) { D("Fixed(%f)", value); }
+  virtual double generate(double) { return value; }
+  virtual void set_lambda(double lambda) {
+    if (lambda > 0.0) value = 1.0 / lambda;
+    else value = 0.0;
+  }
+
+private:
+  double value;
+};
+
+class Uniform : public Generator {
+public:
+  Uniform(double _scale) : scale(_scale) { D("Uniform(%f)", scale); }
+
+  virtual double generate(double U = -1.0) {
+    if (U < 0.0) U = drand48();
+    return scale * U;
+  }
+
+  virtual void set_lambda(double lambda) {
+    if (lambda > 0.0) scale = 2.0 / lambda;
+    else scale = 0.0;
+  }
+
+private:
+  double scale;
+};
+
+class Normal : public Generator {
+public:
+  Normal(double _mean = 1.0, double _sd = 1.0) : mean(_mean), sd(_sd) {
+    D("Normal(mean=%f, sd=%f)", mean, sd);
+  }
+
+  virtual double generate(double U = -1.0) {
+    if (U < 0.0) U = drand48();
+    double V = U; // drand48();
+    double N = sqrt(-2 * log(U)) * cos(2 * M_PI * V);
+    return mean + sd * N;
+  }
+
+  virtual void set_lambda(double lambda) {
+    if (lambda > 0.0) mean = 1.0 / lambda;
+    else mean = 0.0;
+  }
+
+private:
+  double mean, sd;
+};
+
+class Exponential : public Generator {
+public:
+  Exponential(double _lambda = 1.0) : lambda(_lambda) {
+    D("Exponential(lambda=%f)", lambda);
+  }
+
+  virtual double generate(double U = -1.0) {
+    if (lambda <= 0.0) return 0.0;
+    if (U < 0.0) U = drand48();
+    return -log(U) / lambda;
+  }
+
+  virtual void set_lambda(double lambda) { this->lambda = lambda; }
+
+private:
+  double lambda;
+};
+
+class GPareto : public Generator {
+public:
+  GPareto(double _loc = 0.0, double _scale = 1.0, double _shape = 1.0) :
+    loc(_loc), scale(_scale), shape(_shape) {
+    assert(shape != 0.0);
+    D("GPareto(loc=%f, scale=%f, shape=%f)", loc, scale, shape);
+  }
+
+  virtual double generate(double U = -1.0) {
+    if (U < 0.0) U = drand48();
+    return loc + scale * (pow(U, -shape) - 1) / shape;
+  }
+
+  virtual void set_lambda(double lambda) {
+    if (lambda <= 0.0) scale = 0.0;
+    else scale = (1 - shape) / lambda - (1 - shape) * loc;
+  }
+
+private:
+  double loc /* mu */;
+  double scale /* sigma */, shape /* k */;
+};
+
+class GEV : public Generator {
+public:
+  GEV(double _loc = 0.0, double _scale = 1.0, double _shape = 1.0) :
+    e(1.0), loc(_loc), scale(_scale), shape(_shape) {
+    assert(shape != 0.0);
+    D("GEV(loc=%f, scale=%f, shape=%f)", loc, scale, shape);
+  }
+
+  virtual double generate(double U = -1.0) {
+    return loc + scale * (pow(e.generate(U), -shape) - 1) / shape;
+  }
+
+private:
+  Exponential e;
+  double loc /* mu */, scale /* sigma */, shape /* k */;
+};
+
+class Discrete : public Generator {
+public:
+  ~Discrete() { delete def; }
+  Discrete(Generator* _def = NULL) : def(_def) {
+    if (def == NULL) def = new Fixed(0.0);
+  }
+
+  virtual double generate(double U = -1.0) {
+    double Uc = U;
+    if (pv.size() > 0 && U < 0.0) U = drand48();
+
+    double sum = 0;
+ 
+    for (auto p: pv) {
+      sum += p.first;
+      if (U < sum) return p.second;
+    }
+
+    return def->generate(Uc);
+  }
+
+  void add(double p, double v) {
+    pv.push_back(std::pair<double,double>(p, v));
+  }
+
+private:
+  Generator *def;
+  std::vector< std::pair<double,double> > pv;
+};
+
+class KeyGenerator {
+public:
+  KeyGenerator(Generator* _g, double _max = 10000) : g(_g), max(_max) {}
+  std::string generate(uint64_t ind) {
+    uint64_t h = fnv_64(ind);
+    double U = (double) h / (double)ULLONG_MAX;
+    double G = g->generate(U);
+    int keylen = MAX(round(G), floor(log10(max)) + 1);
+    char key[256];
+    snprintf(key, 256, "%0*" PRIu64, keylen, ind);
+
+    //    D("%d = %s", ind, key);
+    return std::string(key);
+  }
+private:
+  Generator* g;
+  double max;
+};
+
+Generator* createGenerator(std::string str);
+Generator* createFacebookKey();
+Generator* createFacebookValue();
+Generator* createFacebookIA();
--- a/inc/nm.h
+++ b/inc/nm.h
@ -0,0 +1,16 @@
+#pragma once
+
+#include <vector>
+
+constexpr static int NM_LEVEL_NUMA = 0;
+constexpr static int NM_LEVEL_CPU = 1;
+constexpr static int NM_LEVEL_CORE = 2;
+
+
+std::vector<struct nm_obj *> * nm_get_nodes();
+std::vector<struct nm_obj *> * nm_get_cpus();
+std::vector<struct nm_obj *> * nm_get_cores();
+
+// 0 on success
+// -1 on error
+int nm_init();
--- a/inc/ntr.h
+++ b/inc/ntr.h
@ -0,0 +1,37 @@
+#pragma once
+
+#include <stdio.h>
+#include <stdarg.h>
+
+#define NTR_LEVEL_NONE (0)
+#define NTR_LEVEL_ERROR (1)
+#define NTR_LEVEL_WARNING (2)
+#define NTR_LEVEL_INFO (3)
+#define NTR_LEVEL_DEBUG (4)
+#define NTR_LEVEL_DEFAULT (NTR_LEVEL_WARNING)
+
+#define NTR_DEP_NTR (0)
+#define NTR_DEP_USER1 (1)
+#define NTR_DEP_USER2 (2)
+#define NTR_DEP_USER3 (3)
+#define NTR_DEP_USER4 (4)
+#define NTR_DEP_USER5 (5)
+#define NTR_DEP_MAX (NTR_DEP_USER5 + 1)
+
+#ifdef __cplusplus
+extern "C" {
+#endif 
+
+void ntr_init();
+
+void ntr(int dep, int level, const char * fmt, ...);
+
+void ntr_set_level(int dep, int level);
+
+void ntr_set_output(FILE * f);
+
+int ntr_get_level(int dep);
+
+#ifdef __cplusplus
+}
+#endif
--- a/inc/ntrlog.h
+++ b/inc/ntrlog.h
@ -1,61 +0,0 @@
-#pragma once
-
-#include <stdio.h>
-
-#define NTR_LEVEL_NONE (0)
-#define NTR_LEVEL_ERROR (1)
-#define NTR_LEVEL_WARNING (2)
-#define NTR_LEVEL_INFO (3)
-#define NTR_LEVEL_DEBUG (4)
-#define NTR_LEVEL_DEFAULT (NTR_LEVEL_WARNING)
-
-#define NTR_DEP_NTR (0)
-#define NTR_DEP_USER1 (1)
-#define NTR_DEP_USER2 (2)
-#define NTR_DEP_USER3 (3)
-#define NTR_DEP_USER4 (4)
-#define NTR_DEP_USER5 (5)
-#define NTR_DEP_MAX (NTR_DEP_USER5 + 1)
-
-#define NTR_DECL_IMPL \
-int ntr_log_levels[NTR_DEP_MAX] = {NTR_LEVEL_DEFAULT}; \
-FILE * ntr_out = stdout
-
-extern int ntr_log_levels[];
-extern FILE * ntr_out;
-
-static inline
-void ntr(int dep, int level, const char * fmt, ...)
-{
-    va_list vl;
-    va_start(vl, fmt);
-    if (dep < NTR_DEP_MAX && level <= ntr_log_levels[dep]) {
-        vfprintf(ntr_out, fmt, vl);
-    }
-    va_end(vl);
-}
-
-static inline
-void ntr_set_level(int dep, int level)
-{
-    if (dep < NTR_DEP_MAX) {
-        ntr_log_levels[dep] = level;
-    }
-}
-
-static inline
-void ntr_set_output(FILE * f)
-{
-    if (f != NULL) {
-        ntr_out = f;
-    }
-}
-
-static inline
-int ntr_get_level(int dep)
-{
-    if (dep < NTR_DEP_MAX) {
-        return ntr_log_levels[dep];
-    }
-    return 0;
-}
--- a/inc/pkt.h
+++ b/inc/pkt.h
@ -12,29 +12,49 @@
 #include <rte_net.h>
 #include <rte_vxlan.h>

-#define IP_DEFTTL 64 /* from RFC 1340. */
-#define IP_VERSION 0x40
-#define IP_HDRLEN 0x05 /* default IP header length == five 32-bits words. */
-#define IP_VHL_DEF (IP_VERSION | IP_HDRLEN)
-#define IP_ADDR_FMT_SIZE 15
-
 constexpr static uint32_t ETHER_FRAME_MAGIC = 0xDCDCE5E5;
+const static struct rte_ether_addr PROBE_MAC_ADDR {0x01,0x1B,0x19,0x00,0x00,0x00};
+const static uint16_t ETHER_TYPE_LOCAL_EXP = 0x88b5;

-struct packet_hdr {
-    struct rte_ether_hdr eth_hdr;
-    struct rte_ipv4_hdr ipv4_hdr;
-    struct rte_udp_hdr udp_hdr;
+struct ptp_hdr {
+    uint8_t ptp_msg_type;
+    uint8_t ptp_ver;
+    uint8_t unused[34];
 } __attribute__((packed));

-struct packet_data
-{
-    struct packet_hdr pkt_hdr;
+struct pkt_hdr {
+    struct rte_ether_hdr eth_hdr;
+    struct ptp_hdr ptp_hdr;
+    uint16_t type;
    uint32_t magic;
+    char payload[0];
+} __attribute__((packed));
+
+constexpr static uint16_t PKT_TYPE_LOAD = 0;
+constexpr static uint16_t PKT_TYPE_PROBE = 1;
+constexpr static uint16_t PKT_TYPE_LOAD_RESP = 2;
+constexpr static uint16_t PKT_TYPE_PROBE_RESP = 3;
+struct pkt_payload_epoch {
    uint32_t epoch;
-    uint64_t clt_ts_tx;
-    uint64_t clt_ts_rx;
-    uint64_t srv_ts_tx;
-    uint64_t srv_ts_rx;
+};
+
+constexpr static uint16_t PKT_TYPE_STAT = 4;
+struct pkt_payload_stat {
+    uint32_t epoch;
+    uint64_t hw_rx;
+    uint64_t hw_tx;
+    uint64_t sw_rx;
+    uint64_t sw_tx;
+};
+
+constexpr static uint16_t NUM_PKT_TYPES = PKT_TYPE_STAT + 1;
+// for fast packet verification
+static const uint32_t expected_payload_size[NUM_PKT_TYPES] {
+    sizeof(struct pkt_payload_epoch), // LOAD
+    sizeof(struct pkt_payload_epoch), // PROBE
+    sizeof(struct pkt_payload_epoch), // LOAD_RESP
+    sizeof(struct pkt_payload_epoch), // PROBE_RESP
+    sizeof(struct pkt_payload_stat) //STAT
 };

 static inline void
@ -103,17 +123,17 @@ dump_pkt(struct rte_mbuf *pkt)

 }

+
+// fills the packet with the information except for the payload itself
 static inline
-struct packet_data * construct_udp_pkt_hdr(struct rte_mbuf * buf,
-                    struct rte_ether_addr * src_mac, struct rte_ether_addr * dst_mac,
-                    uint32_t src_ip, uint32_t dst_ip, uint16_t src_port, uint16_t dst_port)
+struct pkt_hdr * construct_pkt_hdr(struct rte_mbuf * buf, uint16_t type,
+                    struct rte_ether_addr * src_mac, struct rte_ether_addr * dst_mac)
 {
    rte_pktmbuf_reset(buf);

-    struct packet_data * pkt_data = (struct packet_data *)rte_pktmbuf_append(buf, sizeof(struct packet_data));
+    const uint32_t total_sz = sizeof(struct pkt_hdr) + expected_payload_size[type];
+    struct pkt_hdr * pkt_data = (struct pkt_hdr *)rte_pktmbuf_append(buf, total_sz);
    struct rte_ether_hdr * eth_hdr;
-    struct rte_ipv4_hdr * ipv4_hdr;
-    struct rte_udp_hdr * udp_hdr;

    if (pkt_data == NULL)
        return NULL;
@ -122,52 +142,47 @@ struct packet_data * construct_udp_pkt_hdr(struct rte_mbuf * buf,
    buf->nb_segs = 1;
    
    // construct l2 header
-    eth_hdr = &pkt_data->pkt_hdr.eth_hdr;
+    eth_hdr = &pkt_data->eth_hdr;
    rte_ether_addr_copy(src_mac, &eth_hdr->s_addr);
+    if (type == PKT_TYPE_PROBE || type == PKT_TYPE_PROBE_RESP) {
+        rte_ether_addr_copy(&PROBE_MAC_ADDR, &eth_hdr->d_addr);
+        eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_1588);
+        pkt_data->ptp_hdr.ptp_ver = 0x2; // VER 2
+        buf->ol_flags |= PKT_TX_IEEE1588_TMST;
+    } else {
        rte_ether_addr_copy(dst_mac, &eth_hdr->d_addr);
-    eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
+        eth_hdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_LOCAL_EXP);
+        pkt_data->ptp_hdr.ptp_ver = 0xff;
+    }
    buf->l2_len = sizeof(struct rte_ether_hdr);

-    // construct l3 header
-    ipv4_hdr = &pkt_data->pkt_hdr.ipv4_hdr;
-    memset(ipv4_hdr, 0, sizeof(struct rte_ipv4_hdr));
-    ipv4_hdr->version_ihl = IP_VHL_DEF;
-    ipv4_hdr->type_of_service = 0;
-    ipv4_hdr->fragment_offset = 0;
-    ipv4_hdr->time_to_live = IP_DEFTTL;
-    ipv4_hdr->next_proto_id = IPPROTO_UDP;
-    ipv4_hdr->packet_id = 0;
-    ipv4_hdr->src_addr = rte_cpu_to_be_32(src_ip);
-    ipv4_hdr->dst_addr = rte_cpu_to_be_32(dst_ip);
-    ipv4_hdr->total_length = rte_cpu_to_be_16(sizeof(struct packet_data) - sizeof(struct rte_ether_hdr));
-    ipv4_hdr->hdr_checksum = 0;
-    buf->l3_len = sizeof(struct rte_ipv4_hdr);
-
-    // construct l4 header
-    udp_hdr = &pkt_data->pkt_hdr.udp_hdr;
-    udp_hdr->src_port = rte_cpu_to_be_16(src_port);
-    udp_hdr->dst_port = rte_cpu_to_be_16(dst_port);
-    udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
-    udp_hdr->dgram_len = rte_cpu_to_be_16(sizeof(struct packet_data) -
-                                          sizeof(struct rte_ether_hdr) -
-                                          sizeof(struct rte_udp_hdr));
-    buf->l4_len = sizeof(struct rte_udp_hdr);
+    pkt_data->ptp_hdr.ptp_msg_type = 0x0; // SYNC
+    pkt_data->type = rte_cpu_to_be_16(type);
+    pkt_data->magic = rte_cpu_to_be_32(ETHER_FRAME_MAGIC);

    return pkt_data;
 }

 static inline
-struct packet_data * check_valid_packet(struct rte_mbuf * pkt)
+struct pkt_hdr * check_valid_packet(struct rte_mbuf * pkt)
 {
-    struct packet_data * pkt_data = NULL;
+    struct pkt_hdr * pkt_data = NULL;
+    const uint32_t data_len = rte_pktmbuf_data_len(pkt);

-    if (rte_pktmbuf_data_len(pkt) < sizeof(struct packet_data)) {
+    if (data_len < sizeof(struct pkt_hdr)) {
        return NULL;
    }

-    pkt_data = rte_pktmbuf_mtod(pkt, struct packet_data *);
+    pkt_data = rte_pktmbuf_mtod(pkt, struct pkt_hdr *);

-    if (rte_be_to_cpu_32(pkt_data->magic) == ETHER_FRAME_MAGIC) {
+    // check MAGIC
+    if (rte_be_to_cpu_32(pkt_data->magic) != ETHER_FRAME_MAGIC) {
+        return NULL;
+    }
+
+    // check type and payload size
+    if ((rte_be_to_cpu_16(pkt_data->type) < NUM_PKT_TYPES) && 
+        (data_len >= (sizeof(struct pkt_hdr) + expected_payload_size[rte_be_to_cpu_16(pkt_data->type)]))) {
        return pkt_data;
    }

--- a/inc/util.h
+++ b/inc/util.h
@ -0,0 +1,33 @@
+#pragma  once
+#include <stdint.h>
+#include <time.h>
+#include <rte_ip.h>
+
+constexpr static unsigned long S2NS = 100000000UL;
+constexpr static unsigned long S2US = 1000000L;
+constexpr static uint16_t SERVER_LOAD_PORT = 1234;
+constexpr static uint16_t SERVER_PROBE_PORT = 319;
+constexpr static uint32_t SERVER_IP = RTE_IPV4(192,168,123,0);
+
+static inline uint64_t
+get_time_us() 
+{
+    struct timespec ts;
+    clock_gettime(CLOCK_REALTIME, &ts);
+    return ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
+}
+
+constexpr static int NEXT_CPU_NULL = -1;
+static inline int
+cmask_get_next_cpu(uint64_t * mask)
+{
+    int ffs = ffsll(*mask);
+    *mask &= ~(1 << (ffs - 1));
+    return ffs - 1;
+}
+
+static inline int
+cmask_get_num_cpus(const uint64_t mask)
+{
+    return _mm_popcnt_u64(mask);
+}
--- a/khat/khat.cc
+++ b/khat/khat.cc
@ -1,50 +1,100 @@
 #include <cstdio>
-#include <cstdlib>
+#include <cassert>
+#include <ctime>
+#include <netinet/in.h>
+#include <rte_config.h>
 #include <rte_common.h>
 #include <rte_eal.h>
 #include <rte_ethdev.h>
 #include <rte_cycles.h>
 #include <rte_lcore.h>
 #include <rte_mbuf.h>
-#include <rte_byteorder.h>
-#include <rte_config.h>
 #include <rte_ether.h>
 #include <rte_launch.h>
+#include <rte_log.h>
 #include <atomic>
+#include <vector>
+#include <fstream>
 #include <unistd.h>
+#include <signal.h>

+#include "nm.h"
 #include "pkt.h"
-#include "ntrlog.h"
-#include "rte_arp.h"
-#include "rte_mbuf_core.h"
+#include "ntr.h"
+#include "util.h"

-NTR_DECL_IMPL;

-constexpr unsigned int MBUF_MAX_COUNT = 8191;
-constexpr unsigned int MBUF_CACHE_SIZE = 250;
-constexpr unsigned int RX_RING_SIZE = 1024;
-constexpr unsigned int TX_RING_SIZE = 1024;
-constexpr unsigned int RX_RING_NUM = 1;
-constexpr unsigned int TX_RING_NUM = 1;
-constexpr unsigned int BURST_SIZE = 32;
+/*  Protocol:
+*   regular client:
+*        client -> LOAD -> server
+*        server -> LOAD_RESP -> client
+*   measuring client:
+*        client -> PROBE -> server (client tx timestamps)
+*        server -> PROBE_RESP -> client (client rx timestamps and server tx/rx timestamps)
+*        server -> STAT -> client (server sends its tx/rx timestamps)
+*/
+
+static void * const PROBE_MAGIC = (void*)0x12344444;
+constexpr static unsigned int MBUF_MAX_COUNT = 65536;
+constexpr static unsigned int MBUF_CACHE_SIZE = 512;
+constexpr static unsigned int RX_RING_SIZE = 4096;
+constexpr static unsigned int TX_RING_SIZE = 4096;
+constexpr static unsigned int BURST_SIZE = 32;
+

 static const struct rte_eth_conf port_conf_default{};

+// keep track of the probe state
+// when a probe packet first arrives this state is set to be influx and the rte_mbuf's userdata is set to PROBE_MAGIC
+// which prevents other probe packets to be processed
+// when the server sends the probe stats back to user influx is released
+// this is to guarantee that the server only processes one probe packet at the time
+// XXX: also this can be attached to the mbuf itself and processed by the lcore thread
+//      I kept this global because globally there could be only one pending probe request
+//      and rx_add_timestamp can save their shit here too
+struct probe_state_t {
+    struct rte_ether_hdr hdr;
+    uint32_t epoch;
+    uint32_t timesync;
+    uint64_t last_sw_rx;
+    uint64_t last_sw_tx;
+    uint64_t last_hw_rx;
+};
+
+struct thread_info {
+    int tid;
+    int rxqid;
+    int txqid;
+    int lcore_id;
+};
+
+// state machine:
+constexpr static int SERVER_STATE_WAIT = 0;
+constexpr static int SERVER_STATE_PROBE = 1;
+
 struct options_t {
+    //config
+    int num_threads{1};
+    uint64_t cpuset{0b010}; //2nd core
+
    //states
    uint16_t s_portid;
    struct rte_ether_addr s_host_mac;
    struct rte_mempool * s_pkt_mempool;
+    std::atomic<int> s_state {SERVER_STATE_WAIT};
+    struct probe_state_t s_probe_info;
+    std::vector<struct thread_info *> s_thr_info;
 };

-struct options_t options;
+static struct options_t options;

 static uint16_t
 rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
        struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused, void *_ __rte_unused)
 {
    uint64_t now = rte_rdtsc();
-    struct packet_data * pkt_data;
+    struct timespec ts;
+    struct pkt_hdr * pkt_data;
    for (int i = 0; i < nb_pkts; i++) {
        pkt_data = check_valid_packet(pkts[i]);

@ -53,135 +103,229 @@ rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
            continue;
        }

-        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "rx_add_timestamp: tagged packet %p with %llu.\n", (void*)pkts[i], now);        
-        pkt_data->srv_ts_rx = rte_cpu_to_be_64(now);
+        if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE) {
+            int state_wait = SERVER_STATE_WAIT;
+            pkts[i]->userdata = nullptr;
+            if (rte_eth_timesync_read_rx_timestamp(port, &ts, pkts[i]->timesync & 0x3) == 0) {
+                if (options.s_state.compare_exchange_strong(state_wait, SERVER_STATE_PROBE)) {
+                    // mark the mbuf as probe packet being processed
+                    // only the locore that receives the pkt w/ userdata != nullptr processes that packet
+                    pkts[i]->userdata = PROBE_MAGIC;
+                    // tag with timestamps
+                    options.s_probe_info.last_hw_rx = ts.tv_nsec + ts.tv_sec * S2NS;
+                    options.s_probe_info.last_sw_rx = now;
+                    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "rx_add_timestamp: tagged packet %p epoch %d with sw: %llu hw:%llu.\n", (void*)pkts[i], options.s_probe_info.epoch, now, options.s_probe_info.last_hw_rx);
+                } else
+                    ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "rx_add_timestamp: packet %p not tagged - server is processing a probe.\n", (void*)pkts[i]);
+            } else
+                ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "rx_add_timestamp: packet %p not tagged - hw rx timestamp not available.\n", (void*)pkts[i]);    
+        } else
+            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "rx_add_timestamp: packet %p not tagged - type %d.\n", (void*)pkts[i], rte_be_to_cpu_16(pkt_data->type));        
    }

    return nb_pkts;
 }

 static uint16_t
-tx_calc_latency(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
+tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
 		struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
 {
    uint64_t now = rte_rdtsc();
-    struct packet_data * pkt_data;
+    struct pkt_hdr * pkt_data;

    for (int i = 0; i < nb_pkts; i++) {

        pkt_data = check_valid_packet(pkts[i]);

        if (pkt_data == NULL) {
-            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "tx_calc_latency: ignoring invalid packet %p.\n", (void*)pkts[i]);
+            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "tx_add_timestamp: ignoring invalid packet %p.\n", (void*)pkts[i]);
            continue;
        }

-        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "tx_calc_latency: tagged packet %p with %llu.\n", (void*)pkts[i], now);    
-        pkt_data->srv_ts_tx = rte_cpu_to_be_64(now);
+        if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE_RESP) {
+            // this packet is the response to PROBE packets
+            
+            // at this time the packet is not sent to the NIC yet so
+            // the state must be waiting stats
+            // XXX: this should be an assert
+            if(options.s_state.load() != SERVER_STATE_PROBE || pkts[i]->userdata != PROBE_MAGIC) {
+                rte_exit(EXIT_FAILURE, "packet %p sent to NIC before sw callback\n", (void*)pkts[i]);
+            }
+
+            options.s_probe_info.last_sw_tx = now;
+
+            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "tx_add_timestamp: tagged packet %p with sw tx %llu\n", (void*)pkts[i], options.s_probe_info.last_sw_tx);
+        } else {
+            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "tx_add_timestamp: packet %p not tagged - type %d\n", (void*)pkts[i], pkt_data->type);
+        }
    }

    return nb_pkts;
 }

 static int
-locore_main(void * _unused __rte_unused)
+locore_main(void * ti)
 {
+    struct thread_info * tinfo = (struct thread_info *)ti;
    struct rte_mbuf *bufs[BURST_SIZE];
+    // + 1 because it might involve an extra PKT_TYPE_STAT packet 
+    // when all tx timestamps are ready
    struct rte_mbuf *tx_bufs[BURST_SIZE];
-    struct packet_data *pkt_data;
-    uint32_t core_id = rte_lcore_id();
+    struct pkt_hdr *pkt_data;
+
+    bool pending_probe = false;

    if (rte_eth_dev_socket_id(options.s_portid) > 0 && rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) {
-        ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,  "locore_main: WARNING, port %d is on remote NUMA node to "
+        ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,  "locore_main <thread %d>: WARNING, port %d is on remote NUMA node to "
                "polling thread.\n\tPerformance will "
-                "not be optimal.\n", options.s_portid);
+                "not be optimal.\n", tinfo->tid, options.s_portid);
    }

-	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d running.\n", core_id);
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main <thread %d>: running on locore %d with txidx %d and rxidx %d.\n", tinfo->tid, rte_lcore_id(), tinfo->txqid, tinfo->rxqid);

 	while(true) {
        uint16_t nb_tx = 0;
-        const uint16_t nb_rx = rte_eth_rx_burst(options.s_portid, 0, bufs, BURST_SIZE);
-
-        if (nb_rx == 0) {
-            continue;
-        }
+        const uint16_t nb_rx = rte_eth_rx_burst(options.s_portid, tinfo->rxqid, bufs, BURST_SIZE);
+        struct rte_mbuf * pkt_buf;
+        struct pkt_hdr * tx_data;

        for(int i = 0; i < nb_rx; i++) {
-
+            // XXX: optimization: in rx_add_timestamp every packet is already validated once
+            // can just mark valid packet with a value so we can avoid this redundant check
            pkt_data = check_valid_packet(bufs[i]);

            if (pkt_data == NULL) {
-                ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: core %d skipping invalid packet %p.\n", core_id, (void*)bufs[i]);
-                dump_pkt(bufs[i]);
+                ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main <thread %d>: skipping invalid packet %p.\n", tinfo->tid, (void*)bufs[i]);
+                //dump_pkt(bufs[i]);
                rte_pktmbuf_free(bufs[i]);
                continue;
            }

-            uint32_t dst_ip = rte_be_to_cpu_32(pkt_data->pkt_hdr.ipv4_hdr.dst_addr);
-            uint32_t src_ip = rte_be_to_cpu_32(pkt_data->pkt_hdr.ipv4_hdr.src_addr);
-            uint16_t src_port = rte_be_to_cpu_16(pkt_data->pkt_hdr.udp_hdr.src_port);
-            uint16_t dst_port = rte_be_to_cpu_16(pkt_data->pkt_hdr.udp_hdr.dst_port);
-            ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d packet %p from %d.%d.%d.%d(%x:%x:%x:%x:%x:%x) to %d.%d.%d.%d(%x:%x:%x:%x:%x:%x), sport %d, dport %d, epoch %d\n", 
-                                                                                            core_id,
+            ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main <thread %d>: packet %p from %x:%x:%x:%x:%x:%x to %x:%x:%x:%x:%x:%x, type %d\n", 
+                                                                                            tinfo->tid,
                                                                                            (void*)bufs[i],
-                                                                                            (src_ip >> 24) & 0xff,
-                                                                                            (src_ip >> 16) & 0xff,
-                                                                                            (src_ip >> 8) & 0xff,
-                                                                                            (src_ip >> 0) & 0xff,
-                                                                                            pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[0],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[1],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[2],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[3],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[4],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[5],
-                                                                                            (dst_ip >> 24) & 0xff,
-                                                                                            (dst_ip >> 16) & 0xff,
-                                                                                            (dst_ip >> 8) & 0xff,
-                                                                                            (dst_ip >> 0) & 0xff,
-                                                                                            pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[0],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[1],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[2],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[3],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[4],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[5], 
-                                                                                            src_port,
-                                                                                            dst_port,
-                                                                                            rte_be_to_cpu_32(pkt_data->epoch));
-            // swap s_addr and d_addr
-            struct rte_mbuf * pkt_buf = rte_pktmbuf_alloc(options.s_pkt_mempool);
+                                                                                            pkt_data->eth_hdr.s_addr.addr_bytes[0],
+                                                                                            pkt_data->eth_hdr.s_addr.addr_bytes[1],
+                                                                                            pkt_data->eth_hdr.s_addr.addr_bytes[2],
+                                                                                            pkt_data->eth_hdr.s_addr.addr_bytes[3],
+                                                                                            pkt_data->eth_hdr.s_addr.addr_bytes[4],
+                                                                                            pkt_data->eth_hdr.s_addr.addr_bytes[5],
+                                                                                            pkt_data->eth_hdr.d_addr.addr_bytes[0],
+                                                                                            pkt_data->eth_hdr.d_addr.addr_bytes[1],
+                                                                                            pkt_data->eth_hdr.d_addr.addr_bytes[2],
+                                                                                            pkt_data->eth_hdr.d_addr.addr_bytes[3],
+                                                                                            pkt_data->eth_hdr.d_addr.addr_bytes[4],
+                                                                                            pkt_data->eth_hdr.d_addr.addr_bytes[5], 
+                                                                                            rte_be_to_cpu_16(pkt_data->type));
+        
+
+            switch (rte_be_to_cpu_16(pkt_data->type)) {
+                case PKT_TYPE_PROBE: {
+                    if (options.s_state.load() == SERVER_STATE_PROBE && bufs[i]->userdata == PROBE_MAGIC) {
+                            // send back probe_resp pkt to probe for return latency
+                            pending_probe = true;
+
+                            // book keep probe results
+                            options.s_probe_info.epoch = rte_be_to_cpu_32(((struct pkt_payload_epoch *)pkt_data->payload)->epoch);
+                            options.s_probe_info.timesync = bufs[i]->timesync;
+                            rte_memcpy(&options.s_probe_info.hdr, &pkt_data->eth_hdr, sizeof(struct rte_ether_hdr));
+
+                            pkt_buf = rte_pktmbuf_alloc(options.s_pkt_mempool);
+                    
                            if (pkt_buf == NULL) {
-                rte_exit(EXIT_FAILURE, "locore_main: failed to allocate memory for pkt_buf");
+                                rte_exit(EXIT_FAILURE, "failed to allocate memory for pkt_buf\n");
                            }

-            struct packet_data * tx_data = construct_udp_pkt_hdr(pkt_buf, 
+                            tx_data = construct_pkt_hdr(pkt_buf, PKT_TYPE_PROBE_RESP, 
                                                                &options.s_host_mac, 
-                                                &pkt_data->pkt_hdr.eth_hdr.s_addr, 
-                                                dst_ip, 
-                                                src_ip, 
-                                                dst_port, 
-                                                src_port);
+                                                                &pkt_data->eth_hdr.s_addr);
+
                            if (tx_data == NULL) {
                                rte_exit(EXIT_FAILURE, "failed to construct tx packet %p", (void*)pkt_buf);
                            }
-            // copy, endianess doesn't matter
-            tx_data->epoch = pkt_data->epoch;
-            tx_data->magic = pkt_data->magic;
-            tx_data->clt_ts_rx = pkt_data->clt_ts_rx;
-            tx_data->clt_ts_tx = pkt_data->clt_ts_tx;
-            tx_data->srv_ts_rx = pkt_data->srv_ts_rx;
-            tx_data->srv_ts_tx = pkt_data->srv_ts_tx;
+
+                            rte_memcpy(tx_data->payload, pkt_data->payload, sizeof(struct pkt_payload_epoch));
+
+                            pkt_buf->userdata = PROBE_MAGIC;
+
                            // queue for burst send
                            tx_bufs[nb_tx++] = pkt_buf;
-            // free rx packet
+                    }
+                    break;
+                }
+                case PKT_TYPE_LOAD: {
+                    // we reply to load packet regardless of the server state
+                    pkt_buf = rte_pktmbuf_alloc(options.s_pkt_mempool);
+
+                    if (pkt_buf == NULL) {
+                        rte_exit(EXIT_FAILURE, "failed to allocate memory for pkt_buf\n");
+                    }
+
+                    tx_data = construct_pkt_hdr(pkt_buf, PKT_TYPE_LOAD_RESP,
+                                    &options.s_host_mac, 
+                                    &pkt_data->eth_hdr.s_addr);
+                    
+                    if (tx_data == NULL) {
+                        rte_exit(EXIT_FAILURE, "failed to construct tx packet %p", (void*)pkt_buf);
+                    }
+
+                    rte_memcpy(tx_data->payload, pkt_data->payload, sizeof(struct pkt_payload_epoch));
+                    
+                    // queue for burst send
+                    tx_bufs[nb_tx++] = pkt_buf;
+                    break;
+                }
+                default:
+                    break;
+            }
            rte_pktmbuf_free(bufs[i]);
        }
            
-        const uint16_t nb_tx_succ = rte_eth_tx_burst(options.s_portid, 0, tx_bufs, nb_tx);
-        // cleanup unsent packets
-        // don't need to free others because it's offloaded
+        // send the packets
+        if (nb_tx > 0) {
+            const uint16_t nb_tx_succ = rte_eth_tx_burst(options.s_portid, tinfo->txqid, tx_bufs, nb_tx);
            if (nb_tx_succ < nb_tx) {
-            rte_exit(EXIT_FAILURE, "locore_main: failed to send some packets.\n");
+                rte_exit(EXIT_FAILURE, "failed to send some packets.\n");
+            }
+        }
+
+        // we wanna check every loop not only when there are packets
+        if (pending_probe) {
+            struct timespec ts;
+            struct pkt_payload_stat * stat; 
+            if (rte_eth_timesync_read_tx_timestamp(options.s_portid, &ts) == 0) {
+                ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main <thread %d>: obtained hw tx timestamp %lld.\n", tinfo->tid, ts.tv_sec * S2NS + ts.tv_nsec);
+                // now we have everything we need 
+                pkt_buf = rte_pktmbuf_alloc(options.s_pkt_mempool);
+                if (pkt_buf == NULL) {
+                    rte_exit(EXIT_FAILURE, "failed to allocate memory for pkt_buf\n");
+                }
+
+                tx_data = construct_pkt_hdr(pkt_buf, PKT_TYPE_STAT,
+                    &options.s_host_mac, 
+                    &options.s_probe_info.hdr.s_addr);
+
+                // populate stats
+                stat = (struct pkt_payload_stat *)tx_data->payload;
+                stat->epoch = rte_cpu_to_be_32(options.s_probe_info.epoch);
+                stat->hw_rx = rte_cpu_to_be_64(options.s_probe_info.last_hw_rx);
+                stat->hw_tx = rte_cpu_to_be_64(ts.tv_nsec + ts.tv_sec * S2NS);
+                stat->sw_rx = rte_cpu_to_be_64(options.s_probe_info.last_sw_rx);
+                stat->sw_tx = rte_cpu_to_be_64(options.s_probe_info.last_sw_tx);
+                
+                // send the packet
+                if (rte_eth_tx_burst(options.s_portid, 0, &pkt_buf, 1) < 1) {
+                    rte_exit(EXIT_FAILURE, "failed to send some packets.\n");
+                }
+
+                // release flux
+                pending_probe = false;
+
+                int expected = SERVER_STATE_PROBE;
+                if (!options.s_state.compare_exchange_strong(expected, SERVER_STATE_WAIT)) {
+                    rte_exit(EXIT_FAILURE, "s_state changed unexpectedly!");
+                }
+            }
        }
 	}

@ -216,7 +360,7 @@ port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;
    
    /* Configure the Ethernet device. */
-    ret = rte_eth_dev_configure(portid, RX_RING_NUM, TX_RING_NUM, &port_conf);
+    ret = rte_eth_dev_configure(portid, options.num_threads, options.num_threads, &port_conf);
 	if (ret != 0)
 		return ret;

@ -224,21 +368,23 @@ port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
 	if (ret != 0)
 		return ret;

-	/* Allocate and set up 1 RX queue per Ethernet port. */
+	/* Allocate and set up 1 RX queue per thread per Ethernet port. */
    rxconf = dev_info.default_rxconf;
-	for (uint32_t i = 0; i < RX_RING_NUM; i++) {
+	for (int i = 0; i < options.num_threads; i++) {
 		ret = rte_eth_rx_queue_setup(portid, i, nb_rxd, rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool);
 		if (ret < 0)
 			return ret;
+        options.s_thr_info.at(i)->rxqid = i;
 	}

    txconf = dev_info.default_txconf;
 	txconf.offloads = port_conf.txmode.offloads;
-	/* Allocate and set up 1 TX queue per Ethernet port. */
-	for (uint32_t i = 0; i < TX_RING_NUM; i++) {
+	/* Allocate and set up 1 TX queue per thread per Ethernet port. */
+	for (int i = 0; i < options.num_threads; i++) {
 		ret = rte_eth_tx_queue_setup(portid, i, nb_txd, rte_eth_dev_socket_id(portid), &txconf);
 		if (ret < 0)
 			return ret;
+        options.s_thr_info.at(i)->txqid = i;
 	}

    ret = rte_eth_dev_start(portid);
@ -251,14 +397,21 @@ port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
    if (ret != 0)
        return ret;

+    ret = rte_eth_timesync_enable(portid);
+    if (ret != 0)
+        return ret;
+
    /* Enable RX in promiscuous mode for the Ethernet device. */
    ret = rte_eth_promiscuous_enable(portid);
 	if (ret != 0)
 		return ret;

-    if (rte_eth_add_tx_callback(portid, 0, tx_calc_latency, NULL) == NULL || rte_eth_add_rx_callback(portid, 0, rx_add_timestamp, NULL) == NULL) {
+    for (int i = 0; i < options.num_threads; i++) {
+        if (rte_eth_add_tx_callback(portid, options.s_thr_info.at(i)->txqid, tx_add_timestamp, NULL) == NULL || 
+            rte_eth_add_rx_callback(portid, options.s_thr_info.at(i)->rxqid, rx_add_timestamp, NULL) == NULL) {
            return -1;
        }
+    }

 	return 0;
 }
@ -268,13 +421,33 @@ static void usage()
    fprintf(stdout, 
            "Usage:\n" \
            "    -v(vv): verbose mode\n" \
-            "    -h: display the information\n");
+            "    -h: seek help\n" \
+            "    -A: cpu mask for worker threads\n");
+    fflush(stdout);
+}
+
+static void dump_options()
+{
+    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+           "main: khat configuration:\n" \
+           "          verbosity: +%d\n" \
+           "          thread count: %d\n" \
+           "          thread mask: %lld\n\n",
+           ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_DEFAULT,
+           options.num_threads,
+           options.cpuset);
 }

 int main(int argc, char* argv[])
 {
    unsigned int nb_ports;
-    struct rte_mempool *mbuf_pool, *mbuf_pool_pkt;
+    struct rte_mempool *mbuf_pool;
+    
+    ntr_init();
+
+    if (nm_init() != 0) {
+        rte_exit(EXIT_FAILURE, "nm init failed!\n");
+    }

    // init dpdk
    int ret = rte_eal_init(argc, argv);
@ -290,14 +463,21 @@ int main(int argc, char* argv[])
    {
        int c;
        // parse arguments
-        while((c = getopt(argc, argv, "hv")) != -1) {
+        while((c = getopt(argc, argv, "hvA:")) != -1) {
            switch (c) {
                case 'v':
                    ntr_set_level(NTR_DEP_USER1, ntr_get_level(NTR_DEP_USER1) + 1);
                    break;
                case 'h':
                    usage();
-                    rte_exit(EXIT_SUCCESS, NULL);
+                    rte_exit(EXIT_SUCCESS, "\n");
+                    break;
+                case 'A':
+                    options.cpuset = strtoull(optarg, nullptr, 16);
+                    options.num_threads = cmask_get_num_cpus(options.cpuset);
+                    if (options.num_threads == 0) {
+                        rte_exit(EXIT_FAILURE, "must run at least one thread\n");
+                    }
                    break;
                default:
                    usage();
@ -307,33 +487,36 @@ int main(int argc, char* argv[])
        }
    }

-    // XXX: singal handler to exit
+    dump_options();

    nb_ports = rte_eth_dev_count_avail();
    if (nb_ports == 0) {
        rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
    }

-    // create a mbuf memory pool on the socket
-    mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
-    if (mbuf_pool == nullptr) {
-        rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
-    }
-
-    // create a pkt mbuf memory pool on the socket
-    mbuf_pool_pkt = rte_pktmbuf_pool_create("MBUF_POOL_PKT", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
-    if (mbuf_pool_pkt == nullptr) {
-        rte_exit(EXIT_FAILURE, "cannot create mbuf_pkt pool\n");
-    }
-    options.s_pkt_mempool = mbuf_pool_pkt;
-
-
    uint16_t portid = rte_eth_find_next(0);
    if (portid == RTE_MAX_ETHPORTS) {
        rte_exit(EXIT_FAILURE, "cannot find an available port\n");
    }
    options.s_portid = portid;

+    // create a mbuf memory pool on the socket
+    mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_eth_dev_socket_id(portid));
+    if (mbuf_pool == nullptr) {
+        rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
+    }
+
+    options.s_pkt_mempool = mbuf_pool;
+
+    // init threads
+    uint64_t cpuset = options.cpuset;
+    for(int i = 0; i < options.num_threads; i++) {
+        struct thread_info * tinfo = new thread_info;
+        tinfo->tid = i;
+        tinfo->lcore_id = cmask_get_next_cpu(&cpuset);
+        options.s_thr_info.push_back(tinfo);
+    }
+
    if (port_init(portid, mbuf_pool) != 0) {
        rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid);
    }
@ -342,7 +525,7 @@ int main(int argc, char* argv[])
        rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n", portid);
    }

-    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid,
+    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "Configured port %d on socket %d with mac addr %x:%x:%x:%x:%x:%x\n", portid, rte_eth_dev_socket_id(portid),
                                                                                                options.s_host_mac.addr_bytes[0],
                                                                                                options.s_host_mac.addr_bytes[1],
                                                                                                options.s_host_mac.addr_bytes[2],
@ -350,26 +533,22 @@ int main(int argc, char* argv[])
                                                                                                options.s_host_mac.addr_bytes[4],
                                                                                                options.s_host_mac.addr_bytes[5]);
    
+    usleep(S2US);

-    uint16_t lcore_id = rte_get_next_lcore(0, true, false);
-
-    if (lcore_id == RTE_MAX_LCORE) {
-        rte_exit(EXIT_FAILURE, "cannot detect lcores.\n");
+    for(int i = 0; i < options.num_threads; i++) {
+        struct thread_info * tinfo = options.s_thr_info.at(i);
+        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: launching thread %d on locore %d\n", tinfo->tid, tinfo->lcore_id);
+        if (rte_eal_remote_launch(locore_main, (void *)options.s_thr_info.at(i), tinfo->lcore_id) != 0) {
+            rte_exit(EXIT_FAILURE, "failed to launch function on locore %d\n", tinfo->lcore_id);
+        }
    }

-    if (rte_eal_remote_launch(locore_main, NULL, lcore_id) != 0) {
-        rte_exit(EXIT_FAILURE, "failed to launch function on locore %d\n", lcore_id);
+    for(int i = 0; i < options.num_threads; i++) {
+        struct thread_info * tinfo = options.s_thr_info.at(i);
+        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: waiting for locore %d...\n", tinfo->lcore_id);
+        if (rte_eal_wait_lcore(tinfo->lcore_id) != 0) {
+            rte_exit(EXIT_FAILURE, "failed to wait for locore %d\n", tinfo->lcore_id);
        }
-
-    // while(true) {
-    //     struct rte_eth_stats stats;
-    //     rte_eth_stats_get(portid, &stats);
-    //     printf("recv: %d missed: %d err: %d\n",(uint32_t)stats.ipackets, (uint32_t)stats.imissed,(uint32_t)stats.ierrors);
-    //     usleep(1000000);
-    // }
-
-    if (rte_eal_wait_lcore(lcore_id) != 0) {
-        rte_exit(EXIT_FAILURE, "failed to wait for locore %d\n", lcore_id);
    }

    // shouldn't get here
--- a/libgen/generator.cc
+++ b/libgen/generator.cc
@ -0,0 +1,74 @@
+// modified from mutilate
+
+#include "gen.h"
+
+Generator* createFacebookKey() { return new GEV(30.7984, 8.20449, 0.078688); }
+
+Generator* createFacebookValue() {
+  Generator* g = new GPareto(15.0, 214.476, 0.348238);
+
+  Discrete* d = new Discrete(g);
+  d->add(0.00536, 0.0);
+  d->add(0.00047, 1.0);
+  d->add(0.17820, 2.0);
+  d->add(0.09239, 3.0);
+  d->add(0.00018, 4.0);
+  d->add(0.02740, 5.0);
+  d->add(0.00065, 6.0);
+  d->add(0.00606, 7.0);
+  d->add(0.00023, 8.0);
+  d->add(0.00837, 9.0);
+  d->add(0.00837, 10.0);
+  d->add(0.08989, 11.0);
+  d->add(0.00092, 12.0);
+  d->add(0.00326, 13.0);
+  d->add(0.01980, 14.0);
+
+  return d;
+}
+
+Generator* createFacebookIA() { return new GPareto(0, 16.0292, 0.154971); }
+
+Generator* createGenerator(std::string str) {
+  if (!strcmp(str.c_str(), "fb_key")) return createFacebookKey();
+  else if (!strcmp(str.c_str(), "fb_value")) return createFacebookValue();
+  else if (!strcmp(str.c_str(), "fb_ia")) return createFacebookIA();
+
+  char *s_copy = new char[str.length() + 1];
+  strcpy(s_copy, str.c_str());
+  char *saveptr = NULL;
+
+  if (atoi(s_copy) != 0 || !strcmp(s_copy, "0")) {
+    double v = atof(s_copy);
+    delete[] s_copy;
+    return new Fixed(v);
+  }
+
+  char *t_ptr = strtok_r(s_copy, ":", &saveptr);
+  char *a_ptr = strtok_r(NULL, ":", &saveptr);
+
+  if (t_ptr == NULL) // || a_ptr == NULL)
+    DIE("strtok(.., \":\") failed to parse %s", str.c_str());
+
+  saveptr = NULL;
+  char *s1 = strtok_r(a_ptr, ",", &saveptr);
+  char *s2 = strtok_r(NULL, ",", &saveptr);
+  char *s3 = strtok_r(NULL, ",", &saveptr);
+
+  double a1 = s1 ? atof(s1) : 0.0;
+  double a2 = s2 ? atof(s2) : 0.0;
+  double a3 = s3 ? atof(s3) : 0.0;
+
+  delete[] s_copy;
+
+  if      (strcasestr(str.c_str(), "fixed")) return new Fixed(a1);
+  else if (strcasestr(str.c_str(), "normal")) return new Normal(a1, a2);
+  else if (strcasestr(str.c_str(), "exponential")) return new Exponential(a1);
+  else if (strcasestr(str.c_str(), "pareto")) return new GPareto(a1, a2, a3);
+  else if (strcasestr(str.c_str(), "gev")) return new GEV(a1, a2, a3);
+  else if (strcasestr(str.c_str(), "uniform")) return new Uniform(a1);
+
+  DIE("Unable to create Generator '%s'", str.c_str());
+
+  return NULL;
+}
--- a/libnm/nm.cc
+++ b/libnm/nm.cc
@ -0,0 +1,127 @@
+#include <hwloc.h>
+#include <vector>
+#include <algorithm>
+
+#include "nm.h"
+
+struct nm_obj {
+    int level;
+    int id;
+    struct nm_obj *parent;
+    std::vector<struct nm_obj *> children;
+};
+
+static bool nm_obj_comparator(struct nm_obj * a, struct nm_obj * b)
+{
+    return a->id < b->id;
+}
+
+static std::vector<struct nm_obj *> nodes;
+static std::vector<struct nm_obj *> cores;
+static std::vector<struct nm_obj *> cpus;
+
+std::vector<struct nm_obj *> * nm_get_nodes()
+{
+    return &nodes;
+}
+
+std::vector<struct nm_obj *> * nm_get_cpus()
+{
+    return &cpus;
+}
+
+std::vector<struct nm_obj *> * nm_get_cores()
+{
+    return &cores;
+}
+
+hwloc_obj_t get_parent_type(hwloc_obj_t obj, hwloc_obj_type_t type)
+{
+    while(obj != nullptr) {
+        if (obj->type == type) {
+            break;
+        }
+        obj = obj->parent;
+    }
+    return obj;
+}
+
+// 0 on success
+// -1 on error
+int nm_init()
+{
+    int ret;
+
+    hwloc_topology * topo;
+    if ((ret = hwloc_topology_init(&topo)) != 0) {
+        return ret;
+    }
+    
+    if ((ret = hwloc_topology_load(topo)) != 0)
+        return ret;
+    
+    // populate numa nodes
+    hwloc_obj_t obj = nullptr;
+    while(1) {
+        obj = hwloc_get_next_obj_by_type(topo, HWLOC_OBJ_PACKAGE, obj);
+        if (obj == nullptr) {
+            break;
+        }
+
+        struct nm_obj * each = new struct nm_obj;
+        each->id = obj->logical_index;
+        each->level = NM_LEVEL_NUMA;
+        each->parent = nullptr;
+        nodes.push_back(each);
+        printf("libnm: identified NUMA node %d\n", each->id);
+    }
+    std::sort(nodes.begin(), nodes.end(), nm_obj_comparator);
+    
+    // populate cpus
+    obj = nullptr;
+    while(1) {
+        obj = hwloc_get_next_obj_by_type(topo, HWLOC_OBJ_CORE, obj);
+        if (obj == nullptr) {
+            break;
+        }
+        struct nm_obj * each = new struct nm_obj;
+        each->id = obj->logical_index;
+        each->level = NM_LEVEL_CPU;
+        hwloc_obj_t parent = get_parent_type(obj, HWLOC_OBJ_PACKAGE);
+        if (parent == nullptr) {
+            return -1;
+        }
+        
+        // XXX: this faults if the OS decides to be stupid
+        each->parent = nodes.at(parent->logical_index);
+        each->parent->children.push_back(each);
+        cpus.push_back(each);
+        printf("libnm: identified CPU %d on NUMA node %d\n", each->id, each->parent->id);
+    }
+    std::sort(cpus.begin(), cpus.end(), nm_obj_comparator);
+
+    // populate cores
+    obj = nullptr;
+    while(1) {
+        obj = hwloc_get_next_obj_by_type(topo, HWLOC_OBJ_PU, obj);
+        if (obj == nullptr) {
+            break;
+        }
+        struct nm_obj * each = new struct nm_obj;
+        each->id = obj->logical_index;
+        each->level = NM_LEVEL_CORE;
+        hwloc_obj_t parent = get_parent_type(obj, HWLOC_OBJ_CORE);
+        if (parent == nullptr) {
+            return -1;
+        }
+        
+        // XXX: this faults if the OS decides to be stupid
+        each->parent = cpus.at(parent->logical_index);
+        each->parent->children.push_back(each);
+        cores.push_back(each);
+        printf("libnm: identified core %d on CPU %d, NUMA node %d\n", each->id, each->parent->id, each->parent->parent->id);
+    }
+    std::sort(cores.begin(), cores.end(), nm_obj_comparator);
+
+    return ret;
+}
--- a/libntr/ntr.c
+++ b/libntr/ntr.c
@ -0,0 +1,43 @@
+#include "ntr.h"
+
+static int ntr_log_levels[NTR_DEP_MAX] = {NTR_LEVEL_DEFAULT};
+static FILE * ntr_out;
+
+void ntr_init()
+{
+    ntr_out = stdout;
+}
+
+void ntr(int dep, int level, const char * fmt, ...)
+{
+    va_list vl;
+    va_start(vl, fmt);
+    if (dep < NTR_DEP_MAX && level <= ntr_log_levels[dep]) {
+        vfprintf(ntr_out, fmt, vl);
+    }
+    va_end(vl);
+}
+
+void ntr_set_level(int dep, int level)
+{
+    if (dep < NTR_DEP_MAX) {
+        ntr_log_levels[dep] = level;
+    }
+}
+
+
+void ntr_set_output(FILE * f)
+{
+    if (f != NULL) {
+        ntr_out = f;
+    }
+}
+
+
+int ntr_get_level(int dep)
+{
+    if (dep < NTR_DEP_MAX) {
+        return ntr_log_levels[dep];
+    }
+    return 0;
+}
--- a/rat/rat.cc
+++ b/rat/rat.cc
@ -0,0 +1,544 @@
+#include <cstdio>
+#include <ctime>
+#include <netinet/in.h>
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_eal.h>
+#include <rte_ethdev.h>
+#include <rte_cycles.h>
+#include <rte_lcore.h>
+#include <rte_mbuf.h>
+#include <rte_ether.h>
+#include <rte_launch.h>
+#include <rte_log.h>
+#include <rte_byteorder.h>
+#include <rte_ip.h>
+#include <atomic>
+#include <vector>
+#include <fstream>
+#include <unistd.h>
+
+#include "nm.h"
+#include "gen.h"
+#include "ntr.h"
+#include "pkt.h"
+#include "util.h"
+
+constexpr static unsigned int MBUF_MAX_COUNT = 16384;
+constexpr static unsigned int MBUF_CACHE_SIZE = 512;
+constexpr static unsigned int RX_RING_SIZE = 4096;
+constexpr static unsigned int TX_RING_SIZE = 4096;
+constexpr static unsigned int BURST_SIZE = 32;
+
+constexpr static unsigned int MODE_MASTER = 0;
+constexpr static unsigned int MODE_CLIENT = 1;
+
+static const struct rte_eth_conf port_conf_default{};
+
+struct datapt {
+    uint32_t epoch;
+    uint32_t valid;
+    uint64_t clt_hw_tx;
+    uint64_t clt_sw_tx;
+    uint64_t clt_hw_rx;
+    uint64_t clt_sw_rx;
+    uint64_t srv_hw_tx;
+    uint64_t srv_sw_tx;
+    uint64_t srv_hw_rx;
+    uint64_t srv_sw_rx;
+};
+
+struct thread_info {
+    unsigned int id;
+    unsigned int rxqid{0};
+    unsigned int txqid{0};
+    std::vector<struct datapt *> data;
+    struct datapt * last_datapt{nullptr};
+    unsigned int tot_send{0};
+    unsigned int tot_recv{0};
+    Generator * ia_gen;
+};
+
+struct options_t {
+    unsigned int run_time{5};
+    unsigned int warmup_time{0};
+    unsigned int num_threads{1};
+    unsigned int mode{MODE_MASTER};
+    char output[256] = "output.txt";
+    char ia_gen[256] = "fixed:1";
+    struct rte_ether_addr server_mac;
+    uint64_t cpu_mask;
+    // states
+    struct rte_mempool * mbuf_pool;
+    struct rte_ether_addr s_host_mac;
+    uint16_t s_portid;
+    std::vector<struct thread_info *> s_thr_info;
+    std::atomic<uint32_t> s_epoch;
+    std::atomic<bool> s_stop {false};
+    std::atomic<uint32_t> s_record {0};
+};
+
+static struct options_t options; 
+
+// static struct thread_info * get_thread_info(int qid)
+// {
+//     return options.s_thr_info.at(qid);
+// }
+
+static int
+locore_main(void * tif)
+{
+    struct thread_info * tinfo = (struct thread_info *)tif;
+    struct rte_mbuf *tx_buf;
+    struct rte_mbuf *rx_bufs[BURST_SIZE];
+    struct pkt_hdr *pkt_data;
+    uint32_t core_id = rte_lcore_id();
+    int32_t ret; 
+
+    bool read_tx = true;
+    bool recv_stat = true;
+    bool recv_resp = true;
+
+    uint64_t next_ts;
+    // XXX: check link status instead
+
+    sleep(1);
+    if (rte_eth_dev_socket_id(options.s_portid) > 0 && rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) {
+        ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "locore_main: WARNING, port %d is on remote NUMA node to "
+                "polling thread.\n\tPerformance will "
+                "not be optimal.\n", options.s_portid);
+    }
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d running thread %d...\n", core_id, tinfo->id);
+
+    next_ts = get_time_us();
+
+    while(!options.s_stop.load()) {
+        uint64_t now = get_time_us();
+        // always pop incoming packets
+        const uint16_t nb_rx = rte_eth_rx_burst(options.s_portid, 0, rx_bufs, BURST_SIZE);
+
+        if (nb_rx > 0) {
+            for (int i = 0; i < nb_rx; i++) {
+                struct pkt_hdr * each = check_valid_packet(rx_bufs[i]);
+                
+                if (each == NULL) {
+                    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: ignoring invalid packet %p.\n", (void*)rx_bufs[i]);
+                    rte_pktmbuf_free(rx_bufs[i]);
+                    continue;
+                }
+
+                uint16_t type = rte_be_to_cpu_16(each->type);
+                ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: received packet %p type %d.\n", (void*)rx_bufs[i], type);
+                switch (type) {
+                    struct pkt_payload_epoch * pld_epoch;
+                    struct pkt_payload_stat * pld_stat;
+                    uint32_t epoch;
+
+                    case PKT_TYPE_PROBE_RESP:
+                        pld_epoch = (struct pkt_payload_epoch *)each->payload;
+                        epoch = rte_be_to_cpu_32(pld_epoch->epoch);
+
+                        if (tinfo->last_datapt == nullptr || epoch != tinfo->last_datapt->epoch) {
+                            ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "locore_main: packet %p epoch %d doesn't match datapt %d.\n", (void*)rx_bufs[i], epoch, tinfo->last_datapt->epoch);
+                            break;
+                        }
+
+                        tinfo->tot_recv++;
+
+                        recv_resp = true;
+                        break;
+                    case PKT_TYPE_STAT:
+                        pld_stat = (struct pkt_payload_stat *)each->payload;
+                        epoch = rte_be_to_cpu_32(pld_stat->epoch);
+
+                        if (tinfo->last_datapt == nullptr || epoch != tinfo->last_datapt->epoch) {
+                            ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "locore_main: packet %p epoch %d doesn't match datapt %d.\n", (void*)rx_bufs[i], epoch, tinfo->last_datapt->epoch);
+                            break;
+                        }
+
+                        tinfo->last_datapt->srv_hw_tx = rte_be_to_cpu_64(pld_stat->hw_tx);
+                        tinfo->last_datapt->srv_hw_rx = rte_be_to_cpu_64(pld_stat->hw_rx);
+                        tinfo->last_datapt->srv_sw_tx = rte_be_to_cpu_64(pld_stat->sw_tx);
+                        tinfo->last_datapt->srv_sw_rx = rte_be_to_cpu_64(pld_stat->sw_rx);
+
+                        tinfo->tot_recv++;
+
+                        recv_stat = true;
+                        break;
+                    default:
+                        ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "locore_main: ignoring packet %p with unknown type %d.\n", (void*)rx_bufs[i], type);
+                        rte_pktmbuf_free(rx_bufs[i]);
+                        continue;
+                }
+
+                rte_pktmbuf_free(rx_bufs[i]);
+            }
+        }
+
+        if (read_tx && recv_stat & recv_resp) {
+            // if we have all the data
+
+            if (tinfo->last_datapt != nullptr) {
+                // push the data to the queue if we haven't done so already
+                tinfo->data.push_back(tinfo->last_datapt);
+
+                ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: datapt for epoch %d dump:\n" \
+                                                    "                    Valid: %d\n"
+                                                    "                    client TX HW: %llu\n" \
+                                                    "                    client TX SW: %llu\n" \
+                                                    "                    client RX HW: %llu\n" \
+                                                    "                    client RX SW: %llu\n" \
+                                                    "                    server TX HW: %llu\n" \
+                                                    "                    server TX SW: %llu\n" \
+                                                    "                    server RX HW: %llu\n" \
+                                                    "                    server RX SW: %llu\n\n",
+                                                    tinfo->last_datapt->epoch,
+                                                    tinfo->last_datapt->valid,
+                                                    tinfo->last_datapt->clt_hw_tx,
+                                                    tinfo->last_datapt->clt_sw_tx,
+                                                    tinfo->last_datapt->clt_hw_rx,
+                                                    tinfo->last_datapt->clt_sw_rx,
+                                                    tinfo->last_datapt->srv_hw_tx,
+                                                    tinfo->last_datapt->srv_sw_tx,
+                                                    tinfo->last_datapt->srv_hw_rx,
+                                                    tinfo->last_datapt->srv_sw_rx);
+                tinfo->last_datapt = nullptr;
+            }
+
+            if (now >= next_ts) {
+                struct pkt_payload_epoch * pld_epoch;
+                uint32_t epoch;
+
+                next_ts += (int)(tinfo->ia_gen->generate() * 1000000.0);
+
+                // generate the packet
+                tx_buf = rte_pktmbuf_alloc(options.mbuf_pool);
+
+                if (tx_buf == NULL) {
+                    rte_exit(EXIT_FAILURE, "cannot allocate tx_buf\n");
+                }
+
+                pkt_data = construct_pkt_hdr(tx_buf, PKT_TYPE_PROBE,
+                                                &options.s_host_mac, &options.server_mac);
+                if (pkt_data == NULL) {
+                    rte_exit(EXIT_FAILURE, "cannot allocate space for packet_data in mbuf\n");
+                }
+
+                epoch = options.s_epoch.fetch_add(1);
+                pld_epoch = (struct pkt_payload_epoch *)pkt_data->payload;
+                pld_epoch->epoch = rte_cpu_to_be_32(epoch);
+                tinfo->last_datapt = new struct datapt;
+                tinfo->last_datapt->epoch = epoch;
+                tinfo->last_datapt->valid = options.s_record.load();
+
+                read_tx = false;
+                recv_resp = false;
+                recv_stat = false;
+
+                ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: sending packet %p with epoch %d\n", (void*)tx_buf, epoch);
+                const uint16_t nb_tx = rte_eth_tx_burst(options.s_portid, tinfo->txqid, &tx_buf, 1);
+                
+                if (nb_tx != 1) {
+                    rte_exit(EXIT_FAILURE, "failed to send packet 0x%p, epoch %d\n", (void*)tx_buf, epoch);
+                }
+            }
+        }
+
+        if (!read_tx) {
+            struct timespec ts;
+            if ((ret = rte_eth_timesync_read_tx_timestamp(options.s_portid, &ts)) == 0) {
+                ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: read hw tx timestamp %lld.\n", ts.tv_nsec + ts.tv_sec * S2NS);
+                tinfo->last_datapt->clt_hw_tx = ts.tv_nsec + ts.tv_sec * S2NS;
+                read_tx = true;
+            }
+        }
+	}
+    
+    rte_pktmbuf_free(tx_buf);
+
+    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d successfully stopped.\n", core_id);
+
+    return 0;
+}
+
+static int 
+port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
+{
+    struct rte_eth_dev_info dev_info;
+    struct rte_eth_conf port_conf = port_conf_default;
+    struct rte_eth_txconf txconf;
+    struct rte_eth_rxconf rxconf;
+
+    uint16_t nb_rxd = RX_RING_SIZE;
+	uint16_t nb_txd = TX_RING_SIZE; 
+    port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN;
+
+    if(!rte_eth_dev_is_valid_port(portid)) {
+        return -1;
+    }
+
+    int ret = rte_eth_dev_info_get(portid, &dev_info);
+    if (ret != 0) {
+        return ret;
+    }
+
+    port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN;
+    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM;
+    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM;
+    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
+    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
+    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;
+    
+    /* Configure the Ethernet device. */
+    ret = rte_eth_dev_configure(portid, options.num_threads, options.num_threads, &port_conf);
+	if (ret != 0)
+		return ret;
+
+	ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
+	if (ret != 0)
+		return ret;
+
+	/* Allocate and set up 1 RX queue per thread . */
+    rxconf = dev_info.default_rxconf;
+    rxconf.offloads = port_conf.rxmode.offloads;
+	for (uint32_t i = 0; i < options.num_threads; i++) {
+		ret = rte_eth_rx_queue_setup(portid, i, nb_rxd, rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool);
+		if (ret < 0)
+			return ret;
+	}
+
+    txconf = dev_info.default_txconf;
+	txconf.offloads = port_conf.txmode.offloads;
+	/* Allocate and set up 1 TX queue per Ethernet port. */
+	for (uint32_t i = 0; i < options.num_threads; i++) {
+		ret = rte_eth_tx_queue_setup(portid, i, nb_txd, rte_eth_dev_socket_id(portid), &txconf);
+		if (ret < 0)
+			return ret;
+	}
+
+    ret = rte_eth_dev_start(portid);
+    if (ret < 0)
+        return ret;
+
+	/* Display the port MAC address. */
+    struct rte_ether_addr addr;
+    ret = rte_eth_macaddr_get(portid, &addr);
+    if (ret != 0)
+        return ret;
+
+    /* Enable RX in promiscuous mode for the Ethernet device. */
+    ret = rte_eth_promiscuous_enable(portid);
+	if (ret != 0)
+		return ret;
+
+	return 0;
+}
+
+static void dump_options()
+{
+    fprintf(stdout, "Configuration:\n" \
+            "    run time = %d\n" \
+            "    warmup time = %d\n" \
+            "    output file = %s\n" \
+            "    server MAC = %x:%x:%x:%x:%x:%x\n",
+            options.run_time,
+            options.warmup_time,
+            options.output,
+            options.server_mac.addr_bytes[0],
+            options.server_mac.addr_bytes[1],
+            options.server_mac.addr_bytes[2],
+            options.server_mac.addr_bytes[3],
+            options.server_mac.addr_bytes[4],
+            options.server_mac.addr_bytes[5]);
+}
+
+static void usage()
+{
+    fprintf(stdout, 
+            "Usage:\n " \
+            "    -v(vv): verbose mode\n" \
+            "    -h: display the information\n" \
+            "    -o: output filename\n" \
+            "    -t: run time\n" \
+            "    -T: warmup time\n" \
+            "    -s: server's mac\n" \
+            "    -A: affinity mask\n" \
+            "    -a: number of threads\n" \
+            "    -C: client mode\n"
+            "    -i: inter-arrival time distribution\n\n");
+}
+// static void int_handler(int)
+// {
+//     //rte_exit(EXIT_SUCCESS, "Caught SIGINT, exiting...\n");
+// }
+
+int main(int argc, char* argv[])
+{
+    unsigned int nb_ports;
+    struct rte_mempool *mbuf_pool;
+    std::ofstream log_file;
+    struct thread_info *tinfo;
+
+    ntr_init();
+    if (nm_init() != 0)
+        rte_exit(EXIT_FAILURE, "failed to init libnm\n");
+    // signal(SIGINT, int_handler);
+    
+    // init dpdk
+    int ret = rte_eal_init(argc, argv);
+    if (ret < 0) {
+        rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n");
+    }
+
+    argc -= ret;
+    argv += ret;
+
+    // set warning level
+    ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
+    {
+        int c;
+        // parse arguments
+        while((c = getopt(argc, argv, "hvo:t:T:s:A:a:Ci:")) != -1) {
+            switch (c) {
+                case 'v':
+                    ntr_set_level(NTR_DEP_USER1, ntr_get_level(NTR_DEP_USER1) + 1);
+                    break;
+                case 's':
+                    if (rte_ether_unformat_addr(optarg, &options.server_mac) == -1) {
+                        rte_exit(EXIT_FAILURE, "cannot parse %s as mac address.\n", optarg);
+                    }
+                    break;
+                case 't':
+                    options.run_time = atoi(optarg);
+                    break;
+                case 'T':
+                    options.warmup_time = atoi(optarg);
+                    break;
+                case 'h':
+                    usage();
+                    rte_exit(EXIT_SUCCESS, "success\n");
+                case 'o':
+                    strncpy(options.output, optarg, sizeof(options.output) - 1);
+                    break;
+                case 'A':
+                    options.cpu_mask = atoll(optarg);
+                    break;
+                case 'a':
+                    options.num_threads = atoi(optarg);
+                    break;
+                case 'C':
+                    options.mode = MODE_CLIENT;
+                    break;
+                case 'i':
+                    strncpy(options.ia_gen, optarg, sizeof(options.ia_gen) - 1);
+                    break;
+                default:
+                    usage();
+                    rte_exit(EXIT_FAILURE, "unknown argument: %c\n", c);
+                    break;
+            }
+        }
+    }
+
+    // open log file for writing
+    if (options.mode == MODE_MASTER) {
+        log_file.open(options.output, std::ofstream::out);
+        if (!log_file) {
+            rte_exit(EXIT_FAILURE, "failed to open log file %s\n", options.output);
+        }
+    }
+
+    nb_ports = rte_eth_dev_count_avail();
+    if (nb_ports == 0) {
+        rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
+    }
+
+    uint16_t portid = rte_eth_find_next(0);
+    if (portid == RTE_MAX_ETHPORTS) {
+        rte_exit(EXIT_FAILURE, "cannot find an available port\n");
+    }
+    options.s_portid = portid;
+
+
+    // create a mbuf memory pool on the socket
+    mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_eth_dev_socket_id(options.s_portid));
+    if (mbuf_pool == nullptr) {
+        rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
+    }
+    options.mbuf_pool = mbuf_pool;
+
+    for(int i = 0; i < 1; i++) {
+        tinfo = new thread_info;
+        tinfo->id = i;
+        tinfo->ia_gen = createGenerator(options.ia_gen);
+        options.s_thr_info.push_back(tinfo);
+    }
+
+    if (port_init(portid, mbuf_pool) != 0) {
+        rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid);
+    }
+
+    if (rte_eth_macaddr_get(portid, &options.s_host_mac) != 0) {
+        rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n", portid);
+    }
+
+    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid,
+                                                                                                options.s_host_mac.addr_bytes[0],
+                                                                                                options.s_host_mac.addr_bytes[1],
+                                                                                                options.s_host_mac.addr_bytes[2],
+                                                                                                options.s_host_mac.addr_bytes[3],
+                                                                                                options.s_host_mac.addr_bytes[4],
+                                                                                                options.s_host_mac.addr_bytes[5]);
+
+    dump_options();
+
+    sleep(1);
+
+    uint16_t core_id = rte_get_next_lcore(0, true, false);
+
+    if (rte_eal_remote_launch(locore_main, options.s_thr_info.at(0), core_id) != 0) {
+        rte_exit(EXIT_FAILURE, "failed to launch function on locore\n");
+    }
+
+    // poor man's timer
+    // XXX: use kqueue instead
+    struct timespec ts;
+    ts.tv_sec = 1;
+    ts.tv_nsec = 0;
+    uint32_t second = 0;
+    while(true) {
+        if (second >= options.warmup_time) {
+            options.s_record.store(1);
+        }
+        if (second >= options.run_time + options.warmup_time) {
+            options.s_stop.store(true);
+            break;
+        }
+        clock_nanosleep(CLOCK_REALTIME, 0, &ts, NULL);
+        second++;
+    }
+
+    if (rte_eal_wait_lcore(core_id) < 0)
+        rte_exit(EXIT_FAILURE, "failed to wait for job completion\n");
+
+    // dump stats
+    if (options.mode == MODE_MASTER) {
+        thread_info * master_thrd = options.s_thr_info.at(0);
+        for (auto it : master_thrd->data) {
+            if (it->valid) {
+                log_file << it->clt_sw_rx << ',' << it->clt_sw_tx << ','
+                         << it->clt_hw_rx << ',' << it->clt_hw_tx << ','
+                         << it->srv_sw_rx << ',' << it->srv_sw_tx << ','
+                         << it->srv_hw_rx << ',' << it->srv_hw_tx << std::endl;
+            }
+        }
+    }
+    log_file.close();
+
+    // clean up
+    rte_eth_dev_stop(portid);
+    rte_eth_dev_close(portid);
+
+    return 0;
+}
--- a/scripts/compile.sh
+++ b/scripts/compile.sh
@ -0,0 +1,38 @@
+#!/bin/sh
+test_dir="/numam.d"
+root=".."
+servers="skylake2.rcs.uwaterloo.ca skylake3.rcs.uwaterloo.ca" 
+rsync_flags="-vchr"
+ssh_args="-o StrictHostKeyChecking=no -p77"
+
+user=$1
+
+if [ -z $user ]
+then 
+    user=$(whoami)
+fi
+
+echo "USER: $user"
+
+compile() {
+    # separate these functions because we might change kernel (reboot) without needing to recompile
+    echo "====================$1===================="
+    echo "Syncing directories..."
+    ssh $(echo $ssh_args $user@$1) "sudo mkdir -p $test_dir"
+    ssh $(echo $ssh_args $user@$1) "sudo chmod 777 $test_dir"
+    rsync $(echo $rsync_flags) -e 'ssh -p 77' $root/ $user@$1:$test_dir/
+    echo "Compiling..."
+    ssh $(echo $ssh_args $user@$1) "mkdir -p $test_dir/build; cd $test_dir/build; cmake ../; make clean all -j8" &
+    wait
+    echo "$1 Done."
+    echo ""
+}
+
+i=0
+for server in $servers
+do
+	i=$(expr $i + 1)
+    compile "$server" &
+done
+
+wait
--- a/scripts/histo.py
+++ b/scripts/histo.py
@ -0,0 +1,83 @@
+	
+import pandas as pd
+import matplotlib.pyplot as plt
+import matplotlib.mlab as mlab
+import numpy as np
+import sys
+import re
+import os
+import json
+import getopt
+import math
+import concurrent.futures as CF
+import libpar as par
+
+num_bins = 100
+extra_pct = []
+
+def saveplot(fp : str, data : [], title : str):
+    plt.hist(data, num_bins)
+    plt.xlabel("Delay")
+    plt.title(title)
+    plt.ylabel("Frequency")
+    plt.title(os.path.basename(fp))
+    f = plt.gcf()
+    f.set_size_inches(11.69, 8.27)
+    f.savefig(fp + "_" + title + "_" + ".png", dpi=160)
+    plt.clf()
+    print("Generated - " + fp + "_" + title + "_" + ".png")
+
+executor = CF.ProcessPoolExecutor(max_workers=int(os.cpu_count()))
+
+def process_file(each_dir):
+    try:
+        print("Processing " + each_dir + " ...")
+        with open(each_dir, 'r') as f:
+            parser = par.khat_parser()
+            parser.parse(f.read())
+
+        sh = []
+        ss = []
+        ch = []
+        cs = []
+        for pt in parser.datapt:
+            sh.append(pt.s_htx - pt.s_hrx)
+            ss.append(pt.s_stx - pt.s_srx)
+            ch.append(pt.c_hrx - pt.c_htx)
+            cs.append(pt.c_srx - pt.c_stx)
+
+        saveplot(each_dir, sh, "server_hw_delay")
+        saveplot(each_dir, ss, "server_sw_delay")
+        saveplot(each_dir, ch, "client_hw_delay")
+        saveplot(each_dir, cs, "client_sw_delay")
+
+    except Exception:
+        print("Unexpected error:", sys.exc_info())
+
+def process_dir(rootdir):
+    for subdir in os.listdir(rootdir):
+        each_dir = os.path.join(rootdir, subdir)
+        if os.path.isfile(each_dir):
+            if each_dir.endswith("sample.txt") or each_dir.endswith(".sample"):
+                #executor.submit(process_file, each_dir)
+                process_file(each_dir)
+        else:
+            process_dir(each_dir)
+
+def main():    
+    datdir = None
+    options = getopt.getopt(sys.argv[1:], 'd:')[0]
+
+    for opt, arg in options:
+        if opt in ('-d'):
+            datdir = arg
+
+    if datdir == None:
+        datdir = "/home/oscar/projs/kqsched/scripts/pingpong/results.d/sample"
+        #raise Exception("Must specify -d parameter")
+
+    process_dir(datdir)
+    executor.shutdown()
+
+if __name__ == "__main__":
+    main()
--- a/scripts/libs/libpar.py
+++ b/scripts/libs/libpar.py
@ -0,0 +1,113 @@
+import json
+import numpy as np
+
+class khat_parser:
+    class pt:
+        def __init__(self):
+            self.s_htx = 0
+            self.s_hrx = 0
+            self.s_stx = 0
+            self.s_srx = 0
+            self.c_htx = 0
+            self.c_hrx = 0
+            self.c_stx = 0
+            self.c_srx = 0
+
+    def __init__(self):
+        self.datapt = []
+    
+    def parse(self, output : str):
+        for line in output.splitlines():
+            cells = line.split(',')
+            if len(cells) != 8:
+                raise Exception("Invalid line:" + line)
+            pt = self.pt()
+            pt.c_srx = int(cells[0])
+            pt.c_stx = int(cells[1])
+            pt.c_hrx = int(cells[2])
+            pt.c_htx = int(cells[3])
+            pt.s_srx = int(cells[4])
+            pt.s_stx = int(cells[5])
+            pt.s_hrx = int(cells[6])
+            pt.s_htx = int(cells[7])
+            self.datapt.append(pt)
+            
+
+class mutilate_data:
+    def __init__(self):
+        self.dat = {}
+        self.qps = 0
+
+    def to_string(self):
+        ret = "Throughput: " + str(self.qps) + "\n" + json.dumps(self.dat)
+        return ret
+
+    @staticmethod
+    def parse_mut_output(output):
+        ret = mutilate_data()
+        succ_qps = False
+        succ_read = False
+        table = [None, "avg", "std", "min", "5th", "10th", "50th", "90th", "95th", "99th"]
+        table_legacy = [None, "avg", "std", "min", "5th", "10th", "90th", "95th", "99th"]
+        for line in output.splitlines():
+            if line.find("Total QPS") != -1:
+                spl = line.split()
+                if len(spl) == 7:
+                    ret.qps = float(spl[3])
+                    succ_qps = True
+                else:
+                    break
+            elif line.find("read") != -1:
+                spl = line.split()
+                if len(spl) == 10:
+                    for i in range(1, len(spl)):
+                        ret.dat[table[i]] = float(spl[i])
+                    succ_read = True
+                elif len(spl) == 9:
+                    for i in range(1, len(spl)):
+                        ret.dat[table_legacy[i]] = float(spl[i])
+                    succ_read = True
+                else:
+                    break
+        
+        if not (succ_qps and succ_read):
+            raise Exception("Failed to parse data")
+
+        return ret
+
+    @staticmethod
+    def parse_mut_sample(fn):
+        f = open(fn, "r")
+        qps = []
+        lat = []
+        lines = f.readlines()
+        for line in lines:
+            entry = line.split()
+            if len(entry) != 2:
+                raise Exception("Unrecognized line: " + line)
+            qps.append(float(entry[0]))
+            lat.append(float(entry[1]))
+        f.close()
+        return qps, lat
+
+
+    # generate mutilate output format
+    @staticmethod
+    def build_mut_output(lat_arr, qps_arr):
+        output = '{0: <10}'.format('#type') + '{0: >10}'.format('avg') + '{0: >10}'.format('std') + \
+                        '{0: >10}'.format('min') + '{0: >10}'.format('5th') + '{0: >10}'.format('10th') + \
+                        '{0: >10}'.format('50th') + '{0: >10}'.format('90th')  + '{0: >10}'.format('95th') + '{0: >10}'.format('99th') + "\n"
+        
+        output += '{0: <10}'.format('read') + '{0: >10}'.format("{:.1f}".format(np.mean(lat_arr))) + ' ' + \
+                        '{0: >10}'.format("{:.1f}".format(np.std(lat_arr))) + ' ' + \
+                        '{0: >10}'.format("{:.1f}".format(np.min(lat_arr))) + ' ' + \
+                        '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 5))) + ' ' + \
+                        '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 10))) + ' ' + \
+                        '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 50))) + ' ' + \
+                        '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 90))) + ' ' + \
+                        '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 95))) + ' ' + \
+                        '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 99))) + ' ' + "\n" \
+
+        output += "\n" + "Total QPS = " + "{:.1f}".format(np.mean(qps_arr)) + " (0 / 0s)"
+
+        return output
--- a/scripts/libs/libtc.py
+++ b/scripts/libs/libtc.py
@ -0,0 +1,175 @@
+import subprocess as sp
+import time
+import select
+import os
+import pwd
+import sys
+import datetime
+import random
+import re
+from threading import Thread 
+
+tc_logfile = None
+	
+def log_print(info):
+	print(info)
+	if tc_logfile != None:
+		tc_logfile.write(info + "\n")
+		tc_logfile.flush()
+
+tc_output_dir=""
+tc_cur_test = ""
+tc_test_id = 0
+
+def init(odir = "./results.d/"):
+	global tc_output_dir
+	tc_output_dir = odir + "_" + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
+	tc_output_dir = os.path.expanduser(tc_output_dir)
+	os.system("mkdir -p " + tc_output_dir)
+	global tc_logfile
+	tc_logfile = open(tc_output_dir + "/log.txt", "w+")
+
+def begin(name):
+	global tc_test_id
+	global tc_cur_test
+	tc_cur_test = name
+	tc_test_id += 1
+	os.system("mkdir -p " + get_odir())
+	log_print("\n===== Test #" + str(tc_test_id) + " - " + tc_cur_test + " started =====")
+
+def end():
+	global tc_cur_test
+	log_print("\n===== Test #" + str(tc_test_id) + " - " + tc_cur_test + " completed =====")
+	tc_cur_test = None
+
+def get_odir():
+	return tc_output_dir + "/" + tc_cur_test
+
+SCHED_QUEUE = 1
+SCHED_CPU = 2
+SCHED_BEST = 4
+SCHED_FEAT_WS = 1
+def make_sched_flag(sched, args, feat = 0, fargs = 0):
+	return (sched & 0xFF) | (args & 0xFF) << 8 | (feat & 0xFF) << 16 | (fargs & 0xFF) << 24
+
+TUNE_RTSHARE = 2
+TUNE_TFREQ = 1
+def make_tune_flag(obj, val):
+	return (obj & 0xFFFF) | (val & 0xFFFF) << 16 
+
+def get_username():
+    return pwd.getpwuid( os.getuid() )[0]
+
+ssh_param = ""
+def set_ssh_param(para):
+	global ssh_param
+	ssh_param = para
+
+ssh_user = None
+def set_ssh_user(user):
+	global ssh_user
+	ssh_user = user
+
+def remote_exec(srv, cmd, blocking=True, check=True):
+	sub = []
+	for s in srv:
+		p = sp.Popen(["ssh " + ssh_param + " " + ((ssh_user + "@") if ssh_user != None else "") + s + " \"" + cmd +"\""], shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
+		sub.append(p)
+	
+	if blocking:
+		for p in sub:
+			p.wait()
+			if check and p.returncode != 0:
+				raise Exception("Command failed " + cmd)
+
+	return sub
+
+
+def scan_stderr(p, exclude = None):
+	for err in p.stderr:
+		fail = True
+		err = err.decode()
+		err = err.strip()
+
+#		print(err)
+
+		if len(err) == 0:
+			continue
+		
+		if exclude != None:
+			for exc in exclude:
+				if (exc != None) and (re.match(exc, err) != None):
+					fail = False
+					break
+		
+		if fail:
+			log_print("Error detected: " + err)
+			return False
+			
+	return True
+
+# stderr threads
+errthr_objs = []
+errthr_sigstop = False
+errthr_failed = False
+
+def errthr_get_failed():
+	return errthr_failed
+
+def thr_check_stderr(p : sp.Popen, exclude):
+#	print("thread start!")
+	global errthr_failed
+	while(not errthr_sigstop):
+		if not scan_stderr(p, exclude=exclude):
+			errthr_failed = True
+#		print("running!")
+		time.sleep(0.5 + random.uniform(-0.1, 0.1))
+#	print("thread exit!")
+
+def errthr_start():
+	global errthr_sigstop
+	global errthr_failed
+	errthr_sigstop = False
+	errthr_failed = False
+	for thr in errthr_objs:
+		thr.start()
+
+def errthr_create(cp, exclude = None):
+	global errthr_objs
+	for p in cp:
+		errthr_objs.append(Thread(target = thr_check_stderr, args=(p, exclude)))
+
+def errthr_stop():
+	global errthr_objs
+	global errthr_sigstop
+	errthr_sigstop = True
+#	print("waiting!")
+	for thr in errthr_objs:
+		thr.join()
+	errthr_objs.clear()
+
+def parse_hostfile(fp):
+	ret = {}
+	fh = open(fp, "r")
+	content = fh.readlines()
+	fh.close()
+	content = [x.strip() for x in content]
+	for line in content:
+		spl = line.split(" ")
+		if len(spl) >= 2:
+			ret[spl[0]] = spl[1]
+			log_print("Parsed: hostname \"" + spl[0] + "\" -> \"" + spl[1] + "\"")
+	return ret
+
+def process_hostnames(names, hosts):
+	ret = []
+	for line in names:
+		if line in hosts:
+			ret.append(hosts[line])
+		else:
+			ret.append(line)
+	return ret
+
+def get_cpuset_core(threads):
+	ret = "cpuset -l 0-" + str(threads * 2 - 1) + " "
+	return ret
--- a/scripts/run.py
+++ b/scripts/run.py
@ -0,0 +1,225 @@
+import subprocess as sp
+import time
+import select
+import os
+import datetime
+import pwd
+import sys
+import getopt
+import numpy as np
+import re
+
+import libpar as par
+import libtc as tc
+
+step_inc_pct = 100
+init_step = 20000 #
+start_step = 10000
+term_qps = 85000000000
+
+term_pct = 1
+inc_pct = 50
+server_port = 23444
+
+# paths
+test_dir = "/numam.d/build"
+file_dir = os.path.dirname(os.path.realpath(__file__))
+root_dir = os.path.join(file_dir,"..")
+sample_filename = "sample.txt"
+
+affinity = [
+	"0x4", # core 2
+    "0x1000" # core 12
+]
+
+master = ["skylake3.rcs.uwaterloo.ca"]
+master_mac = ["3c:15:fb:c9:f3:4b"]
+
+server = ["skylake2.rcs.uwaterloo.ca"]
+server_mac = ["3c:15:fb:c9:f3:36"]
+
+clients = []
+client_mac = []
+
+rage_quit = 1000 #1s
+warmup = 5
+duration = 10
+cooldown = 0
+cacheline = 0
+SSH_PARAM = "-o StrictHostKeyChecking=no -p77"
+SSH_USER = "oscar"
+
+hostfile = None
+lockstat = False
+client_only = False
+
+def stop_all():
+	# stop clients
+	tc.log_print("Stopping clients...")
+	tc.remote_exec(clients, "sudo killall -9 rat", check=False)
+
+	if not client_only:
+		# stop server
+		tc.log_print("Stopping server...")
+		tc.remote_exec(server, "sudo killall -9 khat", check=False)
+
+		# stop master
+		tc.log_print("Stopping master...")
+		tc.remote_exec(master, "sudo killall -9 cat", check=False)
+
+def get_client_str(clt):
+	ret = " "
+	for client in clt:
+		ret += " -a " + client + " "
+	return ret
+
+def run_exp(sc, ld):
+	while True:
+		if client_only:
+			ssrv = None
+		else:
+			# start server
+			tc.log_print("Starting server...")
+			server_cmd = "sudo " + test_dir + "/khat -- -A " + sc
+			tc.log_print(server_cmd)
+
+			ssrv = tc.remote_exec(server, server_cmd, blocking=False)
+
+		# start clients
+		# tc.log_print("Starting clients...")
+		# client_cmd = tc.get_cpuset_core(client_threads) + " " + test_dir + "/pingpong/build/dismember -A"
+		# tc.log_print(client_cmd)
+		# sclt = tc.remote_exec(ssh_clients, client_cmd, blocking=False)
+
+		time.sleep(3)
+		# start master
+		tc.log_print("Starting master...")
+		master_cmd = "sudo " + test_dir + "/cat -- " + \
+							  " -s " + server_mac[0] + \
+							  " -o " + test_dir + "/" + sample_filename + \
+							  " -t " + str(duration) + \
+                              " -T " + str(warmup) + \
+                              " -i fixed:0.01" + \
+							  " -r " + str(rage_quit) + \
+                              " -A 0x4" 
+
+		tc.log_print(master_cmd)
+		sp = tc.remote_exec(master, master_cmd, blocking=False)
+		p = sp[0]
+
+
+		# launch stderr monitoring thread
+		tc.errthr_create(sp, exclude=[".*EAL.*"])
+		tc.errthr_create(ssrv, exclude=[".*EAL.*"])
+		tc.errthr_start()
+		success = False
+		cur = 0
+		while True:
+			# either failed or timeout
+			# we use failure detection to save time for long durations
+			if tc.errthr_get_failed() or cur >= int(warmup + duration) + 5 :
+				break
+			
+			if p.poll() != None:
+				success = True
+				break
+
+			time.sleep(1)
+			cur = cur + 1
+
+		stop_all()
+		tc.errthr_stop()
+		print("Cooling down...")
+		time.sleep(cooldown)
+			
+		if success:
+			return
+
+def keep_results():
+	scpcmd = "scp -P77 oscar@" + master[0] + ":" + test_dir + "/" + sample_filename + " " + tc.get_odir() + "/sample.txt"
+	tc.log_print(scpcmd)
+	sp.check_call(scpcmd, shell=True)
+
+	with open(tc.get_odir() + "/sample.txt", 'r') as f:
+		tc.log_print("Total requests: " + str(len(f.readlines())))
+
+	return
+
+def main():
+	global hostfile
+	global server
+	global master
+	global clients
+	global client_only
+
+	tc.set_ssh_param(SSH_PARAM)
+	tc.set_ssh_user(SSH_USER)
+
+	options = getopt.getopt(sys.argv[1:], 'h:sldcp')[0]
+	for opt, arg in options:
+		if opt in ('-h'):
+			hostfile = arg
+		elif opt in ('-s'):
+			stop_all()
+			return
+		elif opt in ('-c'):
+			client_only=True
+
+	tc.init("~/results.d/numam/")
+
+	tc.log_print("Configuration:\n" + \
+		  "Hostfile: " + ("None" if hostfile == None else hostfile) + "\n" \
+		  "Client only: " + str(client_only) + "\n")
+
+	if hostfile != None:
+		hosts = tc.parse_hostfile(hostfile)
+		server = tc.process_hostnames(server, hosts)
+		clients = tc.process_hostnames(clients, hosts)
+		master = tc.process_hostnames(master, hosts)
+
+	stop_all()
+
+	for i in range(0, len(affinity)):
+		eaff = affinity[i]
+		# step_mul = 100
+		# last_load = 0
+		# cur_load = start_step
+
+		tc.begin(eaff)
+
+		tc.log_print("============ Affinity: " + str(eaff) + " Load: MAX" + " ============")
+		run_exp(eaff, 0)
+		keep_results()
+		stop_all()
+		
+		# while True:
+		# 	tc.log_print("============ Sched: " + str(ename) + " Flag: " + format(esched, '#04x') + " Load: " + str(cur_load) + " ============")
+
+		# 	output, sout, serr = run_exp(esched, cur_load, lockstat)
+
+		# 	qps = keep_results(output, sout, serr)
+			
+		# 	pct = int((qps - last_load) / init_step * 100)
+		# 	tc.log_print("last_load: " + str(last_load) + " this_load: " + str(qps) + " inc_pct: " + str(pct) + "%")
+
+		# 	if cur_load > term_qps:
+		# 		tc.log_print("qps more than " + str(term_qps) + "%. Done.")
+		# 		break
+
+		# 	if pct <= term_pct:
+		# 		tc.log_print("inc_pct less than TERM_PCT " + str(term_pct) + "%. Done.")
+		# 		break
+
+		# 	if pct <= inc_pct:
+		# 		step_mul += step_inc_pct
+		# 		tc.log_print("inc_pct less than INC_PCT " + str(inc_pct) + "%. Increasing step multiplier to " + str(step_mul) + "%")
+
+		# 	last_load = qps
+		# 	cur_load += int(init_step * step_mul / 100)
+		# 	tc.log_print("")
+		
+		tc.end()
+
+	stop_all()
+
+main()
Author	SHA1	Message	Date
quackerd	a1f69bb3f8	change run.py defaults	2021-02-02 00:46:18 -05:00
quackerd	7b4fc6f3ab	+histogram	2021-02-02 00:43:45 -05:00
quackerd	1ec01d6c37	fix redundant global	2021-02-01 23:49:30 -05:00
quackerd	ff4946a699	run script and robust stderr detection	2021-02-01 23:48:14 -05:00
quackerd	f2be62a9be	cat refactor + rat reborn + statskeeping	2021-01-31 02:50:58 -05:00
quackerd	226449100d	NUMA detection & server multicore	2021-01-28 05:24:59 -05:00
quackerd	82e1098f3b	ptp working Summary: +arc stuff khat timestamp protocol working Test Plan: by hand Reviewers: ali Differential Revision: https://review.rcs.uwaterloo.ca/D408	2021-01-27 03:58:12 -05:00
quackerd	855b9cf714	stuff	2021-01-25 16:30:22 -05:00
quackerd	73c70a5c52	+arc	2021-01-18 14:06:10 -05:00
Oscar Zhao	0500dc1c21	ptp working	2021-01-18 12:19:13 -05:00