Compare commits

...

50 Commits
dev ... master

Author SHA1 Message Date
3320852dd5 sandybridge doesn't support clflushopt 2023-12-06 04:22:46 +08:00
76a41666a0 fix dpdk 2023-12-06 03:38:32 +08:00
b57fe6e5ea akh morn 2023-12-06 03:23:00 +08:00
oscar
fc687426ae stuff 2023-05-01 15:28:51 -04:00
quackerd
aba80e8869 stuff 2023-05-01 21:18:34 +02:00
quackerd
1a90104d53 minor fix 2023-03-29 22:00:59 +02:00
quackerd
59b8c36ced multiarch 2023-03-17 21:13:05 +01:00
quackerd
4effb3f1bd multiarch 2023-03-16 09:43:34 +01:00
oscar
bb9792cf06 memloadgen allocate memory in thread 2023-03-15 19:44:46 -04:00
oscar
a385866002 memloadgen allocate memory in thread 2023-03-15 19:10:52 -04:00
oscar
7e4fd3d721 memloadgen allocate memory in thread 2023-03-15 19:07:36 -04:00
oscar
05965dbb94 memloadgen allocate memory in thread 2023-03-15 18:43:37 -04:00
quackerd
25c18b4fc5 stdin based pct control 2023-03-05 16:48:54 +01:00
quackerd
28d469e8ff better printing 2023-03-05 15:59:42 +01:00
quackerd
6cd0e7d12f add signal control 2023-03-05 15:58:06 +01:00
quackerd
521a49d945 add magic number 2023-03-05 15:15:13 +01:00
quackerd
a9cac61069 cleanup and stuff 2023-01-04 17:25:32 +01:00
quackerd
f20ae16e31 temp commit 2022-12-14 20:52:12 +01:00
quackerd
2a543d7e4d iperf 2022-11-30 20:37:51 +01:00
quackerd
a3b7b7db5d iperf 2022-11-26 00:08:26 +01:00
quackerd
5e76edab89 useless but useful check 2022-11-24 10:11:14 +01:00
oscar
d0c7329f9f iperf 2022-11-23 20:05:48 -05:00
quackerd
4ff2de5d1e dpdk refactor 2022-11-22 16:27:27 +01:00
quackerd
933e9708f3 refactor iperf conf to human readable 2022-11-22 13:58:33 +01:00
quackerd
e85928e3f5 iperf script change 2022-11-21 22:52:13 +01:00
quackerd
df880a453c new scripts 2022-11-18 09:27:04 +01:00
oscar
b5be9c38fe memloadgen 2022-11-16 15:37:39 -05:00
quackerd
18339fb109 memloadgen pct support 2022-11-16 08:44:43 +01:00
quackerd
1836bd89df memloadgen rate control 2022-11-11 22:11:50 +01:00
quackerd
075902ba1d add break 2022-11-01 11:27:34 +01:00
quackerd
68b621fd3c snapshot memloadgen transaction change 2022-11-01 11:01:23 +01:00
565dbca278 latest dpdk & refactoring 2022-06-22 23:40:48 +08:00
a716583b19 update various components for new machines 2022-05-25 06:55:01 -04:00
d217bde46a bug fix 2022-03-29 00:50:10 +08:00
6e7e152915 posix support 2022-03-29 00:47:46 +08:00
0d26960686 nvme support 2022-03-21 23:01:24 +08:00
186150ca00 fixed hardcoded exit 2022-03-21 19:45:42 +08:00
27c6cd188d device driver abstraction 2022-03-21 19:43:49 +08:00
2ecfacff11 spdk 2022-03-20 22:17:26 +08:00
0dc463ba35 memload generator 2022-02-21 21:41:40 +08:00
997587c519 temp save 2021-03-17 21:45:01 -04:00
cd4785f08a add mem region support for nm malloc 2021-03-04 02:25:34 -05:00
4d50e55e1e +fix workload gen 2021-03-04 01:54:13 -05:00
7fd7c7f776 +libnm refactor and numa allocator support.
+khat threads now have numa-local memory.
2021-03-03 22:22:06 -05:00
b85777e6f0 +stuff? 2021-02-23 13:12:27 -05:00
162d41a4cc + cat packet loss control and max packet loss tolerance \ + output and parse packet loss for master and slaves 2021-02-22 06:54:53 -05:00
1fd9be7f13
+ packet loss control & + packet depth control 2021-02-21 05:16:39 -05:00
d1e43dcf2f
+Bench scripts 2021-02-20 04:53:55 -05:00
06b93ddf1c memload gen
Summary: Add memload generator

Test Plan: by hand

Reviewers: ali

Differential Revision: https://review.rcs.uwaterloo.ca/D415
2021-02-16 05:15:11 -05:00
f655e5f5cb Initial commit of benchmarks
Summary:
+ UDP and PTP over UDP & hw timestamping
+ Khat protocol
+ Rat protocol
+ Nanosecond timestamping
+ Load generation
+ NUMA detection library
+ Test scripts
+ Server & Client multi threading & tx/rx queues
+ RSS on all packets w/ randomized L4 ports

Test Plan: by hand

Reviewers: ali

Reviewed By: ali

Differential Revision: https://review.rcs.uwaterloo.ca/D408
2021-02-10 14:12:47 -05:00
55 changed files with 9183 additions and 1226 deletions

3
.arcconfig Normal file
View File

@ -0,0 +1,3 @@
{
"phabricator.uri" : "https://review.rcs.uwaterloo.ca/"
}

198
.clang-format Normal file
View File

@ -0,0 +1,198 @@
# $FreeBSD$
# Basic .clang-format
---
BasedOnStyle: WebKit
AlignAfterOpenBracket: DontAlign
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: false
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: InlineOnly
AllowShortIfStatementsOnASingleLine: Never
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: TopLevelDefinitions
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: MultiLine
BinPackArguments: true
BinPackParameters: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: WebKit
BreakBeforeTernaryOperators: false
# TODO: BreakStringLiterals can cause very strange formatting so turn it off?
BreakStringLiterals: false
# Prefer:
# some_var = function(arg1,
# arg2)
# over:
# some_var =
# function(arg1, arg2)
PenaltyBreakAssignment: 100
# Prefer:
# some_long_function(arg1, arg2
# arg3)
# over:
# some_long_function(
# arg1, arg2, arg3)
PenaltyBreakBeforeFirstCallParameter: 100
CompactNamespaces: true
DerivePointerAlignment: false
DisableFormat: false
ForEachMacros:
- ARB_ARRFOREACH
- ARB_ARRFOREACH_REVWCOND
- ARB_ARRFOREACH_REVERSE
- ARB_FOREACH
- ARB_FOREACH_FROM
- ARB_FOREACH_SAFE
- ARB_FOREACH_REVERSE
- ARB_FOREACH_REVERSE_FROM
- ARB_FOREACH_REVERSE_SAFE
- BIT_FOREACH_ISCLR
- BIT_FOREACH_ISSET
- CPU_FOREACH
- CPU_FOREACH_ISCLR
- CPU_FOREACH_ISSET
- FOREACH_THREAD_IN_PROC
- FOREACH_PROC_IN_SYSTEM
- FOREACH_PRISON_CHILD
- FOREACH_PRISON_DESCENDANT
- FOREACH_PRISON_DESCENDANT_LOCKED
- FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL
- MNT_VNODE_FOREACH_ALL
- MNT_VNODE_FOREACH_ACTIVE
- RB_FOREACH
- RB_FOREACH_FROM
- RB_FOREACH_SAFE
- RB_FOREACH_REVERSE
- RB_FOREACH_REVERSE_FROM
- RB_FOREACH_REVERSE_SAFE
- SLIST_FOREACH
- SLIST_FOREACH_FROM
- SLIST_FOREACH_FROM_SAFE
- SLIST_FOREACH_SAFE
- SLIST_FOREACH_PREVPTR
- SPLAY_FOREACH
- LIST_FOREACH
- LIST_FOREACH_FROM
- LIST_FOREACH_FROM_SAFE
- LIST_FOREACH_SAFE
- STAILQ_FOREACH
- STAILQ_FOREACH_FROM
- STAILQ_FOREACH_FROM_SAFE
- STAILQ_FOREACH_SAFE
- TAILQ_FOREACH
- TAILQ_FOREACH_FROM
- TAILQ_FOREACH_FROM_SAFE
- TAILQ_FOREACH_REVERSE
- TAILQ_FOREACH_REVERSE_FROM
- TAILQ_FOREACH_REVERSE_FROM_SAFE
- TAILQ_FOREACH_REVERSE_SAFE
- TAILQ_FOREACH_SAFE
- VM_MAP_ENTRY_FOREACH
- VM_PAGE_DUMP_FOREACH
IndentCaseLabels: false
IndentPPDirectives: None
Language: Cpp
NamespaceIndentation: None
PointerAlignment: Right
ContinuationIndentWidth: 4
IndentWidth: 8
TabWidth: 8
ColumnLimit: 80
UseTab: Always
SpaceAfterCStyleCast: false
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^\"opt_.*\.h\"'
Priority: 1
SortPriority: 10
- Regex: '^<sys/cdefs\.h>'
Priority: 2
SortPriority: 20
- Regex: '^<sys/types\.h>'
Priority: 2
SortPriority: 21
- Regex: '^<sys/param\.h>'
Priority: 2
SortPriority: 22
- Regex: '^<sys/systm\.h>'
Priority: 2
SortPriority: 23
- Regex: '^<sys.*/'
Priority: 2
SortPriority: 24
- Regex: '^<vm/vm\.h>'
Priority: 3
SortPriority: 30
- Regex: '^<vm/'
Priority: 3
SortPriority: 31
- Regex: '^<machine/'
Priority: 4
SortPriority: 40
- Regex: '^<(x86|amd64|i386|xen)/'
Priority: 5
SortPriority: 50
- Regex: '^<dev/'
Priority: 6
SortPriority: 60
- Regex: '^<net.*/'
Priority: 7
SortPriority: 70
- Regex: '^<protocols/'
Priority: 7
SortPriority: 71
- Regex: '^<(fs|nfs(|client|server)|ufs)/'
Priority: 8
SortPriority: 80
- Regex: '^<[^/].*\.h'
Priority: 9
SortPriority: 90
- Regex: '^\".*\.h\"'
Priority: 10
SortPriority: 100
# LLVM's header include ordering style is almost the exact opposite of ours.
# Unfortunately, they have hard-coded their preferences into clang-format.
# Clobbering this regular expression to avoid matching prevents non-system
# headers from being forcibly moved to the top of the include list.
# http://llvm.org/docs/CodingStandards.html#include-style
IncludeIsMainRegex: 'BLAH_DONT_MATCH_ANYTHING'
SortIncludes: true
KeepEmptyLinesAtTheStartOfBlocks: true
TypenameMacros:
- ARB_ELMTYPE
- ARB_HEAD
- ARB8_HEAD
- ARB16_HEAD
- ARB32_HEAD
- ARB_ENTRY
- ARB8_ENTRY
- ARB16_ENTRY
- ARB32_ENTRY
- LIST_CLASS_ENTRY
- LIST_CLASS_HEAD
- LIST_ENTRY
- LIST_HEAD
- QUEUE_TYPEOF
- RB_ENTRY
- RB_HEAD
- SLIST_CLASS_HEAD
- SLIST_CLASS_ENTRY
- SLIST_HEAD
- SLIST_ENTRY
- SMR_POINTER
- SPLAY_ENTRY
- SPLAY_HEAD
- STAILQ_CLASS_ENTRY
- STAILQ_CLASS_HEAD
- STAILQ_ENTRY
- STAILQ_HEAD
- TAILQ_CLASS_ENTRY
- TAILQ_CLASS_HEAD
- TAILQ_ENTRY
- TAILQ_HEAD

1
.clang-tidy Normal file
View File

@ -0,0 +1 @@
Checks: "-*,clang-diagnostic-*,clang-analyzer-*,modernize*,performance*,-modernize-use-trailing-return-type,-modernize-avoid-c-arrays"

274
.gitignore vendored Normal file
View File

@ -0,0 +1,274 @@
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
################ C STUFF ##########################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
# Prerequisites
*.d
# Object files
*.o
*.ko
*.obj
*.elf
# Linker output
*.ilk
*.map
*.exp
# Precompiled Headers
*.gch
*.pch
# Libraries
*.lib
*.a
*.la
*.lo
# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib
# Executables
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex
# Debug files
*.dSYM/
*.su
*.idb
*.pdb
# Kernel Module Compile Results
*.mod*
*.cmd
.tmp_versions/
modules.order
Module.symvers
Mkfile.old
dkms.conf
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
################ PYTHON STUFF ##########################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
################ C++ STUFF ##########################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
*.clangd
compile_commands.json

0
.gitmodules vendored Normal file
View File

View File

@ -1,33 +1,86 @@
cmake_minimum_required(VERSION 3.0) cmake_minimum_required(VERSION 3.0)
find_program(CC_GCC gcc)
find_program(CXX_GCC g++)
set(CMAKE_C_COMPILER ${CC_GCC})
set(CMAKE_CXX_COMPILER ${CXX_GCC})
project(khat) project(khat)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}") list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}")
find_package(dpdk REQUIRED) find_package(PkgConfig REQUIRED)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY bin)
pkg_check_modules(DPDK libdpdk)
pkg_check_modules(SPDK spdk_event_bdev spdk_env_dpdk)
pkg_check_modules(SPDK_SYS spdk_syslibs)
pkg_check_modules(UUID uuid)
pkg_check_modules(TOPO bsdtopo)
set(CC_FLAGS -O2 -g -Wall -Wextra -Werror -std=c++11 set(CC_FLAGS -O2 -g -Wall -Wextra -Werror -std=c++11
-Wno-deprecated-declarations -Wno-deprecated-declarations
-Wno-packed-not-aligned
-Wno-address-of-packed-member -Wno-address-of-packed-member
-msse4) -Wno-zero-length-array
-Wno-gnu-zero-variadic-macro-arguments
-march=native)
set(C_FLAGS -O2 -g -Wall -Wextra -Werror -std=c2x
-Wno-deprecated-declarations
-Wno-address-of-packed-member
-Wno-zero-length-array
-Wno-gnu-zero-variadic-macro-arguments
-march=native)
include_directories(${CMAKE_SOURCE_DIR}/inc) include_directories(${CMAKE_SOURCE_DIR}/inc)
include_directories(${dpdk_INCLUDE_DIRS}) include_directories()
add_executable(khat khat/khat.cc) set(LIBNTR_C_FLAGS -O3 -g -Wall -Wextra -Werror -std=c2x)
add_executable(cat cat/cat.cc) set(LIBGEN_CC_FLAGS -O3 -g -Wall -Wextra -Werror -std=c++17)
set(LINK_LIBS ${dpdk_LIBRARIES} pthread) add_library(ntr SHARED libntr/ntr.c)
target_compile_options(ntr PRIVATE ${LIBNTR_C_FLAGS})
target_link_libraries(khat ${LINK_LIBS}) add_library(gen SHARED libgen/generator.cc libgen/loadgen.cc)
target_compile_options(khat PRIVATE ${CC_FLAGS}) target_link_libraries(gen PRIVATE pthread ntr ${TOPO_LINK_LIBRARIES} nms)
target_compile_options(gen PRIVATE ${LIBGEN_CC_FLAGS} ${TOPO_CFLAGS})
target_link_libraries(cat ${LINK_LIBS}) add_library(netsup SHARED net/libnetsup/dpdk.cc net/libnetsup/portconf.cc)
target_compile_options(cat PRIVATE ${CC_FLAGS}) target_link_libraries(netsup PRIVATE ntr ${DPDK_LINK_LIBRARIES})
target_compile_options(netsup PRIVATE ${LIBGEN_CC_FLAGS} ${DPDK_CFLAGS})
add_library(nms SHARED libnms/alloc.c)
target_link_libraries(nms PRIVATE ${TOPO_LINK_LIBRARIES})
target_compile_options(nms PRIVATE ${TOPO_CFLAGS})
add_executable(khat EXCLUDE_FROM_ALL net/khat.cc)
target_link_libraries(khat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES})
target_compile_options(khat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS})
add_executable(cat EXCLUDE_FROM_ALL net/cat.cc)
target_link_libraries(cat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES})
target_compile_options(cat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS})
add_executable(rat EXCLUDE_FROM_ALL net/rat.cc)
target_link_libraries(rat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES})
target_compile_options(rat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS})
add_executable(birb EXCLUDE_FROM_ALL storage/birb.cc storage/io_gen.cc storage/drivers/bdev.cc storage/drivers/bdev_thread.cc storage/drivers/nvme.cc storage/drivers/nvme_thread.cc)
target_include_directories(birb PRIVATE ${SPDK_INCLUDE_DIRS} ${DPDK_INCLUDE_DIRS} ${UUID_INCLUDE_DIRS})
target_compile_options(birb PRIVATE ${CC_FLAGS} ${SPDK_CFLAGS} ${UUID_CFLAGS})
target_link_directories(birb PRIVATE ${SPDK_LIBRARY_DIRS} ${SPDK_SYS_STATIC_LIBRARY_DIRS} ${UUID_LIBRARY_DIRS})
target_link_libraries(birb PRIVATE pthread ntr gen -Wl,--whole-archive ${SPDK_LIBRARIES} -Wl,--no-whole-archive ${SPDK_SYS_STATIC_LIBRARIES})
add_executable(birb_posix EXCLUDE_FROM_ALL storage/birb_posix.cc storage/io_gen.cc)
target_compile_options(birb_posix PRIVATE ${CC_FLAGS})
target_link_libraries(birb_posix PRIVATE pthread ntr gen)
add_executable(memloadgen util/memloadgen.cc)
target_link_libraries(memloadgen PRIVATE pthread gen ntr nms ${TOPO_LINK_LIBRARIES})
target_compile_options(memloadgen PRIVATE ${CC_FLAGS} ${TOPO_CFLAGS})
add_executable(mornafah util/mornafah.c)
target_link_libraries(mornafah PRIVATE pthread gen ntr nms ${TOPO_LINK_LIBRARIES})
target_compile_options(mornafah PRIVATE ${C_FLAGS} ${TOPO_CFLAGS})
add_executable(nms_test tests/nms_test.c)
set_target_properties(nms_test PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tests)
target_link_libraries(nms_test PRIVATE nms)
target_compile_options(nms_test PRIVATE ${C_FLAGS})

View File

@ -1,142 +0,0 @@
# Try to find dpdk
#
# Once done, this will define
#
# dpdk::dpdk
# dpdk_FOUND
# dpdk_INCLUDE_DIR
# dpdk_LIBRARIES
find_package(PkgConfig QUIET)
if(PKG_CONFIG_FOUND)
pkg_check_modules(dpdk QUIET libdpdk)
endif()
if(dpdk_INCLUDE_DIRS)
# good
elseif(TARGET dpdk::dpdk)
get_target_property(dpdk_INCLUDE_DIRS
dpdk::dpdk INTERFACE_INCLUDE_DIRECTORIES)
else()
find_path(dpdk_config_INCLUDE_DIR rte_config.h
HINTS
ENV DPDK_DIR
PATH_SUFFIXES
dpdk
include)
# Locate rte_common.h. Keyword must be ENV (was misspelled "ENC", which
# made find_path ignore the $DPDK_DIR environment-variable hint).
find_path(dpdk_common_INCLUDE_DIR rte_common.h
  HINTS
    ENV DPDK_DIR
  PATH_SUFFIXES
    dpdk
    include)
set(dpdk_INCLUDE_DIRS "${dpdk_config_INCLUDE_DIR}")
if(NOT dpdk_config_INCLUDE_DIR EQUAL dpdk_common_INCLUDE_DIR)
list(APPEND dpdk_INCLUDE_DIRS "${dpdk_common_INCLUDE_DIR}")
endif()
endif()
set(components
bus_pci
bus_vdev
cfgfile
cmdline
eal
ethdev
hash
kvargs
mbuf
mempool
mempool_ring
mempool_stack
net
pci
pmd_af_packet
pmd_bnxt
pmd_bond
pmd_cxgbe
pmd_e1000
pmd_ena
pmd_enic
pmd_i40e
pmd_ixgbe
pmd_mlx5
pmd_nfp
pmd_qede
pmd_ring
pmd_sfc_efx
pmd_vmxnet3_uio
ring
timer)
# for collecting dpdk library targets, it will be used when defining dpdk::dpdk
set(_dpdk_libs)
# for list of dpdk library archive paths
set(dpdk_LIBRARIES)
foreach(c ${components})
set(dpdk_lib dpdk::${c})
if(TARGET ${dpdk_lib})
get_target_property(DPDK_rte_${c}_LIBRARY
${dpdk_lib} IMPORTED_LOCATION)
else()
find_library(DPDK_rte_${c}_LIBRARY rte_${c}
HINTS
ENV DPDK_DIR
${dpdk_LIBRARY_DIRS}
PATH_SUFFIXES lib)
endif()
if(DPDK_rte_${c}_LIBRARY)
if (NOT TARGET ${dpdk_lib})
add_library(${dpdk_lib} UNKNOWN IMPORTED)
set_target_properties(${dpdk_lib} PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${dpdk_INCLUDE_DIRS}"
IMPORTED_LOCATION "${DPDK_rte_${c}_LIBRARY}")
if(c STREQUAL pmd_mlx5)
find_package(verbs QUIET)
if(verbs_FOUND)
target_link_libraries(${dpdk_lib} INTERFACE IBVerbs::verbs)
endif()
endif()
endif()
list(APPEND _dpdk_libs ${dpdk_lib})
list(APPEND dpdk_LIBRARIES ${DPDK_rte_${c}_LIBRARY})
endif()
endforeach()
mark_as_advanced(dpdk_INCLUDE_DIRS ${dpdk_LIBRARIES})
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(dpdk DEFAULT_MSG
dpdk_INCLUDE_DIRS
dpdk_LIBRARIES)
if(dpdk_FOUND)
if(NOT TARGET dpdk::cflags)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64|AMD64")
set(rte_cflags "-march=core2")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm|ARM")
set(rte_cflags "-march=armv7-a")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
set(rte_cflags "-march=armv8-a+crc")
endif()
add_library(dpdk::cflags INTERFACE IMPORTED)
if (rte_cflags)
set_target_properties(dpdk::cflags PROPERTIES
INTERFACE_COMPILE_OPTIONS "${rte_cflags}")
endif()
endif()
if(NOT TARGET dpdk::dpdk)
add_library(dpdk::dpdk INTERFACE IMPORTED)
find_package(Threads QUIET)
list(APPEND _dpdk_libs
Threads::Threads
dpdk::cflags)
set_target_properties(dpdk::dpdk PROPERTIES
INTERFACE_LINK_LIBRARIES "${_dpdk_libs}"
INTERFACE_INCLUDE_DIRECTORIES "${dpdk_INCLUDE_DIRS}")
endif()
endif()
unset(_dpdk_libs)

View File

@ -1,444 +0,0 @@
#include <cstdio>
#include <ctime>
#include <netinet/in.h>
#include <rte_config.h>
#include <rte_common.h>
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_cycles.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include <rte_ether.h>
#include <rte_launch.h>
#include <rte_log.h>
#include <atomic>
#include <vector>
#include <fstream>
#include <unistd.h>
#include "ntrlog.h"
#include "pkt.h"
#include "rte_byteorder.h"
#include "rte_ip.h"
// init NTRLOG
NTR_DECL_IMPL;
constexpr unsigned int MBUF_MAX_COUNT = 8191;
constexpr unsigned int MBUF_CACHE_SIZE = 250;
constexpr unsigned int RX_RING_SIZE = 1024;
constexpr unsigned int TX_RING_SIZE = 1024;
constexpr unsigned int RX_RING_NUM = 1;
constexpr unsigned int TX_RING_NUM = 1;
constexpr unsigned int BURST_SIZE = 32;
static const struct rte_eth_conf port_conf_default{};
struct datapt{
uint64_t server_proc = 0;
uint64_t rtt = 0;
};
struct options_t {
unsigned int run_time = 5;
unsigned int warmup_time = 0;
char output[256] = "output.txt";
struct rte_ether_addr server_mac;
// states
std::atomic<bool> s_stop {false};
std::atomic<bool> s_record {false};
std::vector<struct datapt *> s_stats;
struct rte_mempool * s_mbuf_pool;
uint16_t s_portid;
struct rte_ether_addr s_host_mac;
};
struct options_t options;
/*
 * DPDK RX callback: stamps every valid khat packet with the client-side
 * RX timestamp (TSC, big-endian) so locore_main can compute RTT later.
 * Invalid packets are logged and passed through untouched.
 * Returns nb_pkts unchanged (callback contract: never drops here).
 */
static uint16_t
rx_calc_latency(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused, void *_ __rte_unused)
{
	// XXX: need to get the timestamp in every loop?
	uint64_t now = rte_rdtsc();
	struct packet_data *pkt_data;
	for (int i = 0; i < nb_pkts; i++) {
		pkt_data = check_valid_packet(pkts[i]);
		if (pkt_data == NULL) {
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "rx_calc_latency: ignoring invalid packet 0x%p.\n", (void*)pkts[i]);
			continue;
		}
		// cast: %llu expects unsigned long long; uint64_t may be unsigned long,
		// which is a format/argument mismatch (UB) without the cast
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "rx_calc_latency: tagged packet %p with %llu.\n", (void*)pkts[i], (unsigned long long)now);
		pkt_data->clt_ts_rx = rte_cpu_to_be_64(now);
	}
	return nb_pkts;
}
/*
 * DPDK TX callback: stamps every valid khat packet with the client-side
 * TX timestamp (TSC, big-endian) just before it leaves the port.
 * Invalid packets are logged and passed through untouched.
 * Returns nb_pkts unchanged (callback contract: never drops here).
 */
static uint16_t
tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
{
	// XXX: need to get the timestamp in every loop?
	uint64_t now = rte_rdtsc();
	struct packet_data *pkt_data;
	for (int i = 0; i < nb_pkts; i++) {
		pkt_data = check_valid_packet(pkts[i]);
		if (pkt_data == NULL) {
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "tx_add_timestamp: ignoring invalid packet 0x%p.\n", (void*)pkts[i]);
			continue;
		}
		// cast: %llu expects unsigned long long; uint64_t may be unsigned long,
		// which is a format/argument mismatch (UB) without the cast
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "tx_add_timestamp: tagged packet %p with %llu.\n", (void*)pkts[i], (unsigned long long)now);
		pkt_data->clt_ts_tx = rte_cpu_to_be_64(now);
	}
	return nb_pkts;
}
#define STATE_SEND (0)
#define STATE_RECV (1)
/*
 * Worker lcore body: ping-pong latency loop.
 * Sends one pre-built UDP probe per epoch (STATE_SEND), then polls RX until
 * the echo for the current epoch arrives (STATE_RECV), recording RTT and
 * server processing time when options.s_record is set. Runs until
 * options.s_stop is raised by main(). Always returns 0.
 */
static int
locore_main(void * _unused __rte_unused)
{
struct rte_mbuf *tx_buf;
struct rte_mbuf *rx_bufs[BURST_SIZE];
struct packet_data *pkt_data;
uint32_t core_id = rte_lcore_id();
uint32_t epoch = 0;
int state = STATE_SEND;
// XXX: check link status instead
sleep(1);
// warn if the NIC sits on a different NUMA node than this polling core
if (rte_eth_dev_socket_id(options.s_portid) > 0 && rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "locore_main: WARNING, port %d is on remote NUMA node to "
"polling thread.\n\tPerformance will "
"not be optimal.\n", options.s_portid);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d running...\n", core_id);
// one TX mbuf is allocated once and re-sent every epoch
tx_buf = rte_pktmbuf_alloc(options.s_mbuf_pool);
if (tx_buf == NULL) {
rte_exit(EXIT_FAILURE, "cannot allocate tx_buf\n");
}
pkt_data = construct_udp_pkt_hdr(tx_buf,
&options.s_host_mac, &options.server_mac,
RTE_IPV4(192, 168, 100, 150), RTE_IPV4(192, 168, 100, 151),
1337, 1337);
if (pkt_data == NULL) {
rte_exit(EXIT_FAILURE, "cannot allocate space for packet_data in mbuf\n");
}
pkt_data->magic = rte_cpu_to_be_32(ETHER_FRAME_MAGIC);
while(!options.s_stop.load()) {
// always pop incoming packets
const uint16_t nb_rx = rte_eth_rx_burst(options.s_portid, 0, rx_bufs, BURST_SIZE);
if (nb_rx != 0) {
// only process packets when we are ready to receive
for (int i = 0; i < nb_rx; i++) {
struct packet_data * each = check_valid_packet(rx_bufs[i]);
if (each == NULL) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: ignoring invalid packet %p.\n", (void*)rx_bufs[i]);
dump_pkt(rx_bufs[i]);
rte_pktmbuf_free(rx_bufs[i]);
continue;
}
// accept only the echo that matches the current epoch while waiting
if (rte_be_to_cpu_32(each->epoch) == epoch && state == STATE_RECV) {
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: received packet %p for epoch %d\n", (void*)rx_bufs[i], epoch);
if (options.s_record.load()) {
// keep statistics
// timestamps were stored big-endian by the rx/tx callbacks
struct datapt * dpt = new datapt;
dpt->rtt = rte_be_to_cpu_64(each->clt_ts_rx) - rte_be_to_cpu_64(each->clt_ts_tx);
dpt->server_proc = rte_be_to_cpu_64(each->srv_ts_tx) - rte_be_to_cpu_64(each->srv_ts_rx);
options.s_stats.push_back(dpt);
}
// bump the epoch and stop processing other packets
state = STATE_SEND;
epoch++;
} else {
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: ignoring packet 0x%p with invalid epoch %d.\n", (void*)rx_bufs[i], epoch);
}
rte_pktmbuf_free(rx_bufs[i]);
}
}
if (state == STATE_SEND) {
// set new epoch
pkt_data->epoch = rte_cpu_to_be_32(epoch);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: sending packet %p with epoch %d\n", (void*)tx_buf, epoch);
const uint16_t nb_tx = rte_eth_tx_burst(options.s_portid, 0, &tx_buf, 1);
if (nb_tx < 1) {
rte_exit(EXIT_FAILURE, "failed to send packet 0x%p, epoch %d\n", (void*)tx_buf, epoch);
}
state = STATE_RECV;
}
}
rte_pktmbuf_free(tx_buf);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d successfully stopped.\n", core_id);
return 0;
}
/*
 * Configure and start one Ethernet port: enables UDP/IPv4 checksum offloads,
 * sets up RX_RING_NUM/TX_RING_NUM queues, enables promiscuous mode, and
 * installs the tx_add_timestamp / rx_calc_latency callbacks on queue 0.
 * Returns 0 on success, -1 for an invalid port, or the failing rte_* code.
 */
static int
port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf = port_conf_default;
	struct rte_eth_txconf txconf;
	struct rte_eth_rxconf rxconf;
	uint16_t nb_rxd = RX_RING_SIZE;
	uint16_t nb_txd = TX_RING_SIZE;

	if (!rte_eth_dev_is_valid_port(portid)) {
		return -1;
	}

	int ret = rte_eth_dev_info_get(portid, &dev_info);
	if (ret != 0) {
		return ret;
	}

	// note: this assignment used to appear twice (before and after
	// rte_eth_dev_info_get); the redundant first copy was removed
	port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN;
	port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM;
	port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM;
	port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
	port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
	port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;

	/* Configure the Ethernet device. */
	ret = rte_eth_dev_configure(portid, RX_RING_NUM, TX_RING_NUM, &port_conf);
	if (ret != 0)
		return ret;

	ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
	if (ret != 0)
		return ret;

	/* Allocate and set up 1 RX queue per Ethernet port. */
	rxconf = dev_info.default_rxconf;
	rxconf.offloads = port_conf.rxmode.offloads;
	for (uint32_t i = 0; i < RX_RING_NUM; i++) {
		ret = rte_eth_rx_queue_setup(portid, i, nb_rxd, rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool);
		if (ret < 0)
			return ret;
	}

	txconf = dev_info.default_txconf;
	txconf.offloads = port_conf.txmode.offloads;
	/* Allocate and set up 1 TX queue per Ethernet port. */
	for (uint32_t i = 0; i < TX_RING_NUM; i++) {
		ret = rte_eth_tx_queue_setup(portid, i, nb_txd, rte_eth_dev_socket_id(portid), &txconf);
		if (ret < 0)
			return ret;
	}

	ret = rte_eth_dev_start(portid);
	if (ret < 0)
		return ret;

	/* Display the port MAC address. */
	struct rte_ether_addr addr;
	ret = rte_eth_macaddr_get(portid, &addr);
	if (ret != 0)
		return ret;

	/* Enable RX in promiscuous mode for the Ethernet device. */
	ret = rte_eth_promiscuous_enable(portid);
	if (ret != 0)
		return ret;

	// timestamping hooks used by locore_main's RTT measurement
	rte_eth_add_tx_callback(portid, 0, tx_add_timestamp, NULL);
	rte_eth_add_rx_callback(portid, 0, rx_calc_latency, NULL);

	return 0;
}
/*
 * Print the effective runtime configuration (run/warmup times, output
 * file, target server MAC) to stdout.
 */
static void dump_options()
{
	const uint8_t *mac = options.server_mac.addr_bytes;

	fprintf(stdout,
	    "Configuration:\n"
	    " run time = %d\n"
	    " warmup time = %d\n"
	    " output file = %s\n"
	    " server MAC = %x:%x:%x:%x:%x:%x\n",
	    options.run_time, options.warmup_time, options.output,
	    mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
}
/* Print command-line help to stdout. */
static void usage()
{
	// no format specifiers in the text, so fputs is equivalent to fprintf
	fputs(
	    "Usage:\n "
	    " -v(vv): verbose mode\n"
	    " -h: display the information\n"
	    " -o: output filename\n"
	    " -t: run time\n"
	    " -T: warmup time\n"
	    " -s: server's mac\n\n",
	    stdout);
}
/*
 * Entry point: initializes DPDK EAL, parses CLI options, sets up the mbuf
 * pools and the first available port, launches locore_main on a worker
 * lcore, runs a 1-second-tick timer for warmup/run phases, then dumps the
 * collected (rtt, server_proc) pairs to the output file and tears down.
 */
int main(int argc, char* argv[])
{
unsigned int nb_ports;
struct rte_mempool *mbuf_pool, *mbuf_pool_pkt;
std::ofstream log_file;
// init dpdk
int ret = rte_eal_init(argc, argv);
if (ret < 0) {
rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n");
}
// EAL consumed its own arguments; shift past them before getopt
argc -= ret;
argv += ret;
// set warning level
ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
{
int c;
// parse arguments
while((c = getopt(argc, argv, "hvo:t:T:s:")) != -1) {
switch (c) {
case 'v':
// each -v raises verbosity one level
ntr_set_level(NTR_DEP_USER1, ntr_get_level(NTR_DEP_USER1) + 1);
break;
case 's':
if (rte_ether_unformat_addr(optarg, &options.server_mac) == -1) {
rte_exit(EXIT_FAILURE, "cannot parse %s as mac address.\n", optarg);
}
break;
case 't':
options.run_time = atoi(optarg);
break;
case 'T':
options.warmup_time = atoi(optarg);
break;
case 'h':
usage();
rte_exit(EXIT_SUCCESS, NULL);
break;
case 'o':
strncpy(options.output, optarg, sizeof(options.output) - 1);
break;
default:
usage();
rte_exit(EXIT_FAILURE, "unknown argument: %c\n", c);
break;
}
}
}
// open log file for writing
log_file.open(options.output, std::ofstream::out);
if (!log_file) {
rte_exit(EXIT_FAILURE, "failed to open log file %s\n", options.output);
}
nb_ports = rte_eth_dev_count_avail();
if (nb_ports == 0) {
rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
}
// create a mbuf memory pool on the socket
// two pools: one backs the RX queues, one backs the TX probe packet
mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
if (mbuf_pool == nullptr) {
rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
}
mbuf_pool_pkt = rte_pktmbuf_pool_create("MBUF_POOL_PKT", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
if (mbuf_pool_pkt == nullptr) {
rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
}
options.s_mbuf_pool = mbuf_pool_pkt;
// use the first available port
uint16_t portid = rte_eth_find_next(0);
if (portid == RTE_MAX_ETHPORTS) {
rte_exit(EXIT_FAILURE, "cannot find an available port\n");
}
options.s_portid = portid;
if (port_init(portid, mbuf_pool) != 0) {
rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid);
}
if (rte_eth_macaddr_get(portid, &options.s_host_mac) != 0) {
rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n", portid);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid,
options.s_host_mac.addr_bytes[0],
options.s_host_mac.addr_bytes[1],
options.s_host_mac.addr_bytes[2],
options.s_host_mac.addr_bytes[3],
options.s_host_mac.addr_bytes[4],
options.s_host_mac.addr_bytes[5]);
dump_options();
// launch the measurement loop on the first worker lcore
uint16_t core_id = rte_get_next_lcore(0, true, false);
if (rte_eal_remote_launch(locore_main, NULL, core_id) != 0) {
rte_exit(EXIT_FAILURE, "failed to launch function on locore\n");
}
// poor man's timer
// XXX: use kqueue instead
struct timespec ts;
ts.tv_sec = 1;
ts.tv_nsec = 0;
uint32_t second = 0;
// tick once per second: start recording after warmup, stop after run time
while(true) {
if (second >= options.warmup_time) {
options.s_record.store(true);
}
if (second >= options.run_time + options.warmup_time) {
options.s_stop.store(true);
break;
}
clock_nanosleep(CLOCK_REALTIME, 0, &ts, NULL);
second++;
}
if (rte_eal_wait_lcore(core_id) < 0)
rte_exit(EXIT_FAILURE, "failed to wait for job completion\n");
// dump stats
// CSV rows: rtt,server_proc (TSC ticks); entries were new'd in locore_main
for (auto it = std::begin(options.s_stats); it != std::end(options.s_stats); ++it) {
log_file << (*it)->rtt << "," << (*it)->server_proc << std::endl;
delete *it;
}
log_file.close();
// clean up
rte_eth_dev_stop(portid);
rte_eth_dev_close(portid);
return 0;
}

View File

@ -1,9 +0,0 @@
-xc++
-O2
-std=c++11
-Wall
-Werror
-Wpedantic
-I/usr/include/dpdk
-Iinc
-Wno-deprecated-declarations

61
inc/defs.hh Normal file
View File

@ -0,0 +1,61 @@
#pragma once
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <cstdio>
#include <sys/types.h>
#include <sys/cpuset.h>
#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \
TypeName(const TypeName &) = delete; \
void operator=(const TypeName &) = delete
#define UNUSED __attribute__((unused))
constexpr static unsigned long S2NS = 1000000000UL;
constexpr static unsigned long S2US = 1000000UL;
constexpr static unsigned long MS2NS = 1000000UL;
constexpr static int NEXT_CPU_NULL = -1;
#if defined(__x86_64__)
/*
 * Pop the lowest set CPU bit from *mask and return its index (0-based).
 * Returns NEXT_CPU_NULL (-1) when the mask is empty; *mask is left
 * unchanged in that case.
 *
 * BUGFIX: the old code computed ~(1ul << (ffs - 1)) even when
 * ffsll(*mask) == 0, i.e. a shift by -1, which is undefined behavior.
 */
static inline int
cmask_get_next_cpu(uint64_t *mask)
{
	int ffs = __builtin_ffsll(*mask);
	if (ffs == 0) {
		/* empty mask: nothing to clear */
		return -1;
	}
	*mask &= ~(1ull << (ffs - 1));
	return ffs - 1;
}
/*
 * Count the CPUs (set bits) in a 64-bit affinity mask.
 *
 * BUGFIX: __builtin_popcount takes unsigned int, so the previous code
 * silently dropped the upper 32 bits of the mask; use popcountll.
 */
static inline int
cmask_get_num_cpus(const uint64_t mask)
{
	return __builtin_popcountll(mask);
}
#endif
/* Monotonic uptime in nanoseconds (immune to wall-clock adjustments). */
static inline uint64_t
get_uptime()
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	return ((uint64_t)now.tv_sec * S2NS) + (uint64_t)now.tv_nsec;
}
/*
 * Parse a comma-separated CPU list (e.g. "0,2,5") into a cpuset.
 * Mutates the input string (tokenization).
 *
 * Uses strtok_r instead of the non-reentrant strtok, consistent with
 * str_to_netspec() elsewhere in this project.
 */
static inline void
cpulist_to_cpuset(char * cpulist, cpuset_t * cpuset)
{
	char *saveptr;
	char *cpu = strtok_r(cpulist, ",", &saveptr);

	CPU_ZERO(cpuset);
	while (cpu != nullptr) {
		CPU_SET(atoi(cpu), cpuset);
		cpu = strtok_r(nullptr, ",", &saveptr);
	}
}
#define ATTR_UNUSED __attribute__((unused))

346
inc/gen.hh Normal file
View File

@ -0,0 +1,346 @@
// modified from mutilate
// -*- c++ -*-
// 1. implement "fixed" generator
// 2. implement discrete generator
// 3. implement combine generator?
#pragma once
#include <assert.h>
#include <inttypes.h>
#include <limits.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <utility>
#include <vector>
#include <sys/_pthreadtypes.h>
#include <sys/param.h>
#include "defs.hh"
// Debug/diagnostic macros: compiled out in this build.
#define D(fmt, ...)
#define DIE(fmt, ...) (void)0;

// 64-bit FNV-1a parameters (Fowler/Noll/Vo).
#define FNV_64_PRIME (0x100000001b3ULL)
#define FNV1_64_INIT (0xcbf29ce484222325ULL)

// FNV-1a hash over an arbitrary byte buffer.
static inline uint64_t
fnv_64_buf(const void *buf, size_t len)
{
	const unsigned char *p = (const unsigned char *)buf;
	uint64_t hval = FNV1_64_INIT;

	// FNV-1a: xor the octet in, then multiply by the prime.
	for (size_t i = 0; i < len; i++) {
		hval = (hval ^ (uint64_t)p[i]) * FNV_64_PRIME;
	}
	return hval;
}

// FNV-1a hash of a 64-bit integer's in-memory bytes.
static inline uint64_t
fnv_64(uint64_t in)
{
	return fnv_64_buf(&in, sizeof(in));
}
// Generator syntax:
//
// \d+ == fixed
// n[ormal]:mean,sd
// e[xponential]:lambda
// p[areto]:scale,shape
// g[ev]:loc,scale,shape
// fb_value, fb_key, fb_rate
// Abstract base class for all value/inter-arrival generators
// (modified from mutilate). generate(U): next sample; when the caller
// supplies U in [0,1) the subclasses compute a deterministic transform
// of U, otherwise they draw internally via drand48().
class Generator {
public:
	Generator() { }
	// Generator(const Generator &g) = delete;
	// virtual Generator& operator=(const Generator &g) = delete;
	virtual ~Generator() { }
	virtual double generate(double U = -1.0) = 0;
	// Optional: re-target the distribution to rate `lambda`.
	// Default is effectively a no-op (DIE expands to (void)0 here).
	virtual void set_lambda(double) { DIE("set_lambda() not implemented"); }

protected:
	std::string type;	// unused by the subclasses visible in this file
};
// Degenerate generator: every sample is the same constant.
// set_lambda(l) re-targets the constant to a mean interval of 1/l.
class Fixed : public Generator {
public:
	Fixed(double _value = 1.0)
	    : value(_value)
	{
		D("Fixed(%f)", value);
	}
	virtual double generate(double) { return value; }
	virtual void set_lambda(double lambda)
	{
		value = (lambda > 0.0) ? 1.0 / lambda : 0.0;
	}

private:
	double value;
};
// Uniform generator on [0, scale).
class Uniform : public Generator {
public:
	Uniform(double _scale)
	    : scale(_scale)
	{
		D("Uniform(%f)", scale);
	}
	virtual double generate(double U = -1.0)
	{
		double u = (U < 0.0) ? drand48() : U;
		return scale * u;
	}
	virtual void set_lambda(double lambda)
	{
		// mean of U[0, scale) is scale/2, so scale = 2/lambda
		scale = (lambda > 0.0) ? 2.0 / lambda : 0.0;
	}

private:
	double scale;
};
// Normal (Gaussian) generator: returns mean + sd * N.
// NOTE(review): V is aliased to U (see the inherited "// drand48()"
// remnant), so N = sqrt(-2 ln U) * cos(2*pi*U) is a deterministic
// function of a single uniform rather than a true Box-Muller draw
// with two independent uniforms. This mirrors upstream mutilate and
// keeps KeyGenerator's key lengths reproducible -- confirm before
// "fixing" it to an independent V.
class Normal : public Generator {
public:
	Normal(double _mean = 1.0, double _sd = 1.0)
	    : mean(_mean)
	    , sd(_sd)
	{
		D("Normal(mean=%f, sd=%f)", mean, sd);
	}
	virtual double generate(double U = -1.0)
	{
		if (U < 0.0)
			U = drand48();
		double V = U; // drand48();
		double N = sqrt(-2 * log(U)) * cos(2 * M_PI * V);
		return mean + sd * N;
	}
	// Re-target the mean to 1/lambda; sd is left unchanged.
	virtual void set_lambda(double lambda)
	{
		if (lambda > 0.0)
			mean = 1.0 / lambda;
		else
			mean = 0.0;
	}

private:
	double mean, sd;
};
// Exponential inter-arrival generator with rate `lambda`.
class Exponential : public Generator {
public:
	Exponential(double _lambda = 1.0)
	    : lambda(_lambda)
	{
		D("Exponential(lambda=%f)", lambda);
	}
	virtual double generate(double U = -1.0)
	{
		// Non-positive rate degenerates to zero delay.
		if (lambda <= 0.0)
			return 0.0;
		double u = (U < 0.0) ? drand48() : U;
		// inverse-CDF sampling: -ln(u) / lambda
		return -log(u) / lambda;
	}
	virtual void set_lambda(double lambda) { this->lambda = lambda; }

private:
	double lambda;
};
// Generalized Pareto generator (location/scale/shape parametrization).
class GPareto : public Generator {
public:
	GPareto(double _loc = 0.0, double _scale = 1.0, double _shape = 1.0)
	    : loc(_loc)
	    , scale(_scale)
	    , shape(_shape)
	{
		// shape == 0 would divide by zero in generate()
		assert(shape != 0.0);
		D("GPareto(loc=%f, scale=%f, shape=%f)", loc, scale, shape);
	}
	virtual double generate(double U = -1.0)
	{
		double u = (U < 0.0) ? drand48() : U;
		// inverse-CDF transform
		return loc + scale * (pow(u, -shape) - 1) / shape;
	}
	virtual void set_lambda(double lambda)
	{
		// Solve mean == 1/lambda for scale (GP mean is
		// loc + scale/(1-shape)); non-positive rates disable it.
		scale = (lambda <= 0.0) ?
		    0.0 :
		    (1 - shape) / lambda - (1 - shape) * loc;
	}

private:
	double loc /* mu */;
	double scale /* sigma */, shape /* k */;
};
// Generalized extreme value generator, built on an Exponential(1)
// inner draw.
class GEV : public Generator {
public:
	GEV(double _loc = 0.0, double _scale = 1.0, double _shape = 1.0)
	    : e(1.0)
	    , loc(_loc)
	    , scale(_scale)
	    , shape(_shape)
	{
		// shape == 0 would divide by zero in generate()
		assert(shape != 0.0);
		D("GEV(loc=%f, scale=%f, shape=%f)", loc, scale, shape);
	}
	virtual double generate(double U = -1.0)
	{
		double exp_draw = e.generate(U);
		return loc + scale * (pow(exp_draw, -shape) - 1) / shape;
	}

private:
	Exponential e;
	double loc /* mu */, scale /* sigma */, shape /* k */;
};
// Discrete distribution over (probability, value) pairs; residual
// probability mass falls through to the `def` generator (owned).
class Discrete : public Generator {
public:
	~Discrete() { delete def; }
	Discrete(Generator *_def = NULL)
	    : def(_def)
	{
		if (def == NULL)
			def = new Fixed(0.0);
	}
	virtual double generate(double U = -1.0)
	{
		double Uc = U; // keep the caller's U for the fallback path
		if (!pv.empty() && U < 0.0)
			U = drand48();
		double cum = 0;
		for (auto &entry : pv) {
			cum += entry.first;
			if (U < cum)
				return entry.second;
		}
		return def->generate(Uc);
	}
	void add(double p, double v)
	{
		pv.emplace_back(p, v);
	}

private:
	Generator *def;
	std::vector<std::pair<double, double>> pv;
};
// Maps an index to a zero-padded decimal key whose length is drawn
// from the wrapped generator, deterministically via an FNV hash of
// the index.
class KeyGenerator {
public:
	KeyGenerator(Generator *_g, double _max = 10000)
	    : g(_g)
	    , max(_max)
	{
	}
	std::string generate(uint64_t ind)
	{
		// Hash ind into U in [0,1) so the drawn key length is a
		// deterministic function of the index.
		double U = (double)fnv_64(ind) / (double)ULLONG_MAX;
		double G = g->generate(U);
		// Key must still be wide enough to represent `max` indices.
		int keylen = MAX(round(G), floor(log10(max)) + 1);
		char key[256];
		snprintf(key, sizeof(key), "%0*" PRIu64, keylen, ind);
		return std::string(key);
	}

private:
	Generator *g;
	double max;
};
Generator *createGenerator(std::string str);
Generator *createFacebookKey();
Generator *createFacebookValue();
Generator *createFacebookIA();
// memload generator
// Spawns pinned worker threads that move data between per-thread
// buffers to generate background memory load (implementation in the
// corresponding .cc; only the declaration is visible here).
class memload_generator {
public:
	// User-tunable knobs. trans_per_second has no default here --
	// callers must set it.
	struct memload_generator_options {
		size_t transaction_size {4096};	// bytes moved per transaction
		size_t buffer_size {64*1024*1024};
		char ia_dist[64]{"fixed"};	// inter-arrival distribution spec
		int verbose {0};
		uint64_t trans_per_second;	// rate target (caller-supplied)
		bool shared_buffer {true};
	};

private:
	DISALLOW_EVIL_CONSTRUCTORS(memload_generator);
	// Per-worker bookkeeping; one instance per spawned pthread.
	struct thread_info {
		pthread_t pthr;
		void *from_buffer;
		void *to_buffer;
		std::atomic<bool> reset_ts;
		int tid;
		int pull;
		int coreid;	// core this worker is pinned to
		int target_dom;	// target NUMA domain -- presumably for to_buffer placement; confirm in worker_thrd
		struct memload_generator_options * opts;
		Generator * ia_gen;	// inter-arrival time generator
		// stat keeping
		std::atomic<uint32_t> num_trans;
		std::atomic<int> * state;	// points at the owner's shared state word
		std::atomic<int> init_status;
	};
	std::vector<struct thread_info *> thr_infos;
	// Lifecycle state machine shared with the workers.
	std::atomic<int> state;
	static constexpr int STATE_RUN = 0;
	static constexpr int STATE_RDY = 1;
	static constexpr int STATE_END = 2;
	static constexpr int STATE_INIT = 3;
	static void *worker_thrd(void *_tinfo);
	struct memload_generator_options opts;

public:
	memload_generator(cpuset_t * threads, cpuset_t * modes, cpuset_t * target_domain, struct memload_generator_options * opt, bool *success);
	uint64_t get_transactions();
	bool start();
	bool stop();
	bool set_transactions(uint64_t tps);
	~memload_generator();
};

133
inc/net/netsup.hh Normal file
View File

@ -0,0 +1,133 @@
#pragma once
#include <cstdint>
#include "rte_ethdev.h"
#include "rte_ether.h"
#define MAX_NUMA_NODES (64)
// Per-port configuration consumed by dpdk_init(): ring sizes, core
// affinity, offload/RSS masks, and optional TX/RX callbacks.
struct device_conf {
	int portid;
	uint16_t tx_ring_sz;
	uint16_t rx_ring_sz;
	cpuset_t core_affinity;	// presumably one queue pair per set core -- confirm in dpdk_init
	int mtu;
	uint64_t rx_offloads;
	uint64_t tx_offloads;
	uint64_t rss_hf;	// RSS hash-function mask
	rte_tx_callback_fn tx_fn;	// optional TX callback (with tx_user arg)
	void * tx_user;
	rte_rx_callback_fn rx_fn;	// optional RX callback (with rx_user arg)
	void * rx_user;
	bool timesync;	// enable NIC timestamping/timesync
};
// Mempool sizing parameters for dpdk_init()/mempool_get().
struct mem_conf {
	int num_elements;	// mbufs per pool
	int cache_size;	// per-lcore cache size
	int data_room_size;	// per-mbuf data room in bytes
	int priv_size;	// per-mbuf private area in bytes
	unsigned int max_pools;
};
constexpr static uint16_t MIN_RANDOM_PORT = 1000;
constexpr static uint16_t DEFAULT_RAT_PORT = 1234;
constexpr static unsigned int INIT_DELAY = 3;
constexpr static unsigned int MAX_NODES = 64;
void
dpdk_init(struct device_conf *dconf, struct mem_conf *mconf);
void
dpdk_cleanup(struct device_conf *dconf);
struct rte_mempool *
mempool_get(int nodeid);
// Per-driver default capabilities, looked up by portconf_get() using
// the port's DPDK driver name.
struct port_conf {
	const char * driver_name;
	uint64_t rxoffload;
	uint64_t txoffload;
	uint64_t rss_hf;
	bool timesync;
};
int
portconf_get(int portid, struct port_conf * out);
// constexpr static int LATENCY_MEASURE_TIMES = 10000;
// static inline void
// sync_port_clock(uint16_t portid)
//{
// int64_t lat = 0;
// int64_t get_time_lat;
// int64_t write_time_lat;
// struct timespec dum;
// struct timespec start;
// struct timespec end;
//
// // measure clock_gettime latency
// for(int i = 0; i < LATENCY_MEASURE_TIMES; i++) {
// // end - start ~= 2x clock_gettime's latency
// clock_gettime(CLOCK_REALTIME, &start);
// clock_gettime(CLOCK_REALTIME, &dum);
// clock_gettime(CLOCK_REALTIME, &end);
//
// if (end.tv_sec != start.tv_sec) {
// rte_exit(EXIT_FAILURE, "clock_gettime too slow\n");
// }
//
// // shouldn't overflow
// lat += (end.tv_nsec - start.tv_nsec) / 2;
// }
// get_time_lat = lat / LATENCY_MEASURE_TIMES;
//
// // measure rte_eth_timesync_write_time latency
// lat = 0;
// for(int i = 0; i < LATENCY_MEASURE_TIMES; i++) {
// // end - start ~= rte_eth_timesync latency + clock_gettime's latency
// clock_gettime(CLOCK_REALTIME, &dum);
// clock_gettime(CLOCK_REALTIME, &start);
// if (rte_eth_timesync_write_time(portid, &dum) != 0) {
// rte_exit(EXIT_FAILURE, "failed to write time\n");
// }
// clock_gettime(CLOCK_REALTIME, &end);
//
// if (end.tv_sec != start.tv_sec) {
// rte_exit(EXIT_FAILURE, "clock_gettime too slow!\n");
// }
//
// // shouldn't overflow
// int64_t elat = (end.tv_nsec - start.tv_nsec) - get_time_lat;
// if (elat < 0) {
// rte_exit(EXIT_FAILURE, "something is wrong with lat \n");
// }
// lat += elat;
// }
// write_time_lat = lat / LATENCY_MEASURE_TIMES;
//
// int64_t delta = (get_time_lat + write_time_lat) / 2;
// int64_t s2ns = (int64_t)S2NS;
// // sync the clock
// while (true) {
// clock_gettime(CLOCK_REALTIME, &dum);
// dum.tv_nsec += delta;
// if (dum.tv_nsec > s2ns) {
// // try again if overflow
// continue;
// }
// if (rte_eth_timesync_write_time(portid, &dum) != 0) {
// rte_exit(EXIT_FAILURE, "failed to write time\n");
// }
// break;
// }
// rte_eth_timesync_enable(portid);
//
// printf("Sync-ed time: get lat %ld write lat %ld\n", get_time_lat,
// write_time_lat);
//}

490
inc/net/pkt.hh Normal file
View File

@ -0,0 +1,490 @@
#pragma once
#include <sys/endian.h>
#include <rte_byteorder.h>
#include <rte_ether.h>
#include <rte_flow.h>
#include <rte_ip.h>
#include <rte_mbuf.h>
#include <rte_mbuf_core.h>
#include <rte_net.h>
#include <rte_udp.h>
#include <unistd.h>
#include "defs.hh"
#include <random>
#define IP_DEFTTL 64 /* from RFC 1340. */
#define IP_VERSION 0x40
#define IP_HDRLEN 0x05 /* default IP header length == five 32-bits words. */
#define IP_VHL_DEF (IP_VERSION | IP_HDRLEN)
#define IP_ADDR_FMT_SIZE 15
constexpr static uint32_t MAX_JUMBO_MTU = 9000;
constexpr static uint32_t MAX_STANDARD_MTU = 1500;
// Full on-wire frame size for a given MTU: payload plus Ethernet
// header and CRC.
static inline int
mtu_to_pkt_size(int mtu)
{
	const int l2_overhead = RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN;
	return mtu + l2_overhead;
}
// Transmit all `sz` mbufs on (portid, txqid), retrying until the
// queue has accepted every one.
// NOTE(review): busy-waits forever if the queue never drains.
static inline void
tx_burst_all(int portid, int txqid, struct rte_mbuf ** tx_bufs, int sz)
{
	int sent = 0;

	while (sent < sz) {
		sent += rte_eth_tx_burst(
		    portid, txqid, &tx_bufs[sent], sz - sent);
	}
}
constexpr static uint32_t ETHER_FRAME_MAGIC = 0xDCDCE5E5;
const static struct rte_ether_addr POU_MAC {
0x01, 0x00, 0x5e, 0x00, 0x01, 0x81
};
const static uint32_t POU_IP = RTE_IPV4(224, 0, 1, 129);
const static uint16_t POU_PORT = 320;
/* Khat Protocol:
* khat only processes two kinds of packets - LOAD and PROBE
* rat:
* rat -> LOAD -> khat
* khat -> LOAD_RESP -> rat
* cat:
* cat -> PROBE -> khat (cat tx timestamps)
* khat -> PROBE_RESP -> cat (cat rx timestamps and khat tx/rx
* timestamps) khat -> STAT -> cat (khat sends its tx/rx timestamps)
*/
/* Rat Protocol:
* cat & rat:
* 1. both launch with full parameters
* rat with slave flag
* cat with master flag
* 2. rats create threads and wait for cat's signal
* 3. cat creates threads
* 4. cat -> rats SYNC
* 5. rats -> cat SYNC_ACK and start running
* 6. cat start running after received all SYNC_ACKs
* 7. cat stops running, cat -> rats FIN
* 8. rats stops running, rats -> cat FIN_ACK with QPS
 * 9. cat exits after receiving all FIN_ACKs and flushing stats
*/
// Minimal PTP header embedded in every frame so PTP-aware NICs can
// recognize timestamped packets (ver/msg_type set in construct_pkt_hdr).
struct ptp_hdr {
	uint8_t ptp_msg_type;
	uint8_t ptp_ver;
	uint8_t unused[34];
} __attribute__((packed));
// On-wire layout of every protocol packet: Ethernet/IPv4/UDP headers,
// the embedded PTP header, then our type/magic and a variable payload.
struct pkt_hdr {
	struct rte_ether_hdr eth_hdr;
	struct rte_ipv4_hdr ipv4_hdr;
	struct rte_udp_hdr udp_hdr;
	struct ptp_hdr ptp_hdr;
	uint16_t type;	// PKT_TYPE_*, big-endian on the wire
	uint32_t magic;	// ETHER_FRAME_MAGIC, big-endian on the wire
	char payload[0];	// trailing variable-length payload (GNU zero-length array)
} __attribute__((packed));
// Host-byte-order identity of one endpoint (IP + MAC).
struct net_spec {
	uint32_t ip;
	rte_ether_addr mac_addr;
};
// Extract source/destination addressing from a packet header into the
// given out-parameters; any out-parameter may be nullptr to skip it.
// IPs and ports are converted to host byte order.
static inline void
pkt_hdr_to_netspec(struct pkt_hdr *pkt, struct net_spec *src,
    uint16_t *src_port, struct net_spec *dst, uint16_t *dst_port)
{
	if (src != nullptr) {
		src->ip = rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr);
		rte_ether_addr_copy(&pkt->eth_hdr.src_addr, &src->mac_addr);
	}
	if (dst != nullptr) {
		dst->ip = rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr);
		rte_ether_addr_copy(&pkt->eth_hdr.dst_addr, &dst->mac_addr);
	}
	if (src_port != nullptr)
		*src_port = rte_be_to_cpu_16(pkt->udp_hdr.src_port);
	if (dst_port != nullptr)
		*dst_port = rte_be_to_cpu_16(pkt->udp_hdr.dst_port);
}
// One flow: (src, dst) endpoint pair plus UDP ports. The net_spec
// pointers are borrowed, not owned.
struct conn_spec {
	struct net_spec *src;
	uint16_t src_port;
	struct net_spec *dst;
	uint16_t dst_port;
};
// Parse "a.b.c.d@xx:xx:xx:xx:xx:xx" into a net_spec.
// Mutates `str` (strtok_r). Returns 0 on success, -1 on malformed input.
static inline int
str_to_netspec(char *str, struct net_spec *out)
{
	const char *tok = "@";
	char *token;
	char *ptr;
	unsigned int a, b, c, d;

	// IP part first. BUGFIX: scan with %u into unsigned ints (the
	// old %d into values destined for uint32_t was a format/type
	// mismatch) and reject out-of-range octets.
	token = strtok_r(str, tok, &ptr);
	if (token == nullptr ||
	    sscanf(token, "%u.%u.%u.%u", &a, &b, &c, &d) != 4 ||
	    a > 255 || b > 255 || c > 255 || d > 255) {
		return -1;
	}
	out->ip = RTE_IPV4(a, b, c, d);

	// then the MAC
	token = strtok_r(nullptr, tok, &ptr);
	if (token == nullptr ||
	    rte_ether_unformat_addr(token, &out->mac_addr) != 0) {
		return -1;
	}
	return 0;
}
// ---- Protocol message types and their payloads ----
// (see the khat/rat protocol description above)
// LOAD: rat -> khat request carrying a synthetic load descriptor.
constexpr static uint16_t PKT_TYPE_LOAD = 0;
constexpr static uint32_t LOAD_TYPE_CPU = 0; // arg0 = cpu time in us. arg1 = unused
constexpr static uint32_t LOAD_TYPE_MEM = 1; // arg0 = which thread to access. arg1 = how many cachelines to access
constexpr static uint32_t LOAD_TYPE_MAX = LOAD_TYPE_MEM + 1;
struct pkt_payload_load {
	uint32_t epoch;
	uint32_t type; // type of load (LOAD_TYPE_*)
	uint32_t arg0;
	uint32_t arg1;
};
// PROBE: cat -> khat timestamped probe; the *_RESP types echo the epoch.
constexpr static uint16_t PKT_TYPE_PROBE = 1;
constexpr static uint16_t PKT_TYPE_LOAD_RESP = 2;
constexpr static uint16_t PKT_TYPE_PROBE_RESP = 3;
struct pkt_payload_epoch {
	uint32_t epoch;
};
// STAT: khat -> cat hardware/software TX/RX timestamps for an epoch.
constexpr static uint16_t PKT_TYPE_STAT = 4;
struct pkt_payload_stat {
	uint32_t epoch;
	uint64_t hw_rx;
	uint64_t hw_tx;
	uint64_t sw_rx;
	uint64_t sw_tx;
};
// Control handshake between cat (master) and rats (slaves); FIN_ACK
// carries each rat's measured QPS and loss counters.
constexpr static uint16_t PKT_TYPE_SYNC = 5;
constexpr static uint16_t PKT_TYPE_SYNC_ACK = 6;
constexpr static uint16_t PKT_TYPE_FIN = 7;
constexpr static uint16_t PKT_TYPE_FIN_ACK = 8;
struct pkt_payload_qps {
	uint32_t qps;
	uint32_t recved_pkts;
	uint32_t lost_pkts;
};
constexpr static uint16_t NUM_PKT_TYPES = PKT_TYPE_FIN_ACK + 1;
// for fast packet verification
// Minimum payload size expected for each PKT_TYPE_* (indexed by type).
static const uint32_t expected_payload_size[NUM_PKT_TYPES] {
	sizeof(struct pkt_payload_load), // LOAD
	sizeof(struct pkt_payload_epoch), // PROBE
	sizeof(struct pkt_payload_epoch), // LOAD_RESP
	sizeof(struct pkt_payload_epoch), // PROBE_RESP
	sizeof(struct pkt_payload_stat), // STAT
	0, // SYNC
	0, // SYNC_ACK
	0, // FIN
	sizeof(struct pkt_payload_qps) // FIN_ACK
};
// Produces a sequence of UDP source ports in [min_port, 65535),
// walking sequentially from a time-seeded random starting offset.
class rdport_generator {
private:
	DISALLOW_EVIL_CONSTRUCTORS(rdport_generator);
	constexpr static uint32_t MAX_PORT = 65535;
	uint32_t min_port;
	uint32_t cur;
	std::random_device rd;
	std::default_random_engine gen;
	std::uniform_int_distribution<uint32_t> dist;

public:
	rdport_generator(uint32_t mport)
	    : min_port(mport)
	    , cur(0)
	    , dist(0, MAX_PORT - min_port)
	{
		// seed from uptime so successive runs differ
		gen.seed(get_uptime());
		cur = dist(gen);
	}
	uint16_t next()
	{
		// wrap within [min_port, MAX_PORT)
		uint16_t port = (cur % (MAX_PORT - min_port)) + min_port;
		cur++;
		return port;
	}
};
// Log one line describing a pkt_hdr: the caller's prefix, then
// "src ip:port@mac dst ip:port@mac type". A macro (not a function) so
// ##__VA_ARGS__ can splice the caller's prefix-format arguments in
// front of the fixed address arguments.
#define NTR_PKT(dep, level, pkt, prefix_fmt, ...) \
	ntr(dep, level, \
	    prefix_fmt \
	    "src: %d.%d.%d.%d:%d@%02x:%02x:%02x:%02x:%02x:%02x dst: %d.%d.%d.%d:%d@%02x:%02x:%02x:%02x:%02x:%02x type: %d\n", \
	    ##__VA_ARGS__, \
	    (rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 24) & 0xff, \
	    (rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 16) & 0xff, \
	    (rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 8) & 0xff, \
	    (rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 0) & 0xff, \
	    rte_be_to_cpu_16(pkt->udp_hdr.src_port), \
	    pkt->eth_hdr.src_addr.addr_bytes[0], \
	    pkt->eth_hdr.src_addr.addr_bytes[1], \
	    pkt->eth_hdr.src_addr.addr_bytes[2], \
	    pkt->eth_hdr.src_addr.addr_bytes[3], \
	    pkt->eth_hdr.src_addr.addr_bytes[4], \
	    pkt->eth_hdr.src_addr.addr_bytes[5], \
	    (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 24) & 0xff, \
	    (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 16) & 0xff, \
	    (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 8) & 0xff, \
	    (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 0) & 0xff, \
	    rte_be_to_cpu_16(pkt->udp_hdr.dst_port), \
	    pkt->eth_hdr.dst_addr.addr_bytes[0], \
	    pkt->eth_hdr.dst_addr.addr_bytes[1], \
	    pkt->eth_hdr.dst_addr.addr_bytes[2], \
	    pkt->eth_hdr.dst_addr.addr_bytes[3], \
	    pkt->eth_hdr.dst_addr.addr_bytes[4], \
	    pkt->eth_hdr.dst_addr.addr_bytes[5], rte_be_to_cpu_16(pkt->type))
// Print a MAC address as colon-separated hex (no trailing newline).
static inline void
print_mac(struct rte_ether_addr *mac)
{
	auto *b = mac->addr_bytes;
	printf("%x:%x:%x:%x:%x:%x", b[0], b[1], b[2], b[3], b[4], b[5]);
}
// Print a host-byte-order IPv4 address in dotted-quad form
// (no trailing newline).
static inline void
print_ipv4(uint32_t ip)
{
	printf("%d.%d.%d.%d", (ip >> 24) & 0xff, (ip >> 16) & 0xff,
	    (ip >> 8) & 0xff, ip & 0xff);
}
// Debug helper: pretty-print a packet's Ethernet header and, for IPv4
// frames, its IP header to stdout. Bails out silently when the mbuf is
// shorter than the header being dumped.
// NOTE(review): if rte_pktmbuf_read had to copy into _eth_hdr (header
// not contiguous in the first segment), `eth_hdr + 1` below points past
// the stack copy, not into the mbuf -- confirm headers are always
// contiguous here.
static inline void
dump_pkt(struct rte_mbuf *pkt)
{
	if (rte_pktmbuf_data_len(pkt) < sizeof(struct rte_ether_hdr)) {
		return;
	}
	struct rte_ether_hdr _eth_hdr;
	auto eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_read(
	    pkt, 0, sizeof(struct rte_ether_hdr), &_eth_hdr);
	if (eth_hdr == nullptr) {
		return;
	}
	// ethernet frame
	printf(
	    "Packet %p: Length 0x%x\n", (void *)pkt, rte_pktmbuf_data_len(pkt));
	printf(" Ethernet header:\n");
	printf(" Src:");
	print_mac(&eth_hdr->src_addr);
	printf("\n");
	printf(" Dst:");
	print_mac(&eth_hdr->dst_addr);
	printf("\n");
	printf(" Type: 0x%x\n", rte_be_to_cpu_16(eth_hdr->ether_type));
	uint16_t ether_type = rte_be_to_cpu_16(eth_hdr->ether_type);
	if (ether_type != RTE_ETHER_TYPE_IPV4) {
		return;
	}
	if (rte_pktmbuf_data_len(pkt) <
	    sizeof(struct rte_ether_hdr) + sizeof(struct rte_ipv4_hdr)) {
		return;
	}
	// dump ip header
	auto ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
	printf(" IPv4 header:\n");
	printf(" Src:");
	print_ipv4(rte_be_to_cpu_32(ipv4_hdr->src_addr));
	printf("\n");
	printf(" Dst:");
	print_ipv4(rte_be_to_cpu_32(ipv4_hdr->dst_addr));
	printf("\n");
	printf(" Protocol: 0x%x\n", ipv4_hdr->next_proto_id);
}
// True for packet types that take the PTP L2 timestamping path.
static inline bool
is_l2ts_pkt(uint16_t type)
{
	switch (type) {
	case PKT_TYPE_PROBE:
	case PKT_TYPE_PROBE_RESP:
		return true;
	default:
		return false;
	}
}
// fills the packet with the information except for the payload itself
static inline struct pkt_hdr *
construct_pkt_hdr(
struct rte_mbuf *buf, uint16_t type, const struct conn_spec *conn, int pkt_pad_sz)
{
rte_pktmbuf_reset(buf);
int total_sz = sizeof(struct pkt_hdr) +
expected_payload_size[type];
if (pkt_pad_sz > total_sz) {
total_sz = pkt_pad_sz;
}
auto pkt_data = (struct pkt_hdr *)rte_pktmbuf_append(buf, total_sz);
if (pkt_data == nullptr)
return nullptr;
struct rte_ether_hdr *eth_hdr;
struct rte_ipv4_hdr *ipv4_hdr;
struct rte_udp_hdr *udp_hdr;
bool is_ts_pkt = is_l2ts_pkt(type);
// single segment
buf->nb_segs = 1;
// construct l2 header
eth_hdr = &pkt_data->eth_hdr;
rte_ether_addr_copy(&conn->src->mac_addr, &eth_hdr->src_addr);
if (is_ts_pkt) {
rte_ether_addr_copy(&POU_MAC, &eth_hdr->dst_addr);
} else {
rte_ether_addr_copy(&conn->dst->mac_addr, &eth_hdr->dst_addr);
}
eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
buf->l2_len = sizeof(struct rte_ether_hdr);
// construct l3 header
ipv4_hdr = &pkt_data->ipv4_hdr;
memset(ipv4_hdr, 0, sizeof(struct rte_ipv4_hdr));
ipv4_hdr->version_ihl = IP_VHL_DEF;
ipv4_hdr->type_of_service = 0;
ipv4_hdr->fragment_offset = 0;
ipv4_hdr->time_to_live = IP_DEFTTL;
ipv4_hdr->next_proto_id = IPPROTO_UDP;
ipv4_hdr->packet_id = 0;
ipv4_hdr->src_addr = rte_cpu_to_be_32(conn->src->ip);
if (is_ts_pkt) {
ipv4_hdr->dst_addr = rte_cpu_to_be_32(POU_IP);
} else {
ipv4_hdr->dst_addr = rte_cpu_to_be_32(conn->dst->ip);
}
ipv4_hdr->total_length = rte_cpu_to_be_16(total_sz - sizeof(struct rte_ether_hdr) - sizeof(struct rte_ipv4_hdr));
ipv4_hdr->hdr_checksum = 0;
buf->l3_len = sizeof(struct rte_ipv4_hdr);
// construct l4 header
udp_hdr = &pkt_data->udp_hdr;
udp_hdr->src_port = rte_cpu_to_be_16(conn->src_port);
if (is_ts_pkt) {
udp_hdr->dst_port = rte_cpu_to_be_16(POU_PORT);
} else {
udp_hdr->dst_port = rte_cpu_to_be_16(conn->dst_port);
}
udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
udp_hdr->dgram_len = total_sz - sizeof(struct rte_ether_hdr) - sizeof(struct rte_ipv4_hdr) - sizeof(struct rte_udp_hdr);
buf->l4_len = sizeof(struct rte_udp_hdr);
buf->ol_flags |= RTE_MBUF_F_TX_IPV4;
buf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
buf->ol_flags |= RTE_MBUF_F_TX_UDP_CKSUM;
if (is_ts_pkt) {
// set misc flags
buf->ol_flags |= RTE_MBUF_F_TX_IEEE1588_TMST;
pkt_data->ptp_hdr.ptp_ver = 0x2; // VER 2
pkt_data->ptp_hdr.ptp_msg_type = 0x0; // SYNC
} else {
pkt_data->ptp_hdr.ptp_ver = 0xff; // invalid ver
}
pkt_data->type = rte_cpu_to_be_16(type);
pkt_data->magic = rte_cpu_to_be_32(ETHER_FRAME_MAGIC);
return pkt_data;
}
// returns 0 on success
// Allocates an mbuf from `pool` and builds a `type` packet header in
// it for connection `conn`, padded to at least `pkt_pad_sz` bytes.
// On success *mbuf_out/*hdr_out are set; on failure nothing leaks.
static inline int
alloc_pkt_hdr(struct rte_mempool *pool, uint16_t type,
    const struct conn_spec *conn, int pkt_pad_sz, struct rte_mbuf **mbuf_out,
    struct pkt_hdr **hdr_out)
{
	struct rte_mbuf *mbuf = rte_pktmbuf_alloc(pool);
	if (mbuf == nullptr) {
		return -1;
	}

	struct pkt_hdr *hdr = construct_pkt_hdr(mbuf, type, conn, pkt_pad_sz);
	if (hdr == nullptr) {
		// header didn't fit -- give the mbuf back
		rte_pktmbuf_free(mbuf);
		return -1;
	}

	*mbuf_out = mbuf;
	*hdr_out = hdr;
	return 0;
}
// Validate an incoming mbuf as one of our protocol packets: checks
// minimum length, the frame magic, the type range, and that the
// payload is at least as large as the type requires. When `host_mac`
// is non-null, additionally enforces the destination MAC (the PTP
// multicast address for timestamped packets, `host_mac` otherwise).
// Returns the parsed header, or nullptr if any check fails.
static inline struct pkt_hdr *
check_valid_packet(struct rte_mbuf *pkt, const struct rte_ether_addr *host_mac)
{
	struct pkt_hdr *pkt_data = nullptr;
	const struct rte_ether_addr *expected_mac = nullptr;
	uint16_t type;
	const uint32_t data_len = rte_pktmbuf_data_len(pkt);
	if (data_len < sizeof(struct pkt_hdr)) {
		return nullptr;
	}
	pkt_data = rte_pktmbuf_mtod(pkt, struct pkt_hdr *);
	// check MAGIC
	if (rte_be_to_cpu_32(pkt_data->magic) != ETHER_FRAME_MAGIC) {
		return nullptr;
	}
	type = rte_be_to_cpu_16(pkt_data->type);
	// check type and payload size (reuse the decoded `type` rather
	// than converting pkt_data->type a second time)
	if ((type >= NUM_PKT_TYPES) ||
	    (data_len <
		(sizeof(struct pkt_hdr) + expected_payload_size[type]))) {
		return nullptr;
	}
	// strict dest mac filter
	if (host_mac != nullptr) {
		if (is_l2ts_pkt(type)) {
			// dst mac must be the PTP multicast addr
			expected_mac = &POU_MAC;
		} else {
			// dst mac must match the host mac
			expected_mac = host_mac;
		}
		if (!rte_is_same_ether_addr(
			expected_mac, &pkt_data->eth_hdr.dst_addr))
			return nullptr;
	}
	return pkt_data;
}

26
inc/nms.h Normal file
View File

@ -0,0 +1,26 @@
#pragma once
#include <sys/types.h>
#ifdef __cplusplus
extern "C" {
#endif
int
nms_init(int verbose);
void *
nms_malloc(int nodeid, size_t sz);
void *
nms_alloc_static(int nodeid, size_t sz);
void
nms_free_static(void * buf, size_t sz);
void
nms_free(int nodeid, void * addr);
#ifdef __cplusplus
}
#endif // __cplusplus

38
inc/ntr.h Normal file
View File

@ -0,0 +1,38 @@
#pragma once
#include <stdarg.h>
#include <stdio.h>
#define NTR_LEVEL_NONE (0)
#define NTR_LEVEL_ERROR (1)
#define NTR_LEVEL_WARNING (2)
#define NTR_LEVEL_INFO (3)
#define NTR_LEVEL_DEBUG (4)
#define NTR_LEVEL_DEFAULT (NTR_LEVEL_WARNING)
#define NTR_DEP_NTR (0)
#define NTR_DEP_USER1 (1)
#define NTR_DEP_USER2 (2)
#define NTR_DEP_USER3 (3)
#define NTR_DEP_USER4 (4)
#define NTR_DEP_USER5 (5)
#define NTR_DEP_MAX (NTR_DEP_USER5 + 1)
#ifdef __cplusplus
extern "C" {
#endif
void ntr_init();
__attribute__((format(printf, 3, 4))) void ntr(
int dep, int level, const char *fmt, ...);
void ntr_set_level(int dep, int level);
void ntr_set_output(FILE *f);
int ntr_get_level(int dep);
#ifdef __cplusplus
}
#endif

View File

@ -1,61 +0,0 @@
#pragma once
#include <stdio.h>
#define NTR_LEVEL_NONE (0)
#define NTR_LEVEL_ERROR (1)
#define NTR_LEVEL_WARNING (2)
#define NTR_LEVEL_INFO (3)
#define NTR_LEVEL_DEBUG (4)
#define NTR_LEVEL_DEFAULT (NTR_LEVEL_WARNING)
#define NTR_DEP_NTR (0)
#define NTR_DEP_USER1 (1)
#define NTR_DEP_USER2 (2)
#define NTR_DEP_USER3 (3)
#define NTR_DEP_USER4 (4)
#define NTR_DEP_USER5 (5)
#define NTR_DEP_MAX (NTR_DEP_USER5 + 1)
#define NTR_DECL_IMPL \
int ntr_log_levels[NTR_DEP_MAX] = {NTR_LEVEL_DEFAULT}; \
FILE * ntr_out = stdout
extern int ntr_log_levels[];
extern FILE * ntr_out;
static inline
void ntr(int dep, int level, const char * fmt, ...)
{
va_list vl;
va_start(vl, fmt);
if (dep < NTR_DEP_MAX && level <= ntr_log_levels[dep]) {
vfprintf(ntr_out, fmt, vl);
}
va_end(vl);
}
static inline
void ntr_set_level(int dep, int level)
{
if (dep < NTR_DEP_MAX) {
ntr_log_levels[dep] = level;
}
}
static inline
void ntr_set_output(FILE * f)
{
if (f != NULL) {
ntr_out = f;
}
}
static inline
int ntr_get_level(int dep)
{
if (dep < NTR_DEP_MAX) {
return ntr_log_levels[dep];
}
return 0;
}

175
inc/pkt.h
View File

@ -1,175 +0,0 @@
#pragma once
#include <rte_mbuf_core.h>
#include <rte_mbuf.h>
#include <rte_udp.h>
#include <rte_byteorder.h>
#include <rte_ip.h>
#include <stdint.h>
#include <rte_flow.h>
#include <rte_ether.h>
#include <unistd.h>
#include <rte_net.h>
#include <rte_vxlan.h>
#define IP_DEFTTL 64 /* from RFC 1340. */
#define IP_VERSION 0x40
#define IP_HDRLEN 0x05 /* default IP header length == five 32-bits words. */
#define IP_VHL_DEF (IP_VERSION | IP_HDRLEN)
#define IP_ADDR_FMT_SIZE 15
constexpr static uint32_t ETHER_FRAME_MAGIC = 0xDCDCE5E5;
struct packet_hdr {
struct rte_ether_hdr eth_hdr;
struct rte_ipv4_hdr ipv4_hdr;
struct rte_udp_hdr udp_hdr;
} __attribute__((packed));
struct packet_data
{
struct packet_hdr pkt_hdr;
uint32_t magic;
uint32_t epoch;
uint64_t clt_ts_tx;
uint64_t clt_ts_rx;
uint64_t srv_ts_tx;
uint64_t srv_ts_rx;
};
static inline void
print_mac(struct rte_ether_addr * mac)
{
printf("%x:%x:%x:%x:%x:%x", mac->addr_bytes[0],
mac->addr_bytes[1],
mac->addr_bytes[2],
mac->addr_bytes[3],
mac->addr_bytes[4],
mac->addr_bytes[5]);
}
static inline void
print_ipv4(uint32_t ip)
{
printf("%d-%d-%d-%d", (ip >> 24) & 0xff,
(ip >> 16) & 0xff,
(ip >> 8) & 0xff,
(ip >> 0) & 0xff);
}
static inline void
dump_pkt(struct rte_mbuf *pkt)
{
if(rte_pktmbuf_data_len(pkt) < sizeof(struct rte_ether_hdr)) {
return;
}
struct rte_ether_hdr _eth_hdr;
struct rte_ether_hdr * eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_read(pkt, 0, sizeof(struct rte_ether_hdr), &_eth_hdr);
if (eth_hdr == NULL) {
return;
}
// ethernet frame
printf("Packet %p: Length 0x%x\n", (void*)pkt, rte_pktmbuf_data_len(pkt));
printf(" Ethernet header:\n");
printf(" Src:");
print_mac(&eth_hdr->s_addr);
printf("\n");
printf(" Dst:");
print_mac(&eth_hdr->d_addr);
printf("\n");
printf(" Type: 0x%x\n", rte_be_to_cpu_16(eth_hdr->ether_type));
uint16_t ether_type = rte_be_to_cpu_16(eth_hdr->ether_type);
if (ether_type != RTE_ETHER_TYPE_IPV4) {
return;
}
if(rte_pktmbuf_data_len(pkt) < sizeof(struct rte_ether_hdr) + sizeof(struct rte_ipv4_hdr)) {
return;
}
// dump ip header
struct rte_ipv4_hdr * ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
printf(" IPv4 header:\n");
printf(" Src:");
print_ipv4(rte_be_to_cpu_32(ipv4_hdr->src_addr));
printf("\n");
printf(" Dst:");
print_ipv4(rte_be_to_cpu_32(ipv4_hdr->dst_addr));
printf("\n");
printf(" Protocol: 0x%x\n", ipv4_hdr->next_proto_id);
}
static inline
struct packet_data * construct_udp_pkt_hdr(struct rte_mbuf * buf,
struct rte_ether_addr * src_mac, struct rte_ether_addr * dst_mac,
uint32_t src_ip, uint32_t dst_ip, uint16_t src_port, uint16_t dst_port)
{
rte_pktmbuf_reset(buf);
struct packet_data * pkt_data = (struct packet_data *)rte_pktmbuf_append(buf, sizeof(struct packet_data));
struct rte_ether_hdr * eth_hdr;
struct rte_ipv4_hdr * ipv4_hdr;
struct rte_udp_hdr * udp_hdr;
if (pkt_data == NULL)
return NULL;
// single segment
buf->nb_segs = 1;
// construct l2 header
eth_hdr = &pkt_data->pkt_hdr.eth_hdr;
rte_ether_addr_copy(src_mac, &eth_hdr->s_addr);
rte_ether_addr_copy(dst_mac, &eth_hdr->d_addr);
eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
buf->l2_len = sizeof(struct rte_ether_hdr);
// construct l3 header
ipv4_hdr = &pkt_data->pkt_hdr.ipv4_hdr;
memset(ipv4_hdr, 0, sizeof(struct rte_ipv4_hdr));
ipv4_hdr->version_ihl = IP_VHL_DEF;
ipv4_hdr->type_of_service = 0;
ipv4_hdr->fragment_offset = 0;
ipv4_hdr->time_to_live = IP_DEFTTL;
ipv4_hdr->next_proto_id = IPPROTO_UDP;
ipv4_hdr->packet_id = 0;
ipv4_hdr->src_addr = rte_cpu_to_be_32(src_ip);
ipv4_hdr->dst_addr = rte_cpu_to_be_32(dst_ip);
ipv4_hdr->total_length = rte_cpu_to_be_16(sizeof(struct packet_data) - sizeof(struct rte_ether_hdr));
ipv4_hdr->hdr_checksum = 0;
buf->l3_len = sizeof(struct rte_ipv4_hdr);
// construct l4 header
udp_hdr = &pkt_data->pkt_hdr.udp_hdr;
udp_hdr->src_port = rte_cpu_to_be_16(src_port);
udp_hdr->dst_port = rte_cpu_to_be_16(dst_port);
udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
udp_hdr->dgram_len = rte_cpu_to_be_16(sizeof(struct packet_data) -
sizeof(struct rte_ether_hdr) -
sizeof(struct rte_udp_hdr));
buf->l4_len = sizeof(struct rte_udp_hdr);
return pkt_data;
}
static inline
struct packet_data * check_valid_packet(struct rte_mbuf * pkt)
{
struct packet_data * pkt_data = NULL;
if (rte_pktmbuf_data_len(pkt) < sizeof(struct packet_data)) {
return NULL;
}
pkt_data = rte_pktmbuf_mtod(pkt, struct packet_data *);
if (rte_be_to_cpu_32(pkt_data->magic) == ETHER_FRAME_MAGIC) {
return pkt_data;
}
return NULL;
}

View File

@ -0,0 +1,56 @@
#pragma once
#include "storage/drivers/driver.hh"
#include "spdk/bdev.h"
#include "spdk/bdev_zone.h"
#include "spdk/thread.h"
// birb_driver backed by an SPDK block device (bdev), opened by name in
// the constructor; get_status() reports whether construction succeeded.
class birb_bdev_driver : public birb_driver
{
public:
	birb_bdev_driver(const char * dev_name);
	~birb_bdev_driver() override;
	size_t get_capacity() override;
	birb_driver_status get_status() override;
	struct spdk_bdev * get_bdev();
	struct spdk_bdev_desc * get_bdev_desc();
	birb_driver_type get_type() override;
	size_t get_align() override;

private:
	DISALLOW_EVIL_CONSTRUCTORS(birb_bdev_driver);
	struct spdk_bdev_desc * bdev_desc;
	struct spdk_bdev * bdev;
	size_t block_sz;	// presumably device block size in bytes -- confirm in the .cc
	size_t block_num;	// presumably total block count -- confirm in the .cc
	birb_driver_status status;
	static void print_all_bdev();
	// SPDK bdev event callback registered when opening the device.
	static void bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev * bdev,
	    void * event_ctx);
};
// Per-thread I/O context for birb_bdev_driver: owns an SPDK io_channel
// and issues asynchronous reads/writes whose completions are delivered
// through io_callback.
class birb_bdev_thread_context : public birb_driver_thread_context
{
public:
	birb_bdev_thread_context(birb_bdev_driver * driver);
	~birb_bdev_thread_context() override;
	int read(size_t offset, size_t size, char * buffer, callback callback, void * context) override;
	int write(size_t offset, size_t size, char * buffer, callback callback, void * context) override;
	void poll() override;
	birb_driver::birb_driver_status get_status() override;

private:
	// Pairs a user callback with its opaque context across an async op.
	struct cb_context {
		callback cb;
		void * ctx;
	};
	DISALLOW_EVIL_CONSTRUCTORS(birb_bdev_thread_context);
	spdk_io_channel * io_channel;
	birb_driver::birb_driver_status status;
	birb_bdev_driver * driver;	// borrowed, not owned
	// SPDK completion trampoline; forwards to the stored cb_context.
	static void io_callback(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
};

View File

@ -0,0 +1,47 @@
#pragma once
#include "defs.hh"
#include "spdk/thread.h"
#include <cstdlib>
// Abstract storage-driver interface: reports capacity, init status,
// required buffer alignment, and the concrete backend type.
class birb_driver
{
private:
DISALLOW_EVIL_CONSTRUCTORS(birb_driver);
public:
// Coarse success/failure status of driver initialization.
enum birb_driver_status{
BIRB_SUCCESS,
BIRB_FAIL
};
// Concrete backend kind (raw NVMe vs. SPDK bdev).
enum birb_driver_type{
BIRB_DRV_NVME,
BIRB_DRV_BDEV
};
virtual size_t get_capacity() = 0; // capacity in bytes
virtual birb_driver_status get_status() = 0;
virtual size_t get_align() = 0; // required I/O buffer alignment
virtual birb_driver_type get_type() = 0;
virtual ~birb_driver() = default;
protected:
birb_driver() = default;
};
// Abstract per-thread I/O context. read()/write() are asynchronous; the
// supplied callback is invoked with (success, context) during poll().
class birb_driver_thread_context
{
private:
DISALLOW_EVIL_CONSTRUCTORS(birb_driver_thread_context);
public:
// Completion callback signature: (success, user context).
using callback = void (*)(bool, void *);
virtual int read(size_t offset, size_t size, char * buffer, callback callback, void * context) = 0;
virtual int write(size_t offset, size_t size, char * buffer, callback callback, void * context) = 0;
// Process pending completions on the calling thread.
virtual void poll() = 0;
virtual birb_driver::birb_driver_status get_status() = 0;
virtual ~birb_driver_thread_context() = default;
protected:
birb_driver_thread_context() = default;
};

View File

@ -0,0 +1,65 @@
#pragma once
#include "storage/drivers/driver.hh"
#include "spdk/nvme.h"
#include "spdk/thread.h"
// birb_driver implementation backed directly by an SPDK NVMe controller
// namespace. The constructor probes and attaches the controller identified
// by `dev_name`; check get_status() before use.
class birb_nvme_driver : public birb_driver
{
public:
birb_nvme_driver(const char * dev_name);
~birb_nvme_driver() override;
size_t get_capacity() override;
birb_driver_status get_status() override;
birb_driver_type get_type() override;
size_t get_align() override;
// Raw SPDK handles used by birb_nvme_thread_context to create its qpair.
spdk_nvme_ctrlr * get_ctrlr();
spdk_nvme_ns * get_ns();
spdk_nvme_io_qpair_opts * get_io_qpair_opts();
private:
// Carries results between the static probe/attach callbacks and the
// constructor.
struct attach_context {
spdk_nvme_ctrlr ** ctrlr;
spdk_nvme_ns ** ns;
const char * dev_name;
int valid; // presumably nonzero once a matching device attached — TODO confirm
};
DISALLOW_EVIL_CONSTRUCTORS(birb_nvme_driver);
birb_driver_status status;
spdk_nvme_ctrlr * ctrlr;
spdk_nvme_ns * ns;
spdk_nvme_io_qpair_opts opts;
// SPDK probe/attach callbacks (see spdk_nvme_probe()).
static bool probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, struct spdk_nvme_ctrlr_opts *opts);
static void attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts);
};
// Per-thread I/O context for birb_nvme_driver: owns an NVMe I/O qpair and
// issues asynchronous reads/writes whose completions are delivered from
// poll() via the user-supplied callback.
class birb_nvme_thread_context : public birb_driver_thread_context
{
public:
birb_nvme_thread_context(birb_nvme_driver * driver);
~birb_nvme_thread_context() override;
// Asynchronous I/O at byte offset `offset` for `size` bytes; `callback`
// fires with (success, context) once the request completes.
int read(size_t offset, size_t size, char * buffer, callback callback, void * context) override;
int write(size_t offset, size_t size, char * buffer, callback callback, void * context) override;
// Drive completion processing for this context's qpair.
void poll() override;
birb_driver::birb_driver_status get_status() override;
private:
// Pairs the user's callback with its opaque context for the SPDK
// completion trampoline.
struct cb_context {
callback cb;
void * ctx;
};
DISALLOW_EVIL_CONSTRUCTORS(birb_nvme_thread_context);
birb_driver::birb_driver_status status;
birb_nvme_driver * driver;
struct spdk_nvme_qpair * qpair;
// NVMe completion callback; forwards to the stored cb_context.
static void io_callback(void *arg, const struct spdk_nvme_cpl *completion);
// Byte size -> LBA count / byte offset -> starting LBA conversions.
static uint32_t size_to_lba(size_t size, int lba_size);
static uint64_t addr_to_lba(size_t addr, int lba_size);
};

View File

@ -0,0 +1,47 @@
#pragma once
#include "defs.hh"
#include "spdk/thread.h"
#include <cstdlib>
// Abstract storage-driver interface: reports capacity, init status,
// required buffer alignment, and the concrete backend type.
class birb_driver
{
private:
DISALLOW_EVIL_CONSTRUCTORS(birb_driver);
public:
// Coarse success/failure status of driver initialization.
enum birb_driver_status{
BIRB_SUCCESS,
BIRB_FAIL
};
// Concrete backend kind (raw NVMe vs. SPDK bdev).
enum birb_driver_type{
BIRB_DRV_NVME,
BIRB_DRV_BDEV
};
virtual size_t get_capacity() = 0; // capacity in bytes
virtual birb_driver_status get_status() = 0;
virtual size_t get_align() = 0; // required I/O buffer alignment
virtual birb_driver_type get_type() = 0;
virtual ~birb_driver() = default;
protected:
birb_driver() = default;
};
// Abstract per-thread I/O context. read()/write() are asynchronous; the
// supplied callback is invoked with (success, context) during poll().
class birb_driver_thread_context
{
private:
DISALLOW_EVIL_CONSTRUCTORS(birb_driver_thread_context);
public:
// Completion callback signature: (success, user context).
using callback = void (*)(bool, void *);
virtual int read(size_t offset, size_t size, char * buffer, callback callback, void * context) = 0;
virtual int write(size_t offset, size_t size, char * buffer, callback callback, void * context) = 0;
// Process pending completions on the calling thread.
virtual void poll() = 0;
virtual birb_driver::birb_driver_status get_status() = 0;
virtual ~birb_driver_thread_context() = default;
protected:
birb_driver_thread_context() = default;
};

53
inc/storage/io_gen.hh Normal file
View File

@ -0,0 +1,53 @@
#pragma once
#include <sys/endian.h>
#include <sys/types.h>
#include "defs.hh"
#include "gen.hh"
#include <random>
// Operation type for a generated I/O request.
enum io_generator_opcode {
IOGEN_READ,
IOGEN_WRITE
};
// Address pattern: sequential offsets (wrapping at capacity) or uniformly
// random offsets.
enum io_generator_address_mode {
IOGEN_ADDR_MONOTONIC_INCREASING,
IOGEN_ADDR_UNIFORM_RANDOM
};
// One generated request: its size, starting byte offset, and opcode.
struct io_generator_ctx {
unsigned long size;
uint64_t offset;
io_generator_opcode op;
};
//
// cur_offset is aligned to req_size
//
// Synthetic I/O workload generator: produces fixed-size requests with a
// configurable read percentage and address pattern.
class io_generator {
public:
// Produce the next request into `ctx`; `buf` is the caller's I/O buffer.
int issue(struct io_generator_ctx * ctx, char * buf);
io_generator(unsigned long req_size,
unsigned long capacity,
unsigned int read_pct,
io_generator_address_mode addr_mode);
io_generator() = delete;
private:
unsigned long cur_offset; // next sequential offset (monotonic mode)
const unsigned long capacity; // total addressable bytes
const unsigned long req_size; // fixed size of every request
const unsigned int read_pct; // percentage of requests that are reads
const io_generator_address_mode addr_mode;
// RNG for the read/write decision...
std::random_device rd;
std::mt19937 rng;
std::uniform_int_distribution<int> dist;
// ...and an independent RNG for random addressing.
std::random_device addr_rd;
std::mt19937 addr_rng;
std::uniform_int_distribution<uint64_t> addr_dist;
DISALLOW_EVIL_CONSTRUCTORS(io_generator);
};

View File

@ -1,378 +0,0 @@
#include <cstdio>
#include <cstdlib>
#include <rte_common.h>
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_cycles.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include <rte_byteorder.h>
#include <rte_config.h>
#include <rte_ether.h>
#include <rte_launch.h>
#include <atomic>
#include <unistd.h>
#include "pkt.h"
#include "ntrlog.h"
#include "rte_arp.h"
#include "rte_mbuf_core.h"
NTR_DECL_IMPL;
// mbuf pool sizing and per-port ring configuration for this DPDK app.
constexpr unsigned int MBUF_MAX_COUNT = 8191;
constexpr unsigned int MBUF_CACHE_SIZE = 250;
constexpr unsigned int RX_RING_SIZE = 1024;
constexpr unsigned int TX_RING_SIZE = 1024;
constexpr unsigned int RX_RING_NUM = 1;
constexpr unsigned int TX_RING_NUM = 1;
// Packets drained/queued per rte_eth_rx_burst / rte_eth_tx_burst call.
constexpr unsigned int BURST_SIZE = 32;
static const struct rte_eth_conf port_conf_default{};
// Global run-time state: active port, its MAC address, and the mempool
// used to allocate TX packets.
struct options_t {
//states
uint16_t s_portid;
struct rte_ether_addr s_host_mac;
struct rte_mempool * s_pkt_mempool;
};
struct options_t options;
static uint16_t
rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused, void *_ __rte_unused)
{
	// RX callback: stamp every valid probe packet with the server-side
	// receive timestamp (TSC, stored big-endian).
	const uint64_t now = rte_rdtsc();
	for (int idx = 0; idx < nb_pkts; idx++) {
		struct packet_data *pd = check_valid_packet(pkts[idx]);
		if (pd == NULL) {
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "rx_add_timestamp: ignoring invalid packet %p.\n", (void*)pkts[idx]);
			continue;
		}
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "rx_add_timestamp: tagged packet %p with %llu.\n", (void*)pkts[idx], now);
		pd->srv_ts_rx = rte_cpu_to_be_64(now);
	}
	return nb_pkts;
}
static uint16_t
tx_calc_latency(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
{
	// TX callback: stamp every valid probe packet with the server-side
	// transmit timestamp (TSC, stored big-endian) just before it leaves.
	const uint64_t now = rte_rdtsc();
	for (int idx = 0; idx < nb_pkts; idx++) {
		struct packet_data *pd = check_valid_packet(pkts[idx]);
		if (pd == NULL) {
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "tx_calc_latency: ignoring invalid packet %p.\n", (void*)pkts[idx]);
			continue;
		}
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "tx_calc_latency: tagged packet %p with %llu.\n", (void*)pkts[idx], now);
		pd->srv_ts_tx = rte_cpu_to_be_64(now);
	}
	return nb_pkts;
}
// Worker lcore loop: receive probe packets, echo each one back to its
// sender with all timestamps copied, and free the RX mbufs. Never returns
// in normal operation.
static int
locore_main(void * _unused __rte_unused)
{
struct rte_mbuf *bufs[BURST_SIZE];
struct rte_mbuf *tx_bufs[BURST_SIZE];
struct packet_data *pkt_data;
uint32_t core_id = rte_lcore_id();
// Warn when the NIC sits on a different NUMA node than this core.
if (rte_eth_dev_socket_id(options.s_portid) > 0 && rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "locore_main: WARNING, port %d is on remote NUMA node to "
"polling thread.\n\tPerformance will "
"not be optimal.\n", options.s_portid);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d running.\n", core_id);
while(true) {
uint16_t nb_tx = 0;
// Busy-poll the RX queue.
const uint16_t nb_rx = rte_eth_rx_burst(options.s_portid, 0, bufs, BURST_SIZE);
if (nb_rx == 0) {
continue;
}
for(int i = 0; i < nb_rx; i++) {
// Drop anything that is not one of our probe packets.
pkt_data = check_valid_packet(bufs[i]);
if (pkt_data == NULL) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: core %d skipping invalid packet %p.\n", core_id, (void*)bufs[i]);
dump_pkt(bufs[i]);
rte_pktmbuf_free(bufs[i]);
continue;
}
uint32_t dst_ip = rte_be_to_cpu_32(pkt_data->pkt_hdr.ipv4_hdr.dst_addr);
uint32_t src_ip = rte_be_to_cpu_32(pkt_data->pkt_hdr.ipv4_hdr.src_addr);
uint16_t src_port = rte_be_to_cpu_16(pkt_data->pkt_hdr.udp_hdr.src_port);
uint16_t dst_port = rte_be_to_cpu_16(pkt_data->pkt_hdr.udp_hdr.dst_port);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d packet %p from %d.%d.%d.%d(%x:%x:%x:%x:%x:%x) to %d.%d.%d.%d(%x:%x:%x:%x:%x:%x), sport %d, dport %d, epoch %d\n",
core_id,
(void*)bufs[i],
(src_ip >> 24) & 0xff,
(src_ip >> 16) & 0xff,
(src_ip >> 8) & 0xff,
(src_ip >> 0) & 0xff,
pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[0],
pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[1],
pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[2],
pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[3],
pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[4],
pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[5],
(dst_ip >> 24) & 0xff,
(dst_ip >> 16) & 0xff,
(dst_ip >> 8) & 0xff,
(dst_ip >> 0) & 0xff,
pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[0],
pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[1],
pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[2],
pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[3],
pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[4],
pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[5],
src_port,
dst_port,
rte_be_to_cpu_32(pkt_data->epoch));
// swap s_addr and d_addr
// Build the reply in a freshly allocated mbuf with source and
// destination (MAC/IP/port) reversed.
struct rte_mbuf * pkt_buf = rte_pktmbuf_alloc(options.s_pkt_mempool);
if (pkt_buf == NULL) {
rte_exit(EXIT_FAILURE, "locore_main: failed to allocate memory for pkt_buf");
}
struct packet_data * tx_data = construct_udp_pkt_hdr(pkt_buf,
&options.s_host_mac,
&pkt_data->pkt_hdr.eth_hdr.s_addr,
dst_ip,
src_ip,
dst_port,
src_port);
if (tx_data == NULL) {
rte_exit(EXIT_FAILURE, "failed to construct tx packet %p", (void*)pkt_buf);
}
// copy, endianess doesn't matter
tx_data->epoch = pkt_data->epoch;
tx_data->magic = pkt_data->magic;
tx_data->clt_ts_rx = pkt_data->clt_ts_rx;
tx_data->clt_ts_tx = pkt_data->clt_ts_tx;
tx_data->srv_ts_rx = pkt_data->srv_ts_rx;
tx_data->srv_ts_tx = pkt_data->srv_ts_tx;
// queue for burst send
tx_bufs[nb_tx++] = pkt_buf;
// free rx packet
rte_pktmbuf_free(bufs[i]);
}
const uint16_t nb_tx_succ = rte_eth_tx_burst(options.s_portid, 0, tx_bufs, nb_tx);
// cleanup unsent packets
// don't need to free others because it's offloaded
if (nb_tx_succ < nb_tx) {
rte_exit(EXIT_FAILURE, "locore_main: failed to send some packets.\n");
}
}
return 0;
}
// Bring up one Ethernet port: enable checksum offloads, configure RX/TX
// queues, start the port, enable promiscuous mode, and register the
// timestamping callbacks. Returns 0 on success, a negative/nonzero rte
// error code otherwise.
static int
port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
{
struct rte_eth_dev_info dev_info;
struct rte_eth_conf port_conf = port_conf_default;
struct rte_eth_txconf txconf;
struct rte_eth_rxconf rxconf;
uint16_t nb_rxd = RX_RING_SIZE;
uint16_t nb_txd = TX_RING_SIZE;
if(!rte_eth_dev_is_valid_port(portid)) {
return -1;
}
int ret = rte_eth_dev_info_get(portid, &dev_info);
if (ret != 0) {
return ret;
}
// Offload IPv4/UDP checksums to the NIC in both directions and let the
// NIC free TX mbufs (MBUF_FAST_FREE).
port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN;
port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM;
port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM;
port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;
/* Configure the Ethernet device. */
ret = rte_eth_dev_configure(portid, RX_RING_NUM, TX_RING_NUM, &port_conf);
if (ret != 0)
return ret;
// Let the driver clamp descriptor counts to what the HW supports.
ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
if (ret != 0)
return ret;
/* Allocate and set up 1 RX queue per Ethernet port. */
rxconf = dev_info.default_rxconf;
for (uint32_t i = 0; i < RX_RING_NUM; i++) {
ret = rte_eth_rx_queue_setup(portid, i, nb_rxd, rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool);
if (ret < 0)
return ret;
}
txconf = dev_info.default_txconf;
txconf.offloads = port_conf.txmode.offloads;
/* Allocate and set up 1 TX queue per Ethernet port. */
for (uint32_t i = 0; i < TX_RING_NUM; i++) {
ret = rte_eth_tx_queue_setup(portid, i, nb_txd, rte_eth_dev_socket_id(portid), &txconf);
if (ret < 0)
return ret;
}
ret = rte_eth_dev_start(portid);
if (ret < 0)
return ret;
/* Display the port MAC address. */
struct rte_ether_addr addr;
ret = rte_eth_macaddr_get(portid, &addr);
if (ret != 0)
return ret;
/* Enable RX in promiscuous mode for the Ethernet device. */
ret = rte_eth_promiscuous_enable(portid);
if (ret != 0)
return ret;
// Install the per-queue timestamping hooks (queue 0 only — this app
// configures a single RX and TX queue).
if (rte_eth_add_tx_callback(portid, 0, tx_calc_latency, NULL) == NULL || rte_eth_add_rx_callback(portid, 0, rx_add_timestamp, NULL) == NULL) {
return -1;
}
return 0;
}
static void usage()
{
	// Print command-line help to stdout.
	fputs(
	    "Usage:\n" \
	    " -v(vv): verbose mode\n" \
	    " -h: display the information\n",
	    stdout);
}
// Entry point: initialize EAL, parse app flags, set up mbuf pools and the
// first available port, then run the echo loop on the next lcore.
int main(int argc, char* argv[])
{
unsigned int nb_ports;
struct rte_mempool *mbuf_pool, *mbuf_pool_pkt;
// init dpdk
int ret = rte_eal_init(argc, argv);
if (ret < 0) {
rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n");
}
// Skip the EAL arguments; what remains are this app's own flags.
argc -= ret;
argv += ret;
// set warning level
ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
{
int c;
// parse arguments
while((c = getopt(argc, argv, "hv")) != -1) {
switch (c) {
case 'v':
// Each -v raises verbosity by one level.
ntr_set_level(NTR_DEP_USER1, ntr_get_level(NTR_DEP_USER1) + 1);
break;
case 'h':
usage();
rte_exit(EXIT_SUCCESS, NULL);
break;
default:
usage();
rte_exit(EXIT_SUCCESS, "unknown argument: %c", c);
break;
}
}
}
// XXX: singal handler to exit
nb_ports = rte_eth_dev_count_avail();
if (nb_ports == 0) {
rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
}
// create a mbuf memory pool on the socket
// (one pool for RX, a second dedicated pool for TX replies)
mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
if (mbuf_pool == nullptr) {
rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
}
// create a pkt mbuf memory pool on the socket
mbuf_pool_pkt = rte_pktmbuf_pool_create("MBUF_POOL_PKT", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
if (mbuf_pool_pkt == nullptr) {
rte_exit(EXIT_FAILURE, "cannot create mbuf_pkt pool\n");
}
options.s_pkt_mempool = mbuf_pool_pkt;
// Use the first available port.
uint16_t portid = rte_eth_find_next(0);
if (portid == RTE_MAX_ETHPORTS) {
rte_exit(EXIT_FAILURE, "cannot find an available port\n");
}
options.s_portid = portid;
if (port_init(portid, mbuf_pool) != 0) {
rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid);
}
if (rte_eth_macaddr_get(portid, &options.s_host_mac) != 0) {
rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n", portid);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid,
options.s_host_mac.addr_bytes[0],
options.s_host_mac.addr_bytes[1],
options.s_host_mac.addr_bytes[2],
options.s_host_mac.addr_bytes[3],
options.s_host_mac.addr_bytes[4],
options.s_host_mac.addr_bytes[5]);
// Run the echo loop on the first worker lcore and block on it.
uint16_t lcore_id = rte_get_next_lcore(0, true, false);
if (lcore_id == RTE_MAX_LCORE) {
rte_exit(EXIT_FAILURE, "cannot detect lcores.\n");
}
if (rte_eal_remote_launch(locore_main, NULL, lcore_id) != 0) {
rte_exit(EXIT_FAILURE, "failed to launch function on locore %d\n", lcore_id);
}
// while(true) {
// struct rte_eth_stats stats;
// rte_eth_stats_get(portid, &stats);
// printf("recv: %d missed: %d err: %d\n",(uint32_t)stats.ipackets, (uint32_t)stats.imissed,(uint32_t)stats.ierrors);
// usleep(1000000);
// }
if (rte_eal_wait_lcore(lcore_id) != 0) {
rte_exit(EXIT_FAILURE, "failed to wait for locore %d\n", lcore_id);
}
// shouldn't get here
return 0;
}

95
libgen/generator.cc Normal file
View File

@ -0,0 +1,95 @@
// modified from mutilate
#include "gen.hh"
// Facebook memcached key-size distribution (GEV fit, taken from the
// mutilate benchmark this file is derived from).
Generator *
createFacebookKey()
{
return new GEV(30.7984, 8.20449, 0.078688);
}
Generator *
createFacebookValue()
{
	// Facebook memcached value-size distribution: a generalized-Pareto
	// body with explicit point masses for the smallest sizes (from the
	// mutilate benchmark this file is derived from).
	Generator *body = new GPareto(15.0, 214.476, 0.348238);
	Discrete *dist = new Discrete(body);
	static const struct { double weight; double value; } point_masses[] = {
		{ 0.00536, 0.0 },  { 0.00047, 1.0 },  { 0.17820, 2.0 },
		{ 0.09239, 3.0 },  { 0.00018, 4.0 },  { 0.02740, 5.0 },
		{ 0.00065, 6.0 },  { 0.00606, 7.0 },  { 0.00023, 8.0 },
		{ 0.00837, 9.0 },  { 0.00837, 10.0 }, { 0.08989, 11.0 },
		{ 0.00092, 12.0 }, { 0.00326, 13.0 }, { 0.01980, 14.0 },
	};
	for (const auto &pm : point_masses)
		dist->add(pm.weight, pm.value);
	return dist;
}
// Facebook memcached inter-arrival time distribution (generalized Pareto,
// taken from the mutilate benchmark this file is derived from).
Generator *
createFacebookIA()
{
return new GPareto(0, 16.0292, 0.154971);
}
// Parse a generator specification string and build the matching Generator.
// Accepted forms:
//   "fb_key" / "fb_value" / "fb_ia"  - canned Facebook distributions
//   "<number>"                       - Fixed(<number>)
//   "<name>[:a1[,a2[,a3]]]"          - e.g. "normal:1,2", "gev:30,8,0.07"
// Missing arguments default to 0.0. DIEs on an unrecognized name.
Generator *
createGenerator(std::string str)
{
	if (!strcmp(str.c_str(), "fb_key"))
		return createFacebookKey();
	else if (!strcmp(str.c_str(), "fb_value"))
		return createFacebookValue();
	else if (!strcmp(str.c_str(), "fb_ia"))
		return createFacebookIA();
	char *s_copy = new char[str.length() + 1];
	strcpy(s_copy, str.c_str());
	char *saveptr = NULL;
	// A bare number means a fixed distribution.
	if (atoi(s_copy) != 0 || !strcmp(s_copy, "0")) {
		double v = atof(s_copy);
		delete[] s_copy;
		return new Fixed(v);
	}
	char *t_ptr = strtok_r(s_copy, ":", &saveptr);
	char *a_ptr = strtok_r(NULL, ":", &saveptr);
	if (t_ptr == NULL) // || a_ptr == NULL)
		DIE("strtok(.., \":\") failed to parse %s", str.c_str());
	saveptr = NULL;
	// BUGFIX: only tokenize the argument list when one is present.
	// Calling strtok_r(NULL, ...) with a NULL save pointer (which
	// happened whenever the spec had no ":") is undefined behavior and
	// crashes on glibc.
	char *s1 = a_ptr ? strtok_r(a_ptr, ",", &saveptr) : NULL;
	char *s2 = s1 ? strtok_r(NULL, ",", &saveptr) : NULL;
	char *s3 = s2 ? strtok_r(NULL, ",", &saveptr) : NULL;
	double a1 = s1 ? atof(s1) : 0.0;
	double a2 = s2 ? atof(s2) : 0.0;
	double a3 = s3 ? atof(s3) : 0.0;
	delete[] s_copy;
	// Match the distribution name anywhere in the original string.
	if (strcasestr(str.c_str(), "fixed"))
		return new Fixed(a1);
	else if (strcasestr(str.c_str(), "normal"))
		return new Normal(a1, a2);
	else if (strcasestr(str.c_str(), "exponential"))
		return new Exponential(a1);
	else if (strcasestr(str.c_str(), "pareto"))
		return new GPareto(a1, a2, a3);
	else if (strcasestr(str.c_str(), "gev"))
		return new GEV(a1, a2, a3);
	else if (strcasestr(str.c_str(), "uniform"))
		return new Uniform(a1);
	DIE("Unable to create Generator '%s'", str.c_str());
	return NULL;
}

276
libgen/loadgen.cc Normal file
View File

@ -0,0 +1,276 @@
#include <sys/types.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>
#include <sys/endian.h>
#include <sys/thr.h>
#include <pthread.h>
#include <pthread_np.h>
#include <topo.h>
#include <unistd.h>
#include "nms.h"
#include "gen.hh"
#include <atomic>
// Worker thread body. Allocates (or receives) a source/destination buffer
// pair, signals init completion through tinfo->init_status (1 = ok,
// -1 = failure), then runs a rate-paced memcpy loop driven by the shared
// state machine until STATE_END.
void *
memload_generator::worker_thrd(void *_tinfo)
{
auto *tinfo = (struct thread_info *)_tinfo;
void *from_buffer, *to_buffer, *tmp;
if (tinfo->opts->shared_buffer) {
// Buffers were allocated once by the constructor.
from_buffer = tinfo->from_buffer;
to_buffer = tinfo->to_buffer;
} else {
// Per-thread buffers: source on this core's NUMA domain, destination
// on the target domain.
if (tinfo->opts->verbose) {
fprintf(stdout,
"memload_generator <thread %d>: allocating fbuf %lu bytes on domain %d...\n",
tinfo->tid, tinfo->opts->buffer_size,
topo_core_to_numa(tinfo->coreid));
}
from_buffer = nms_alloc_static(topo_core_to_numa(
tinfo->coreid),
tinfo->opts->buffer_size);
if (tinfo->opts->verbose) {
fprintf(stdout,
"memload_generator <thread %d>: allocating tbuf %lu bytes on domain %d...\n",
tinfo->tid, tinfo->opts->buffer_size, tinfo->target_dom);
}
to_buffer = nms_alloc_static(tinfo->target_dom,
tinfo->opts->buffer_size);
}
if (from_buffer == nullptr || to_buffer == nullptr) {
if (tinfo->opts->verbose) {
fprintf(stderr,
"memload_generator <thread %d>: failed to allocate memory\n",
tinfo->tid);
}
// Report allocation failure to the constructor and bail out.
tinfo->init_status.store(-1);
return nullptr;
}
// Pull mode copies remote -> local: swap the buffer roles.
if (tinfo->pull) {
tmp = from_buffer;
from_buffer = to_buffer;
to_buffer = tmp;
}
// wait for other threads to init
if (tinfo->opts->verbose) {
fprintf(stdout, "memload_generator <thread %d, pull %d>: running...\n", tinfo->tid, tinfo->pull);
}
tinfo->init_status.store(1);
uint64_t next_ts = topo_uptime_ns();
size_t cur_offset = 0;
uint64_t cur_ts = 0;
while (true) {
switch (tinfo->state->load()) {
case STATE_RUN:
// Pace transactions: copy one transaction_size chunk whenever
// the inter-arrival deadline has passed.
cur_ts = topo_uptime_ns();
if (cur_ts >= next_ts) {
// Wrap the cursor when the next chunk would run past the
// end of the buffer.
if (cur_offset + tinfo->opts->transaction_size >
tinfo->opts->buffer_size) {
cur_offset = 0;
}
// for (uint i = 0; i < tinfo->opts->transaction_size; i++) {
// ((char *)to_buffer)[cur_offset + i] = ((char *)from_buffer)[cur_offset + i];
// }
memcpy((char *)to_buffer + cur_offset,
(char *)from_buffer + cur_offset,
tinfo->opts->transaction_size);
tinfo->num_trans.fetch_add(1);
// A rate change re-anchors the pacing clock to "now".
if (tinfo->reset_ts.load(
std::memory_order_relaxed)) {
tinfo->reset_ts.store(false,
std::memory_order_relaxed);
next_ts = cur_ts;
}
next_ts += tinfo->ia_gen->generate() *
(double)S2NS;
cur_offset += tinfo->opts->transaction_size;
}
break;
case STATE_END:
goto end;
case STATE_RDY:
// Paused: keep the deadline current so resume starts cleanly.
next_ts = topo_uptime_ns();
break;
case STATE_INIT:
default:
break;
}
}
end:
if (tinfo->opts->verbose) {
fprintf(stdout, "memload_generator <thread %d>: exiting...\n",
tinfo->tid);
}
// Per-thread buffers are owned (and freed) by this thread; shared
// buffers are freed by the destructor.
if (!tinfo->opts->shared_buffer) {
nms_free_static(from_buffer, tinfo->opts->buffer_size);
nms_free_static(to_buffer, tinfo->opts->buffer_size);
}
return nullptr;
}
// Spawn one pinned worker per core in `threads`. Each worker copies
// transaction_size-byte chunks between a local buffer and a buffer on the
// first domain in `target_domain`; a set bit in `modes` makes that worker
// pull (remote -> local) instead of push. `*success` reports whether
// construction fully succeeded; on failure the object is left in
// STATE_INIT/STATE_END and must not be started.
memload_generator::memload_generator(cpuset_t *threads, cpuset_t * modes, cpuset_t *target_domain,
    struct memload_generator_options *opt, bool *success)
{
	*success = false;
	state.store(STATE_INIT);
	std::memcpy(&this->opts, opt, sizeof(memload_generator_options));
	int nextcore = CPU_FFS(threads) - 1;
	int target_domain_id = CPU_FFS(target_domain) - 1;
	int num_cores = CPU_COUNT(threads);
	if (target_domain_id < 0 || num_cores == 0) {
		return;
	}
	// Split the aggregate transaction rate evenly across workers.
	double thread_tps = (double)opt->trans_per_second / (double)num_cores;
	void *local_buffer = nullptr;
	void *target_buffer = nullptr;
	int tid = 0;
	if (opts.shared_buffer) {
		// One buffer pair shared by all workers, allocated up front.
		local_buffer = nms_alloc_static(topo_core_to_numa(nextcore),
		    opt->buffer_size);
		target_buffer = nms_alloc_static(target_domain_id,
		    opt->buffer_size);
		if (local_buffer == nullptr || target_buffer == nullptr) {
			*success = false;
			goto end;
		}
	}
	while (nextcore != -1) {
		auto info = new struct thread_info;
		cpuset_t cpuset;
		pthread_attr_t attr;
		info->ia_gen = createGenerator(opts.ia_dist);
		if (info->ia_gen == nullptr) {
			goto end;
		}
		info->ia_gen->set_lambda(thread_tps);
		info->init_status.store(0);
		info->state = &this->state;
		info->reset_ts.store(false, std::memory_order_relaxed);
		info->num_trans.store(0);
		info->opts = &this->opts;
		info->tid = tid;
		info->coreid = nextcore;
		info->target_dom = target_domain_id;
		info->from_buffer = local_buffer;
		info->to_buffer = target_buffer;
		info->pull = CPU_ISSET(nextcore, modes);
		// Pin the worker to its core before creation.
		CPU_ZERO(&cpuset);
		CPU_SET(nextcore, &cpuset);
		pthread_attr_init(&attr);
		pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &cpuset);
		pthread_create(&info->pthr, &attr, worker_thrd, info);
		if (opts.verbose) {
			fprintf(stdout,
			    "memload_generator: created thread %d on core %d target domain %d\n",
			    tid, nextcore, target_domain_id);
		}
		thr_infos.push_back(info);
		CPU_CLR(nextcore, threads);
		nextcore = CPU_FFS(threads) - 1;
		tid++;
	}
	// Wait for every worker to finish (or fail) its buffer setup.
	for (auto tinfo : thr_infos) {
		int status;
		while ((status = tinfo->init_status.load()) != 1) {
			if (status == -1) {
				state.store(STATE_END);
				*success = false;
				goto end;
			}
		}
	}
	state.store(STATE_RDY);
	*success = true;
end:
	if (opts.verbose) {
		fprintf(stdout,
		    "memload_generator: exiting constructor. Success: %d...\n",
		    // BUGFIX: report the outcome, not the pointer's
		    // truthiness — the original printed `success ? 1 : 0`,
		    // which was always 1.
		    *success ? 1 : 0);
	}
}
bool
memload_generator::start()
{
	// Transition RDY -> RUN; fails when the generator is not ready.
	if (this->state.load() != STATE_RDY)
		return false;
	this->state.store(memload_generator::STATE_RUN);
	return true;
}
bool
memload_generator::stop()
{
	// Transition RUN -> RDY (pause); fails when not currently running.
	if (this->state.load() != STATE_RUN)
		return false;
	this->state.store(memload_generator::STATE_RDY);
	return true;
}
bool
memload_generator::set_transactions(uint64_t tps)
{
	// Re-split the requested aggregate rate evenly across all workers and
	// ask each one to re-anchor its pacing clock. Fails before workers
	// are up (STATE_INIT) or after teardown (STATE_END).
	auto cur = this->state.load();
	if (cur == STATE_END || cur == STATE_INIT)
		return false;
	const double per_thread = (double)tps / (double)thr_infos.size();
	for (auto *tinfo : thr_infos) {
		tinfo->ia_gen->set_lambda(per_thread);
		tinfo->reset_ts.store(true, std::memory_order_relaxed);
	}
	return true;
}
uint64_t
memload_generator::get_transactions()
{
	// Sum the per-thread transaction counters.
	uint64_t total = 0;
	for (size_t idx = 0; idx < thr_infos.size(); idx++) {
		total += thr_infos.at(idx)->num_trans.load();
	}
	return total;
}
// Signal all workers to exit, join them, then release the shared buffer
// pair (per-thread buffers are freed by the workers themselves).
memload_generator::~memload_generator()
{
	// BUGFIX: initialize the buffer pointers — with an empty thr_infos
	// list the originals were read uninitialized below (UB) and could be
	// passed to nms_free_static.
	void *buf1 = nullptr;
	void *buf2 = nullptr;
	this->state.store(STATE_END);
	for (auto i : thr_infos) {
		// XXX: nms_free regions
		pthread_join(i->pthr, NULL);
		// In shared-buffer mode every thread_info holds the same pair,
		// so remembering the last one is sufficient.
		buf1 = i->from_buffer;
		buf2 = i->to_buffer;
		delete i;
	}
	if (opts.shared_buffer && buf1 != nullptr) {
		nms_free_static(buf1, opts.buffer_size);
		nms_free_static(buf2, opts.buffer_size);
	}
}

205
libnms/alloc.c Normal file
View File

@ -0,0 +1,205 @@
#include <pthread.h>
#include <sys/types.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>
#include <sys/thr.h>
#include <sys/mman.h>
#include <stdint.h>
#include <stdio.h>
#include <errno.h>
#include <stdatomic.h>
#include <string.h>
#include <assert.h>
#include <nms.h>
#define MAX_NUMA_DOMAINS (64)
#define MAX_REGIONS (64)
/* Each backing region is a fixed 1 GiB super-page-aligned mapping. */
#define REGION_SIZE (1024 * 1024 * 1024)
#define PAGE_SIZE (4096)
/* One contiguous backing mapping; `occupied` is the bump-allocator cursor
 * (kept page-aligned by nms_region_malloc). */
struct nms_region {
uintptr_t start_addr;
size_t size;
size_t occupied;
};
/* Global allocator state: per-NUMA-node region tables plus the lock that
 * serializes all allocations. region_sz[n] counts live regions on node n. */
struct nms_desc {
// alloc
pthread_mutex_t alloc_lock;
struct nms_region regions[MAX_NUMA_DOMAINS][MAX_REGIONS];
int region_sz[MAX_NUMA_DOMAINS];
};
/* 0 = uninitialized, 2 = initialization in progress, 1 = ready
 * (see nms_init). */
static _Atomic(int) initialized = 0;
static struct nms_desc g_desc;
void
nms_free_static(void * buf, size_t sz)
{
	/* Release a mapping previously obtained from nms_alloc_static(). */
	munmap(buf, sz);
}
/*
 * Map `sz` bytes of prefaulted anonymous memory placed on NUMA domain
 * `node_id`. Temporarily switches the calling thread's memory-domain
 * policy so first-touch lands on the requested node, then restores it.
 * Returns NULL on failure. Free with nms_free_static(buf, sz).
 */
void *
nms_alloc_static(int node_id, size_t sz)
{
	long tid;
	domainset_t orig_dom;
	int orig_policy;
	void * region;

	thr_self(&tid);
	DOMAINSET_ZERO(&orig_dom);

	/* Save the thread's current allocation strategy. */
	int ret = cpuset_getdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, &orig_policy);
	if (ret != 0) {
		fprintf(stderr, "libnms: cpuset_getdomain failed with %d\n", errno);
		return NULL;
	}

	/* Restrict page placement to the requested domain. */
	domainset_t tmp_domain;
	DOMAINSET_ZERO(&tmp_domain);
	DOMAINSET_SET(node_id, &tmp_domain);
	ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(tmp_domain), &tmp_domain, DOMAINSET_POLICY_ROUNDROBIN);
	if (ret != 0) {
		fprintf(stderr, "libnms: cpuset_setdomain failed with %d\n", errno);
		return NULL;
	}

	if ((region = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_ANON | MAP_ALIGNED_SUPER | MAP_NOCORE | MAP_PRIVATE | MAP_PREFAULT_READ, -1, 0)) == MAP_FAILED) {
		fprintf(stderr, "libnms: mmap failed with %d\n", errno);
		/* BUGFIX: restore the saved domain policy on this error path
		 * too; the original left the temporary policy in place. */
		(void)cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, orig_policy);
		return NULL;
	}

	/* Touch every page so it is faulted in (and therefore placed) now.
	 * BUGFIX: the original accumulated into an uninitialized `sum`
	 * (undefined behavior); writing each byte is sufficient to fault
	 * the pages and matches the original's final memory contents. */
	for (size_t i = 0; i < sz; i++) {
		*(uint8_t *)((char *)region + i) = (uint8_t)i;
	}

	/* Restore the thread's original allocation strategy. */
	ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, orig_policy);
	if (ret != 0) {
		fprintf(stderr, "libnms: cpuset_setdomain failed with %d\n", errno);
		/* BUGFIX: unmap what we mapped (sz bytes, not REGION_SIZE). */
		munmap(region, sz);
		return NULL;
	}
	return region;
}
static int
nms_desc_init(struct nms_desc * desc, int verbose)
{
	/* Zero the whole descriptor and set up its allocation lock.
	 * `verbose` is currently unused. Always returns 0. */
	memset(desc, 0, sizeof(*desc));
	pthread_mutex_init(&desc->alloc_lock, NULL);
	return 0;
}
static void *
nms_region_malloc(struct nms_region * region, size_t size)
{
	/* Bump-allocate `size` bytes from `region`, or NULL if it does not
	 * fit. The cursor is rounded up to the next page boundary after each
	 * allocation, so every returned pointer is page-aligned. */
	if (region->occupied + size > region->size) {
		return NULL;
	}
	void *p = (void *)(region->start_addr + region->occupied);
	region->occupied += size;
	region->occupied = (region->occupied + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
	return p;
}
static int
nms_desc_add_region(struct nms_desc * desc, int nodeid, size_t size)
{
void * ret;
int idx;
ret = nms_alloc_static(nodeid, REGION_SIZE);
if (ret == NULL) {
fprintf(stderr, "libnms: failed to allocate region on node %d\n", nodeid);
return ENOMEM;
}
desc->region_sz[nodeid]++;
idx = desc->region_sz[nodeid] - 1;
desc->regions[nodeid][idx].start_addr = (uintptr_t)ret;
desc->regions[nodeid][idx].occupied = 0;
desc->regions[nodeid][idx].size = REGION_SIZE;
return 0;
}
static void *
nms_desc_malloc(struct nms_desc * desc, unsigned int nodeid, size_t size)
{
void * ret = NULL;
int idx;
int new_region = 0;
if (size > REGION_SIZE) {
return NULL;
}
pthread_mutex_lock(&desc->alloc_lock);
retry:
if (desc->region_sz[nodeid] > 0) {
idx = desc->region_sz[nodeid] - 1;
ret = nms_region_malloc(&desc->regions[nodeid][idx], size);
}
if (ret == NULL) {
// we need a new region
if (nms_desc_add_region(desc, nodeid, REGION_SIZE) != 0) {
pthread_mutex_unlock(&desc->alloc_lock);
return NULL;
}
fprintf(stdout, "libnms: malloc request of size %zu -> allocated new region on node %d\n", size, nodeid);
goto retry;
}
pthread_mutex_unlock(&desc->alloc_lock);
return ret;
}
/* Intentionally a no-op: the bump allocator never reclaims individual
 * allocations; region memory lives until process exit. */
static void
nms_desc_free(struct nms_desc * desc __attribute__((unused)), unsigned int node __attribute__((unused)), void * addr __attribute__((unused)))
{
// dummy function
}
/* One-time global initialization, safe to call concurrently: the winner of
 * the CAS (0 -> 2) initializes g_desc and publishes 1; every other caller
 * spins until that happens. Always returns 0. */
int
nms_init(int verbose)
{
int expected = 0;
if (atomic_compare_exchange_strong(&initialized, &expected, 2)) {
nms_desc_init(&g_desc, verbose);
atomic_store(&initialized, 1);
} else {
/* Busy-wait until the initializing thread publishes 1. */
while(atomic_load(&initialized) != 1) {
}
fprintf(stdout,"libnms: already initialized.\n");
}
return 0;
}
void *
nms_malloc(int nodeid, size_t sz)
{
	/* Public allocator entry point; nms_init() must have completed. */
	assert(atomic_load(&initialized) == 1);
	void *p = nms_desc_malloc(&g_desc, nodeid, sz);
	return p;
}
void
nms_free(int nodeid, void * addr)
{
	/* Public free entry point; currently forwards to a no-op because the
	 * bump allocator never reclaims. nms_init() must have completed. */
	assert(atomic_load(&initialized) == 1);
	nms_desc_free(&g_desc, nodeid, addr);
}

46
libntr/ntr.c Normal file
View File

@ -0,0 +1,46 @@
#include "ntr.h"
/* Per-dependency log levels. NOTE(review): only element 0 is initialized
 * to NTR_LEVEL_DEFAULT; the rest start at 0 — confirm that is intended. */
static int ntr_log_levels[NTR_DEP_MAX] = { NTR_LEVEL_DEFAULT };
/* Log sink; NULL until ntr_init() or ntr_set_output() is called. */
static FILE *ntr_out;
/* Point the log sink at stdout; must run before any ntr() call. */
void
ntr_init()
{
ntr_out = stdout;
}
/*
 * Log a printf-style message when `level` is at or below the configured
 * level for dependency `dep`. Out-of-range deps are silently ignored.
 */
void
ntr(int dep, int level, const char *fmt, ...)
{
	va_list vl;
	va_start(vl, fmt);
	/* BUGFIX: also reject negative deps; a negative index into
	 * ntr_log_levels was an out-of-bounds read. */
	if (dep >= 0 && dep < NTR_DEP_MAX && level <= ntr_log_levels[dep]) {
		vfprintf(ntr_out, fmt, vl);
	}
	va_end(vl);
}
/* Set the log level for dependency `dep`; out-of-range deps are ignored. */
void
ntr_set_level(int dep, int level)
{
	/* BUGFIX: also reject negative deps; a negative index into
	 * ntr_log_levels was an out-of-bounds write. */
	if (dep >= 0 && dep < NTR_DEP_MAX) {
		ntr_log_levels[dep] = level;
	}
}
void
ntr_set_output(FILE *f)
{
	/* Redirect log output; NULL is ignored so the sink stays valid. */
	if (f == NULL) {
		return;
	}
	ntr_out = f;
}
/* Return the current log level for `dep`, or 0 when out of range. */
int
ntr_get_level(int dep)
{
	/* BUGFIX: also reject negative deps; a negative index into
	 * ntr_log_levels was an out-of-bounds read. */
	if (dep >= 0 && dep < NTR_DEP_MAX) {
		return ntr_log_levels[dep];
	}
	return 0;
}

989
net/cat.cc Normal file
View File

@ -0,0 +1,989 @@
#include <atomic>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <random>
#include <vector>
#include <topo.h>
#include <rte_byteorder.h>
#include <rte_common.h>
#include <rte_config.h>
#include <rte_cycles.h>
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_ether.h>
#include <rte_launch.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include <unistd.h>
#include "ntr.h"
#include "gen.hh"
#include "net/netsup.hh"
#include "net/pkt.hh"
#include "nms.h"
// Packets drained/queued per rte_eth_rx_burst / rte_eth_tx_burst call.
constexpr static unsigned int BURST_SIZE = 32;
// Upper bound on the number of slave load generators.
constexpr static unsigned int MAX_SLAVES = 32;
// How long to wait for slaves before giving up (milliseconds).
constexpr static unsigned int SLAVES_MAX_WAIT_MS = 1000;
// One latency sample: software/hardware timestamps taken at the client
// (clt_*) and echoed back by the server (srv_*) for a single probe epoch.
struct datapt {
uint32_t epoch;
uint32_t valid;
uint64_t clt_hw_tx;
uint64_t clt_sw_tx;
uint64_t clt_hw_rx;
uint64_t clt_sw_rx;
uint64_t srv_hw_tx;
uint64_t srv_sw_tx;
uint64_t srv_hw_rx;
uint64_t srv_sw_rx;
};
struct options_t {
// parameters
unsigned int run_time { 5 };
unsigned int warmup_time { 3 };
char output[256] = "output.txt";
char ia_gen_str[256] = "fixed";
unsigned int target_qps { 0 };
unsigned int master_mode { 0 };
struct net_spec server_spec { };
cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 2nd core
std::vector<struct net_spec *> slaves;
uint32_t pkt_loss_failure_threshold { 0 };
uint32_t pkt_loss_time_ms { UINT32_MAX };
int portid { 0 };
// states
struct net_spec s_host_spec { };
struct conn_spec s_host_conn {
.src = &s_host_spec, .dst = &server_spec, .dst_port = POU_PORT
};
unsigned int s_rxqid { 0 };
unsigned int s_txqid { 0 };
unsigned int s_socketid { 0 };
// for qps calculation
std::atomic<uint32_t> s_recved_pkts { 0 };
std::atomic<uint32_t> s_pkt_loss { 0 };
std::atomic<uint64_t> s_start_time { 0 };
std::atomic<uint64_t> s_end_time { 0 };
std::atomic<uint32_t> s_slave_qps { 0 };
std::atomic<uint32_t> s_slave_recved { 0 };
std::atomic<uint32_t> s_slave_loss { 0 };
uint32_t s_state { STATE_WAIT };
bool s_hwtimestamp { true };
Generator *s_iagen { nullptr };
std::vector<struct datapt *> s_data;
struct datapt *s_last_datapt { nullptr };
uint32_t s_epoch { 0 };
std::atomic<bool> s_stop { false };
std::atomic<uint32_t> s_record { 0 };
};
static struct options_t options;
// RX callback: when a PROBE_RESP matching the outstanding epoch arrives,
// record its client-side software receive time (and hardware time when
// NIC timestamping is enabled) into s_last_datapt.
static uint16_t
rx_add_timestamp(uint16_t port, uint16_t qidx __rte_unused,
struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused,
void *_ __rte_unused)
{
uint64_t now = topo_uptime_ns();
struct pkt_hdr *pkt_data;
struct timespec ts { };
int ret;
// Only relevant while a probe is outstanding.
if (options.s_state != STATE_SENT) {
return nb_pkts;
}
for (int i = 0; i < nb_pkts; i++) {
pkt_data = check_valid_packet(pkts[i],
&options.s_host_spec.mac_addr);
if (pkt_data == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"rx_add_timestamp: ignoring invalid packet 0x%p.\n",
(void *)pkts[i]);
continue;
}
if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE_RESP) {
uint32_t epoch = rte_be_to_cpu_32(
((struct pkt_payload_epoch *)pkt_data->payload)
->epoch);
// Only the response to the most recent probe counts.
if (options.s_last_datapt != nullptr &&
options.s_last_datapt->epoch == epoch) {
if (options.s_hwtimestamp) {
if ((ret = rte_eth_timesync_read_rx_timestamp(
port, &ts, pkts[i]->timesync & 0x3)) ==
0) {
// has hw rx timestamp
options.s_last_datapt->clt_hw_rx =
ts.tv_sec * S2NS + ts.tv_nsec;
options.s_last_datapt->clt_sw_rx = now;
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"rx_add_timestamp: tagged packet %p with sw: %lu hw: %lu.\n",
(void *)pkts[i], now,
options.s_last_datapt->clt_hw_rx);
} else {
// HW timestamping was requested but the NIC did not
// deliver a timestamp: treat as fatal.
rte_exit(EXIT_FAILURE,
"rx_add_timestamp: packet %p not tagged - hw ts not "
"available - %d.\n",
(void *)pkts[i], ret);
}
} else {
options.s_last_datapt->clt_sw_rx = now;
options.s_last_datapt->clt_hw_rx = 0;
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"rx_add_timestamp: tagged packet %p with sw: %lu hw: (disabled).\n",
(void *)pkts[i], now);
}
} else {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
"rx_add_timestamp: packet %p epoch %d != last epoch %d.\n",
(void *)pkts[i], epoch,
options.s_last_datapt->epoch);
}
} else {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"rx_add_timestamp: packet %p not tagged - type %d.\n",
(void *)pkts[i], rte_be_to_cpu_16(pkt_data->type));
}
}
return nb_pkts;
}
/*
 * TX callback installed on the client's tx queue.
 *
 * Records the client software tx timestamp on the outstanding datapoint when
 * a PROBE packet passes through.  A PROBE whose epoch does not match the
 * in-flight datapoint indicates corrupted internal state and is fatal.
 * Never drops packets; always returns nb_pkts.
 */
static uint16_t
tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
{
	uint64_t now = topo_uptime_ns();
	struct pkt_hdr *pkt_data;
	for (int i = 0; i < nb_pkts; i++) {
		pkt_data = check_valid_packet(pkts[i],
		    &options.s_host_spec.mac_addr);
		if (pkt_data == nullptr) {
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "tx_add_timestamp: ignoring invalid packet 0x%p.\n",
			    (void *)pkts[i]);
			continue;
		}
		if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE) {
			uint32_t epoch = rte_be_to_cpu_32(
			    ((struct pkt_payload_epoch *)pkt_data->payload)
				->epoch);
			if (options.s_last_datapt == nullptr ||
			    epoch != options.s_last_datapt->epoch) {
				// BUGFIX: don't dereference s_last_datapt in
				// the message - it may be the nullptr that got
				// us here.  Report -1 for "no last epoch".
				rte_exit(EXIT_FAILURE,
				    "tx_add_timestamp: packet epoch %d != last epoch %d\n",
				    epoch,
				    options.s_last_datapt == nullptr ?
					-1 :
					(int)options.s_last_datapt->epoch);
			}
			options.s_last_datapt->clt_sw_tx = now;
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "tx_add_timestamp: tagged packet %p with sw: %lu.\n",
			    (void *)pkts[i], now);
		} else {
			// BUGFIX: convert from big endian before logging,
			// consistent with rx_add_timestamp
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "tx_add_timestamp: packet %p not tagged - type %d.\n",
			    (void *)pkts[i], rte_be_to_cpu_16(pkt_data->type));
		}
	}
	return nb_pkts;
}
// returns 0 on success
static void
send_all_slaves(uint16_t type)
{
struct rte_mbuf *tx_bufs[MAX_SLAVES];
//struct rte_eth_stats stats;
struct conn_spec cspec;
cspec.src = &options.s_host_spec;
cspec.dst_port = DEFAULT_RAT_PORT;
cspec.src_port = DEFAULT_RAT_PORT;
// send all clients SYNC
for (unsigned int i = 0; i < options.slaves.size(); i++) {
struct pkt_hdr *hdr;
cspec.dst = options.slaves.at(i);
if (alloc_pkt_hdr(mempool_get(options.s_socketid), type, &cspec, 0,
&tx_bufs[i], &hdr) != 0) {
rte_exit(EXIT_FAILURE, "failed to alloc packet\n");
}
}
// if (rte_eth_stats_get(options.portid, &stats) != 0 ) {
// rte_exit(EXIT_FAILURE, "failed!");
// }
// printf("send_all_slaves: ipackets %lu, opackets %lu, ierrors %lu, oerrors %lu\n", stats.ipackets, stats.opackets, stats.ierrors, stats.oerrors);
if (rte_eth_tx_burst(options.portid, options.s_txqid, tx_bufs,
options.slaves.size()) != options.slaves.size()) {
rte_exit(EXIT_FAILURE, "failed to send some packets\n");
}
}
// sizeof mbuf must >= MAX_SLAVES
// this function fills up to #slave
//
// Block until exactly one packet of type `etype` has been received from every
// registered slave, or rte_exit() after SLAVES_MAX_WAIT_MS.  Packets from
// unknown senders, duplicates, and other types are logged and freed.  When
// `out` is non-null the accepted mbufs are stored there (ownership passes to
// the caller, who must free them); otherwise they are freed here too.
static void
wait_for_slaves(uint16_t etype, struct rte_mbuf **out)
{
	struct rte_mbuf *tx_bufs[MAX_SLAVES];
	bool stop = false;
	const uint64_t start = topo_uptime_ns();
	// source mac addrs we have already accepted, for duplicate filtering
	std::vector<struct rte_ether_addr *> recved;
	uint32_t tot = 0; // number of mbufs handed to `out` so far
	while (!stop) {
		uint64_t now = topo_uptime_ns();
		const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
		    options.s_rxqid, tx_bufs, MAX_SLAVES);
		if (nb_rx > 0) {
			for (unsigned int i = 0; i < nb_rx; i++) {
				struct pkt_hdr *each = check_valid_packet(
				    tx_bufs[i], &options.s_host_spec.mac_addr);
				uint16_t type;
				if (each == nullptr) {
					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
					    "wait_for_slaves: ignoring invalid packet %p.\n",
					    (void *)tx_bufs[i]);
					goto end_loop; // free and move on
				}
				type = rte_be_to_cpu_16(each->type);
				if (type == etype) {
					bool invalid = true;
					// check if it is from one of our
					// clients
					for (auto eaddr : options.slaves) {
						if (rte_is_same_ether_addr(
							&eaddr->mac_addr,
							&each->eth_hdr
							    .src_addr)) {
							invalid = false;
							break;
						}
					}
					if (invalid) {
						// received invalid packet from
						// unregistered slave
						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_WARNING,
						    "wait_for_slaves: invalid packet %p from unregistered slave\n.",
						    tx_bufs[i]);
						goto end_loop;
					}
					invalid = false;
					// check if we have already received the
					// same packet from the mac addr
					for (auto eaddr : recved) {
						if (rte_is_same_ether_addr(
							eaddr,
							&each->eth_hdr
							    .src_addr)) {
							invalid = true;
							break;
						}
					}
					if (invalid) {
						// received invalid packet from
						// the same slave
						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_WARNING,
						    "wait_for_slaves: invalid packet %p - duplicated\n.",
						    tx_bufs[i]);
						goto end_loop;
					}
					// NOTE: this stores a pointer into the
					// mbuf's data; safe only because
					// accepted mbufs are either kept in
					// `out` or freed after the last use
					// of `recved` entries from this burst
					// - TODO confirm for the out==nullptr
					// path
					recved.push_back(
					    &each->eth_hdr.src_addr);
					if (recved.size() ==
					    options.slaves.size()) {
						stop = true; // got everyone
					}
					if (out != nullptr) {
						out[tot] = tx_bufs[i];
						tot++;
						// don't free this packet
						continue;
					}
				} else {
					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
					    "wait_for_slaves: ignoring invalid packet %p type %d.\n",
					    (void *)tx_bufs[i], type);
				}
			end_loop:
				rte_pktmbuf_free(tx_bufs[i]);
			}
		}
		// overall timeout check (now is refreshed every loop iteration)
		if (now - start > SLAVES_MAX_WAIT_MS * MS2NS) {
			rte_exit(EXIT_FAILURE,
			    "cat: waiting for too long %d. I QUIT!!", etype);
		}
	}
}
static void
pkt_loop()
{
struct rte_mbuf *tx_buf;
struct rte_mbuf *rx_bufs[BURST_SIZE];
struct pkt_hdr *pkt_data;
rdport_generator port_gen(MIN_RANDOM_PORT);
bool read_tx = true;
bool recv_stat = true;
bool recv_resp = true;
if (rte_eth_dev_socket_id(options.portid) > 0 &&
rte_eth_dev_socket_id(options.portid) != (int)rte_socket_id()) {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
"locore_main: WARNING, port %d is on remote NUMA node to "
"polling thread.\n\tPerformance will "
"not be optimal.\n",
options.portid);
}
uint64_t next_ts = topo_uptime_ns();
uint64_t last_send_ts = next_ts;
bool is_last_pkt_lost = false;
uint32_t num_cts_pkt_lost = 0;
while (!options.s_stop.load()) {
uint64_t now = topo_uptime_ns();
// always pop incoming packets
const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
options.s_rxqid, rx_bufs, BURST_SIZE);
if (nb_rx > 0) {
for (int i = 0; i < nb_rx; i++) {
if (options.s_state != STATE_SENT) {
// only need to process packets after we
// sent one
rte_pktmbuf_free(rx_bufs[i]);
continue;
}
struct pkt_hdr *each = check_valid_packet(
rx_bufs[i], &options.s_host_spec.mac_addr);
if (each == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: ignoring invalid packet %p.\n",
(void *)rx_bufs[i]);
rte_pktmbuf_free(rx_bufs[i]);
continue;
}
uint16_t type = rte_be_to_cpu_16(each->type);
NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, each,
"locore_main: received packet %p ", each);
struct pkt_payload_epoch *pld_epoch;
struct pkt_payload_stat *pld_stat;
uint32_t epoch;
switch (type) {
case PKT_TYPE_PROBE_RESP:
pld_epoch = (struct pkt_payload_epoch *)
each->payload;
epoch = rte_be_to_cpu_32(
pld_epoch->epoch);
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "lcore_main: PROBE_RESP received packet %p epoch %d\n", each, epoch);
if (options.s_last_datapt == nullptr ||
epoch !=
options.s_last_datapt->epoch) {
ntr(NTR_DEP_USER1,
NTR_LEVEL_WARNING,
"locore_main: packet %p epoch %d doesn't match datapt %d.\n",
(void *)rx_bufs[i], epoch,
options.s_last_datapt
->epoch);
break;
}
recv_resp = true;
break;
case PKT_TYPE_STAT:
pld_stat = (struct pkt_payload_stat *)
each->payload;
epoch = rte_be_to_cpu_32(
pld_stat->epoch);
if (options.s_last_datapt == nullptr ||
epoch !=
options.s_last_datapt->epoch) {
ntr(NTR_DEP_USER1,
NTR_LEVEL_WARNING,
"locore_main: packet %p epoch %d doesn't match datapt %d.\n",
(void *)rx_bufs[i], epoch,
options.s_last_datapt
->epoch);
break;
}
options.s_last_datapt->srv_hw_tx =
rte_be_to_cpu_64(pld_stat->hw_tx);
options.s_last_datapt->srv_hw_rx =
rte_be_to_cpu_64(pld_stat->hw_rx);
options.s_last_datapt->srv_sw_tx =
rte_be_to_cpu_64(pld_stat->sw_tx);
options.s_last_datapt->srv_sw_rx =
rte_be_to_cpu_64(pld_stat->sw_rx);
recv_stat = true;
is_last_pkt_lost = false;
break;
default:
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: ignoring packet %p with unknown type %d.\n",
(void *)rx_bufs[i], type);
}
rte_pktmbuf_free(rx_bufs[i]);
}
}
if (options.s_state == STATE_SENT) {
// check if hw tx ts is read
if (!read_tx) {
int ret;
struct timespec ts;
if (options.s_hwtimestamp) {
if ((ret = rte_eth_timesync_read_tx_timestamp(
options.portid, &ts)) == 0) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: read hw tx timestamp %lu.\n",
(ts.tv_nsec + ts.tv_sec * S2NS));
options.s_last_datapt->clt_hw_tx =
ts.tv_nsec + ts.tv_sec * S2NS;
read_tx = true;
}
} else {
options.s_last_datapt->clt_hw_tx = 0;
read_tx = true;
}
}
if (read_tx && recv_resp && recv_stat) {
options.s_state = STATE_COMPLETE;
} else {
// check packet loss
if (now - last_send_ts >
options.pkt_loss_time_ms * MS2NS) {
if (is_last_pkt_lost) {
num_cts_pkt_lost++;
} else {
is_last_pkt_lost = true;
num_cts_pkt_lost = 1;
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: packet loss: waiting too long for epoch %d. %d in a row.\n",
options.s_last_datapt->epoch,
num_cts_pkt_lost);
delete options.s_last_datapt;
options.s_last_datapt = nullptr;
options.s_state = STATE_PKTLOSS;
options.s_pkt_loss.fetch_add(1);
if (num_cts_pkt_lost >
options
.pkt_loss_failure_threshold) {
rte_exit(EXIT_FAILURE,
"too many continuous packet loss detected\n");
}
}
}
}
if (options.s_state == STATE_COMPLETE ||
options.s_state == STATE_PKTLOSS ||
options.s_state == STATE_WAIT) {
if (options.s_state == STATE_COMPLETE) {
options.s_data.push_back(options.s_last_datapt);
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: datapt for epoch %d dump:\n"
" Valid: %d\n"
" client TX HW: %lu\n"
" client TX SW: %lu\n"
" client RX HW: %lu\n"
" client RX SW: %lu\n"
" server TX HW: %lu\n"
" server TX SW: %lu\n"
" server RX HW: %lu\n"
" server RX SW: %lu\n\n",
options.s_last_datapt->epoch,
options.s_last_datapt->valid,
options.s_last_datapt->clt_hw_tx,
options.s_last_datapt->clt_sw_tx,
options.s_last_datapt->clt_hw_rx,
options.s_last_datapt->clt_sw_rx,
options.s_last_datapt->srv_hw_tx,
options.s_last_datapt->srv_sw_tx,
options.s_last_datapt->srv_hw_rx,
options.s_last_datapt->srv_sw_rx);
options.s_recved_pkts.fetch_add(1);
options.s_last_datapt = nullptr;
}
options.s_state = STATE_WAIT;
if (now >= next_ts) {
struct pkt_payload_epoch *pld_epoch;
uint32_t epoch;
next_ts += (int)(options.s_iagen->generate() *
S2NS);
options.s_host_conn.src_port = port_gen.next();
if (alloc_pkt_hdr(mempool_get(options.s_socketid),
PKT_TYPE_PROBE, &options.s_host_conn, 0,
&tx_buf, &pkt_data) != 0) {
rte_exit(EXIT_FAILURE,
"failed to alloc probe packet.\n");
}
epoch = options.s_epoch;
options.s_epoch++;
pld_epoch = (struct pkt_payload_epoch *)
pkt_data->payload;
pld_epoch->epoch = rte_cpu_to_be_32(epoch);
options.s_last_datapt = new struct datapt;
options.s_last_datapt->epoch = epoch;
options.s_last_datapt->valid =
options.s_record.load();
last_send_ts = now;
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: sending packet 0x%p with epoch %d\n",
(void *)tx_buf, epoch);
const uint16_t nb_tx =
rte_eth_tx_burst(options.portid,
options.s_txqid, &tx_buf, 1);
if (nb_tx != 1) {
rte_exit(EXIT_FAILURE,
"failed to send packet 0x%p, epoch %d\n",
(void *)tx_buf, epoch);
}
rte_pktmbuf_free(tx_buf);
read_tx = false;
recv_resp = false;
recv_stat = false;
options.s_state = STATE_SENT;
}
}
}
}
/*
 * Entry point for the polling lcore.
 * In master mode, synchronizes all slaves (SYNC/SYNC_ACK) before the run and
 * collects their qps/recv/loss counters (FIN/FIN_ACK) afterwards; the actual
 * measurement happens in pkt_loop().  Wall-clock start/end are recorded for
 * the qps computation in main().
 */
static int
locore_main(void *tif __rte_unused)
{
	struct rte_mbuf *mbufs[MAX_SLAVES];
	uint32_t core_id = rte_lcore_id();
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d running...\n",
	    core_id);
	if (options.master_mode == 1) {
		// barrier: make sure every slave is up before measuring
		ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
		    "locore_main: sending SYNC ...\n");
		send_all_slaves(PKT_TYPE_SYNC);
		ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
		    "locore_main: waiting for SYNC_ACK ...\n");
		wait_for_slaves(PKT_TYPE_SYNC_ACK, nullptr);
	}
	options.s_start_time.store(topo_uptime_ns());
	pkt_loop();
	options.s_end_time.store(topo_uptime_ns());
	if (options.master_mode == 1) {
		ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
		    "locore_main: sending FIN ...\n");
		send_all_slaves(PKT_TYPE_FIN);
		ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
		    "locore_main: waiting for FIN_ACK ...\n");
		wait_for_slaves(PKT_TYPE_FIN_ACK, mbufs);
		// aggregate slave QPS
		for (unsigned int i = 0; i < options.slaves.size(); i++) {
			// these packets already underwent validity check in
			// wait_for_slaves
			auto pkt_hdr = rte_pktmbuf_mtod(mbufs[i],
			    struct pkt_hdr *);
			auto pld_qps = (struct pkt_payload_qps *)
					   pkt_hdr->payload;
			uint32_t qps = rte_be_to_cpu_32(pld_qps->qps);
			uint32_t recved = rte_be_to_cpu_32(
			    pld_qps->recved_pkts);
			uint32_t loss = rte_be_to_cpu_32(pld_qps->lost_pkts);
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "locore_main: received qps %d from client %d\n",
			    qps, i);
			options.s_slave_qps.fetch_add(qps);
			options.s_slave_loss.fetch_add(loss);
			options.s_slave_recved.fetch_add(recved);
			// wait_for_slaves passed ownership of mbufs[i] to us
			rte_pktmbuf_free(mbufs[i]);
		}
	}
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: exited\n");
	return 0;
}
/*
 * Log the effective configuration (parameters only, not runtime state) at
 * INFO level, including one line per registered slave.
 */
static void
dump_options()
{
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "Configuration:\n"
	    "        verbosity = +%d\n"
	    "        run time = %d\n"
	    "        warmup time = %d\n"
	    "        output file = %s\n"
	    "        number of threads = %d\n"
	    "        interarrival dist = %s\n"
	    "        target qps = %d\n"
	    "        host IP = 0x%x\n"
	    "        pkt loss time = %u\n"
	    "        pkt loss failure threshold = %u\n"
	    "        portid = %d\n",
	    ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.run_time,
	    options.warmup_time, options.output, CPU_COUNT(&options.cpu_set),
	    options.ia_gen_str, options.target_qps, options.s_host_spec.ip,
	    options.pkt_loss_time_ms, options.pkt_loss_failure_threshold,
	    options.portid);
	for (auto slave : options.slaves) {
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
		    "        slave = 0x%x@%x:%x:%x:%x:%x:%x\n", slave->ip,
		    slave->mac_addr.addr_bytes[0],
		    slave->mac_addr.addr_bytes[1],
		    slave->mac_addr.addr_bytes[2],
		    slave->mac_addr.addr_bytes[3],
		    slave->mac_addr.addr_bytes[4],
		    slave->mac_addr.addr_bytes[5]);
	}
}
/*
 * Print command-line help to stdout.
 * BUGFIX: -p (dpdk port id) is accepted by getopt ("...p:") but was missing
 * from the help text; added below.
 */
static void
usage()
{
	fprintf(stdout,
	    "Usage:\n"
	    " -v(vv): verbose mode\n"
	    " -s: server net spec\n"
	    " -S: slave(rat)'s net spec (also turns on master mode)\n"
	    " -t: run time\n"
	    " -T: warmup time\n"
	    " -h: display the information\n"
	    " -o: output filename\n"
	    " -A: affinity mask\n"
	    " -i: inter-arrival time distribution\n"
	    " -q: target qps\n"
	    " -H: host net spec\n"
	    " -L: pkt loss failure threshold\n"
	    " -l: pkt loss time threshold\n"
	    " -p: dpdk port id\n");
}
/*
 * Program entry point: parses EAL + program arguments, initializes libtopo,
 * libnms and the DPDK port (with the rx/tx timestamp callbacks attached),
 * launches locore_main on the chosen core, acts as a coarse 1-second timer
 * for warmup/run phases, then computes qps and dumps all samples to the
 * output file as CSV.
 */
int
main(int argc, char *argv[])
{
	std::ofstream log_file;
	bool has_host_spec = false;
	ntr_init();
	// init dpdk (EAL consumes its own arguments first)
	int ret = rte_eal_init(argc, argv);
	if (ret < 0) {
		rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n");
	}
	argc -= ret;
	argv += ret;
	// set warning level
	ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
	{
		int c;
		// parse arguments
		struct net_spec *ns;
		while ((c = getopt(argc, argv, "vs:S:t:T:ho:A:i:q:H:L:l:p:")) !=
		    -1) {
			switch (c) {
			case 'v':
				// each -v raises verbosity one level
				ntr_set_level(NTR_DEP_USER1,
				    ntr_get_level(NTR_DEP_USER1) + 1);
				break;
			case 's':
				if (str_to_netspec(optarg,
					&options.server_spec) != 0) {
					rte_exit(EXIT_FAILURE,
					    "invalid server net spec.\n");
				}
				break;
			case 'S':
				// registering a slave implies master mode
				ns = new struct net_spec;
				if (str_to_netspec(optarg, ns) != 0) {
					rte_exit(EXIT_FAILURE,
					    "invalid client net spec\n");
				}
				options.slaves.push_back(ns);
				options.master_mode = 1;
				if (options.slaves.size() > MAX_SLAVES) {
					rte_exit(EXIT_FAILURE,
					    "too many rats.\n");
				}
				break;
			case 't':
				options.run_time = strtol(optarg, nullptr, 10);
				break;
			case 'T':
				options.warmup_time = strtol(optarg, nullptr,
				    10);
				break;
			case 'h':
				usage();
				rte_exit(EXIT_SUCCESS, "\n");
			case 'o':
				strncpy(options.output, optarg,
				    sizeof(options.output) - 1);
				break;
			case 'A':
				cpulist_to_cpuset(optarg, &options.cpu_set);
				break;
			case 'i':
				strncpy(options.ia_gen_str, optarg,
				    sizeof(options.ia_gen_str) - 1);
				break;
			case 'q':
				options.target_qps = strtoul(optarg, nullptr,
				    10);
				break;
			case 'H':
				has_host_spec = true;
				if (str_to_netspec(optarg,
					&options.s_host_spec) != 0) {
					rte_exit(EXIT_FAILURE,
					    "invalid host net spec.\n");
				}
				break;
			case 'L':
				options.pkt_loss_failure_threshold =
				    strtoul(optarg, nullptr, 10);
				break;
			case 'l':
				options.pkt_loss_time_ms = strtoul(optarg,
				    nullptr, 10);
				// 0 means "no timeout"
				if (options.pkt_loss_time_ms == 0) {
					options.pkt_loss_time_ms = UINT32_MAX;
				}
				break;
			case 'p':
				options.portid = strtol(optarg, nullptr, 10);
				break;
			default:
				usage();
				rte_exit(EXIT_FAILURE, "unknown argument: %c\n",
				    c);
			}
		}
	}
	if (!has_host_spec) {
		rte_exit(EXIT_FAILURE, "must specify host IP\n");
	}
	// init libtopo
	if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
	    0) {
		rte_exit(EXIT_FAILURE, "libtopo init failed!\n");
	}
	// init nms
	if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
		rte_exit(EXIT_FAILURE, "failed to init libnms!\n");
	}
	if (CPU_COUNT(&options.cpu_set) != 1) {
		rte_exit(EXIT_FAILURE, "must specify exactly one core\n");
	}
	int core_id = CPU_FFS(&options.cpu_set) - 1;
	dump_options();
	// configure memory and port
	struct port_conf pconf;
	struct device_conf dconf;
	struct mem_conf mconf;
	portconf_get(options.portid, &pconf);
	if (!pconf.timesync) {
		// fall back to software timestamps only
		options.s_hwtimestamp = false;
		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
		    "main: timesync disabled. hw timestamp unavailable.\n ");
	}
	// NOTE(review): this branch is unreachable - CPU_COUNT(...) == 1 is
	// guaranteed by the rte_exit above; consider removing one of the two
	// checks
	if (CPU_COUNT(&options.cpu_set) > 1) {
		int ffs = CPU_FFS(&options.cpu_set);
		CPU_ZERO(&options.cpu_set);
		CPU_SET(ffs - 1, &options.cpu_set);
		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "cat only supports one thread, using only core %d.\n", ffs - 1);
	}
	dconf.mtu = MAX_STANDARD_MTU;
	CPU_COPY(&options.cpu_set, &dconf.core_affinity);
	dconf.portid = options.portid;
	dconf.rss_hf = pconf.rss_hf;
	dconf.rx_offloads = pconf.rxoffload;
	dconf.tx_offloads = pconf.txoffload;
	dconf.timesync = pconf.timesync;
	// attach the timestamping callbacks to the rx/tx paths
	dconf.rx_fn = rx_add_timestamp;
	dconf.rx_user = nullptr;
	dconf.rx_ring_sz = 2048;
	dconf.tx_fn = tx_add_timestamp;
	dconf.tx_user = nullptr;
	dconf.tx_ring_sz = 2048;
	mconf.cache_size = 64;
	mconf.priv_size = 0;
	mconf.num_elements = 4096;
	mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_STANDARD_MTU;
	mconf.max_pools = -1;
	dpdk_init(&dconf, &mconf);
	if (rte_eth_macaddr_get(options.portid,
		&options.s_host_spec.mac_addr) != 0) {
		rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
		    options.portid);
	}
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n",
	    options.portid, options.s_host_spec.mac_addr.addr_bytes[0],
	    options.s_host_spec.mac_addr.addr_bytes[1],
	    options.s_host_spec.mac_addr.addr_bytes[2],
	    options.s_host_spec.mac_addr.addr_bytes[3],
	    options.s_host_spec.mac_addr.addr_bytes[4],
	    options.s_host_spec.mac_addr.addr_bytes[5]);
	// create default generator
	options.s_iagen = createGenerator(options.ia_gen_str);
	if (options.s_iagen == nullptr) {
		rte_exit(EXIT_FAILURE, "invalid generator string %s\n",
		    options.ia_gen_str);
	}
	options.s_iagen->set_lambda((double)options.target_qps);
	// open log file for writing
	log_file.open(options.output, std::ofstream::out);
	if (!log_file) {
		rte_exit(EXIT_FAILURE, "failed to open log file %s\n",
		    options.output);
	}
	sleep(INIT_DELAY);
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "main: launching thread on core %d\n", core_id);
	if (rte_eal_remote_launch(locore_main, nullptr, core_id) != 0) {
		rte_exit(EXIT_FAILURE, "failed to launch function on locore\n");
	}
	// XXX: poor man's timer
	// 1 Hz loop: after warmup_time seconds enable recording, after
	// warmup_time + run_time seconds signal the lcore to stop
	uint32_t second = 0;
	while (true) {
		if (second >= options.warmup_time) {
			options.s_record.store(1);
		}
		if (second >= options.run_time + options.warmup_time) {
			options.s_stop.store(true);
			break;
		}
		usleep(S2US);
		second++;
	}
	if (rte_eal_wait_lcore(core_id) < 0)
		rte_exit(EXIT_FAILURE, "failed to wait for job completion\n");
	// calculate QPS over the measured wall-clock interval
	uint32_t qps = (double)options.s_recved_pkts.load() /
	    (((double)(options.s_end_time.load() -
		  options.s_start_time.load()) /
		(double)S2NS));
	qps += options.s_slave_qps.load();
	// dump stats: first a summary line, then one CSV line per valid sample
	log_file << qps << ',' << options.s_recved_pkts.load() << ','
		 << options.s_pkt_loss.load() << ','
		 << options.s_slave_recved.load() << ','
		 << options.s_slave_loss.load() << std::endl;
	for (auto it : options.s_data) {
		if (it->valid) {
			log_file << it->clt_sw_rx << ',' << it->clt_sw_tx << ','
				 << it->clt_hw_rx << ',' << it->clt_hw_tx << ','
				 << it->srv_sw_rx << ',' << it->srv_sw_tx << ','
				 << it->srv_hw_rx << ',' << it->srv_hw_tx
				 << std::endl;
		}
		delete it;
	}
	log_file.close();
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "qps = %d, recved = %d, loss = %d, slave recved = %d, slave loss = %d\n",
	    qps, options.s_recved_pkts.load(), options.s_pkt_loss.load(),
	    options.s_slave_recved.load(), options.s_slave_loss.load());
	// clean up
	dpdk_cleanup(&dconf);
	return 0;
}

701
net/khat.cc Normal file
View File

@ -0,0 +1,701 @@
#include <atomic>
#include <cassert>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <vector>
#include <unistd.h>
#include <sys/cpuset.h>
#include <sys/endian.h>
#include <sys/sched.h>
#include <sys/types.h>
#include <topo.h>
#include <rte_common.h>
#include <rte_config.h>
#include <rte_cycles.h>
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_ether.h>
#include <rte_launch.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include "ntr.h"
//#include "gen.hh"
#include "net/netsup.hh"
#include "net/pkt.hh"
#include "nms.h"
#include "rte_byteorder.h"
constexpr static unsigned int BURST_SIZE = 32;	// max packets per rx/tx burst
constexpr static unsigned int CACHELINE_SIZE = 64;	// bytes; stride used by the memory load generator
constexpr static uint16_t THREAD_LOAD_BUFFER_SZ = 16384;	// per-thread scratch buffer for LOAD payloads
// bookkeeping for the single in-flight probe (see the note below on why
// there can only be one)
struct probe_state_t {
	struct net_spec dst; // the probing client
	struct conn_spec cspec {
		.dst = &dst
	};
	uint64_t last_sw_rx; // sw timestamp when the PROBE arrived
	uint64_t last_sw_tx; // sw timestamp when the PROBE_RESP left
	uint64_t last_hw_rx; // NIC rx timestamp of the PROBE (0 if disabled)
	uint32_t epoch;      // epoch copied from the PROBE payload
};
// Keep track of the probe state.
// When a probe packet first arrives, the state is marked in-flux and the
// rte_mbuf's userdata is set to PROBE_MAGIC, which prevents other probe
// packets from being processed.  The in-flux marker is released once the
// server has sent the probe stats back to the client.  This guarantees that
// the server processes only one probe packet at a time.
// XXX: this state could instead be attached to the mbuf itself and handled by
// the lcore thread.  It is kept global because only one probe request can be
// pending at any moment, and rx_add_timestamp also records its data here.
// per-worker-thread state: queue assignments plus the buffers used by the
// load generators
struct thread_info {
	int tid;      // logical thread index (0-based)
	int rxqid;    // rx queue owned by this thread
	int txqid;    // tx queue owned by this thread
	int lcore_id; // DPDK lcore the thread runs on
	int node_id;  // NUMA node, used to pick the mempool
	void *cache_lines; // memory-load target region (thread_cacheline_cnt lines)
	void *load_buffer; // scratch sink for LOAD payloads and memory-load reads
};
// khat (server) global configuration and shared state
struct options_t {
	// config
	int num_threads { 1 };
	cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 2nd core
	bool jumbo_frame_enabled {
		false
	}; // setting this to true changes mbuf size and mtu
	int port_mtu { MAX_STANDARD_MTU };
	int thread_cacheline_cnt = { 1600 }; // 1600 x 64B = 100KB of load-target data per thread
	uint16_t portid { 0 };
	// states
	struct net_spec s_host_spec { };
	std::vector<struct thread_info *> s_thr_info;
	int probe_state_offset { 0 }; // rte_mbuf dynfield offset for the probe-valid flag
	bool s_hwtimestamp { true };  // false when the port has no timesync support
	struct probe_state_t s_probe_info; // data for the one in-flight probe
	std::atomic<bool> is_probing { false }; // in-flux marker; see comment above
};
struct options_t options;
// read the per-mbuf "this PROBE won the in-flux race" flag stored in the
// registered dynamic field
static bool
mbuf_is_probe_valid(struct rte_mbuf *pkt)
{
	return *RTE_MBUF_DYNFIELD(pkt, options.probe_state_offset, bool *);
}
// write the per-mbuf probe-valid flag (set by rx_add_timestamp, consumed by
// locore_main)
static void
mbuf_set_probe_valid(struct rte_mbuf *pkt, bool b)
{
	*RTE_MBUF_DYNFIELD(pkt, options.probe_state_offset, bool *) = b;
}
/*
 * Server rx callback.  For each PROBE packet, try to atomically claim the
 * global probe slot (is_probing false -> true); the winner gets its rx
 * timestamps recorded in s_probe_info and the mbuf's probe-valid dynfield set
 * so locore_main will answer it.  Losers, and probes whose hw timestamp
 * cannot be read, are left unmarked.  Never drops packets.
 */
static uint16_t
rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused,
    void *_ __rte_unused)
{
	int rc = 0;
	uint64_t now = topo_uptime_ns();
	struct timespec ts { };
	struct pkt_hdr *pkt_data;
	for (int i = 0; i < nb_pkts; i++) {
		pkt_data = check_valid_packet(pkts[i],
		    &options.s_host_spec.mac_addr);
		if (pkt_data == nullptr) {
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "rx_add_timestamp: ignoring invalid packet %p.\n",
			    (void *)pkts[i]);
			continue;
		}
		if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE) {
			bool cmp = false;
			// default to invalid until we win the race below
			mbuf_set_probe_valid(pkts[i], false);
			if (options.is_probing.compare_exchange_strong(cmp,
				true)) {
				options.s_probe_info.last_sw_rx = now;
				if (options.s_hwtimestamp) {
					if ((rc = rte_eth_timesync_read_rx_timestamp(
						 port, &ts,
						 pkts[i]->timesync & 0x3)) ==
					    0) {
						options.s_probe_info
						    .last_hw_rx = ts.tv_nsec +
						    ts.tv_sec * S2NS;
						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_DEBUG,
						    "rx_add_timestamp: tagged packet %p with sw rx: %lu hw rx:%lu.\n",
						    (void *)pkts[i],
						    options.s_probe_info
							.last_sw_rx,
						    options.s_probe_info
							.last_hw_rx);
						mbuf_set_probe_valid(pkts[i],
						    true);
					} else {
						// no hw timestamp - release
						// the slot so a later probe
						// can claim it
						options.is_probing.store(false);
						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_WARNING,
						    "rx_add_timestamp: packet %p not tagged - failed to read hw rx timestamp: %d.\n",
						    (void *)pkts[i], rc);
					}
				} else {
					// sw-only mode: sw rx timestamp suffices
					mbuf_set_probe_valid(pkts[i], true);
					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
					    "rx_add_timestamp: tagged packet %p with sw rx only: %lu.\n",
					    (void *)pkts[i], now);
				}
			} else {
				// another probe is already in flight
				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
				    "rx_add_timestamp: packet %p not tagged - server is probing.\n",
				    (void *)pkts[i]);
			}
		} else {
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "rx_add_timestamp: packet %p not tagged - not PROBE packet: type %d.\n",
			    (void *)pkts[i], rte_be_to_cpu_16(pkt_data->type));
		}
	}
	return nb_pkts;
}
/*
 * Server tx callback.  Records the software tx timestamp on the probe state
 * when the PROBE_RESP for the in-flight probe passes through.  Never drops
 * packets; always returns nb_pkts.
 */
static uint16_t
tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
{
	uint64_t now = topo_uptime_ns();
	struct pkt_hdr *pkt_data;
	for (int i = 0; i < nb_pkts; i++) {
		pkt_data = check_valid_packet(pkts[i],
		    &options.s_host_spec.mac_addr);
		if (pkt_data == nullptr) {
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "tx_add_timestamp: ignoring invalid packet %p.\n",
			    (void *)pkts[i]);
			continue;
		}
		if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE_RESP) {
			// this packet is the response to PROBE packets
			// at this time the packet is not sent to the NIC yet so
			// the state must be waiting stats
			assert(options.is_probing.load() &&
			    mbuf_is_probe_valid(pkts[i]));
			options.s_probe_info.last_sw_tx = now;
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "tx_add_timestamp: tagged packet %p with sw tx %lu\n",
			    (void *)pkts[i], options.s_probe_info.last_sw_tx);
		} else {
			// BUGFIX: convert from big endian before logging,
			// consistent with rx_add_timestamp (the raw value was
			// byte-swapped on little-endian hosts)
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "tx_add_timestamp: packet %p not tagged - type %d\n",
			    (void *)pkts[i], rte_be_to_cpu_16(pkt_data->type));
		}
	}
	return nb_pkts;
}
/*
 * CPU load generator: busy-spin for `us` microseconds using
 * topo_uptime_ns() as the time base.  Never sleeps or yields.
 */
static void
worker_cpu_load(unsigned long us)
{
	const uint64_t deadline = topo_uptime_ns() + us * 1000;
	while (topo_uptime_ns() < deadline) {
		// burn cycles until the deadline passes
	}
}
/*
 * Memory load generator: starting from global cache-line index `which`
 * (wrapped over all threads' buffers), read `load` consecutive cache lines
 * from the owning thread's region, sinking each value into this thread's
 * load buffer so the reads cannot be optimized away.
 */
static void
worker_memory_load(int tid, uint32_t which, uint32_t load)
{
	const uint32_t per_thread = options.thread_cacheline_cnt;
	// map `which` onto a (target thread, line within thread) pair
	uint32_t global_line = which % (per_thread * options.s_thr_info.size());
	struct thread_info *self = options.s_thr_info.at(tid);
	struct thread_info *target = options.s_thr_info.at(global_line / per_thread);
	uint32_t first_line = global_line % per_thread;
	for (uint32_t n = 0; n < load; n++) {
		const char *src = (char *)target->cache_lines +
		    ((first_line + n) % per_thread) * CACHELINE_SIZE;
		*(uint32_t *)self->load_buffer = *(const uint32_t *)src;
	}
}
/*
 * Per-thread server loop (never returns).  Polls this thread's rx queue,
 * answers PROBE packets with PROBE_RESP and LOAD packets with LOAD_RESP
 * (after running the requested cpu/memory load), and - once the PROBE_RESP's
 * hw tx timestamp becomes available - sends the STAT packet carrying all
 * four server-side timestamps back to the prober and releases the probe slot.
 */
static int
locore_main(void *ti)
{
	auto tinfo = (struct thread_info *)ti;
	struct rte_mbuf *bufs[BURST_SIZE];
	// replies queued for the end-of-burst tx; at most one per received
	// packet, so BURST_SIZE entries suffice (the STAT packet is sent
	// separately below)
	struct rte_mbuf *tx_bufs[BURST_SIZE];
	struct pkt_hdr *pkt_data;
	// XXX: hack hardcode to be larger than MTU
	bool pending_probe = false; // PROBE_RESP sent, STAT not yet sent
	if (rte_eth_dev_socket_id(options.portid) > 0 &&
	    rte_eth_dev_socket_id(options.portid) != (int)rte_socket_id()) {
		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
		    "locore_main <thread %d>: WARNING, port %d is on remote NUMA node to "
		    "polling thread.\n\tPerformance will "
		    "not be optimal.\n",
		    tinfo->tid, options.portid);
	}
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "locore_main <thread %d>: running on locore %d with txqid %d and rxqid %d.\n",
	    tinfo->tid, rte_lcore_id(), tinfo->txqid, tinfo->rxqid);
	while (true) {
		uint16_t nb_tx = 0;
		const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
		    tinfo->rxqid, bufs, BURST_SIZE);
		struct rte_mbuf *pkt_buf;
		struct pkt_hdr *tx_data;
		for (int i = 0; i < nb_rx; i++) {
			// XXX: optimization: in rx_add_timestamp every packet
			// is already validated once can just mark valid packet
			// with a value so we can avoid this redundant check
			pkt_data = check_valid_packet(bufs[i],
			    &options.s_host_spec.mac_addr);
			if (pkt_data == nullptr) {
				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
				    "locore_main <thread %d>: skipping invalid packet %p.\n",
				    tinfo->tid, (void *)bufs[i]);
				// dump_pkt(bufs[i]);
				rte_pktmbuf_free(bufs[i]);
				continue;
			}
			NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, pkt_data,
			    "locore_main <thread %d>: received packet ", tinfo->tid);
			switch (rte_be_to_cpu_16(pkt_data->type)) {
			case PKT_TYPE_PROBE: {
				// only answer probes that won the in-flux race
				// in rx_add_timestamp
				if (mbuf_is_probe_valid(bufs[i])) {
					// send back probe_resp pkt to probe for
					// return latency
					pending_probe = true;
					// book keep probe results
					options.s_probe_info.epoch =
					    rte_be_to_cpu_32(
						((struct pkt_payload_epoch *)
							pkt_data->payload)
						    ->epoch);
					pkt_hdr_to_netspec(pkt_data,
					    &options.s_probe_info.dst,
					    &options.s_probe_info.cspec
						 .dst_port,
					    nullptr,
					    &options.s_probe_info.cspec
						 .src_port);
					options.s_probe_info.cspec.src =
					    &options.s_host_spec;
					if (alloc_pkt_hdr(mempool_get(
							      tinfo->node_id),
						PKT_TYPE_PROBE_RESP,
						&options.s_probe_info.cspec, 0,
						&pkt_buf, &tx_data) != 0) {
						rte_exit(EXIT_FAILURE,
						    "failed to allocate pkt\n");
					}
					rte_memcpy(tx_data->payload,
					    pkt_data->payload,
					    sizeof(struct pkt_payload_epoch));
					mbuf_set_probe_valid(pkt_buf, true);
					// queue for burst send
					NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, tx_data,
					    "locore_main <thread %d>: sending packet ", tinfo->tid);
					tx_bufs[nb_tx++] = pkt_buf;
				}
				break;
			}
			case PKT_TYPE_LOAD: {
				struct conn_spec cspec;
				struct net_spec src;
				struct net_spec dst;
				// touch the unused data to pretend that we read
				// those dummy fields
				memcpy(tinfo->load_buffer, pkt_data->payload,
				    MIN(bufs[i]->data_len -
					    sizeof(struct pkt_hdr),
					THREAD_LOAD_BUFFER_SZ));
				// perform the load
				auto pld = (struct pkt_payload_load *)
					       pkt_data->payload;
				uint32_t load_type = rte_be_to_cpu_32(pld->type);
				uint32_t load_arg0 = rte_be_to_cpu_32(pld->arg0);
				uint32_t load_arg1 = rte_be_to_cpu_32(pld->arg1);
				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
				    "locore_main <thread %d>: LOAD type %d, arg0 %d, arg1 %d\n",
				    tinfo->tid, load_type, load_arg0, load_arg1);
				if (load_type == LOAD_TYPE_CPU) {
					// arg0 = microseconds to spin
					worker_cpu_load(load_arg0);
				} else if (load_type == LOAD_TYPE_MEM) {
					// arg0 = starting cache line,
					// arg1 = number of lines to touch
					worker_memory_load(tinfo->tid, load_arg0, load_arg1);
				} else {
					ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
					    "locore_main <thread %d>: unknown LOAD type %d, ignoring...", tinfo->tid, load_type);
					break;
				}
				// reply
				pkt_hdr_to_netspec(pkt_data, &src,
				    &cspec.dst_port, &dst, &cspec.src_port);
				cspec.dst = &src;
				cspec.src = &dst;
				// printf("LOAD PKT SIZE: %d\n",
				// bufs[i]->data_len); we reply to load packet
				// regardless of the server state
				if (alloc_pkt_hdr(mempool_get(tinfo->node_id),
					PKT_TYPE_LOAD_RESP, &cspec, 0, &pkt_buf,
					&tx_data) != 0) {
					rte_exit(EXIT_FAILURE,
					    "failed to allocate pkt\n");
				}
				rte_memcpy(tx_data->payload, pkt_data->payload,
				    sizeof(struct pkt_payload_load));
				// queue for burst send
				NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, tx_data,
				    "locore_main <thread %d>: sending packet ", tinfo->tid);
				tx_bufs[nb_tx++] = pkt_buf;
				break;
			}
			default:
				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
				    "locore_main <thread %d>: ignoring packet %p with unknown type %d.\n",
				    tinfo->tid, (void *)bufs[i],
				    rte_be_to_cpu_16(pkt_data->type));
				break;
			}
			rte_pktmbuf_free(bufs[i]);
		}
		// send all packets
		tx_burst_all(options.portid, tinfo->txqid, tx_bufs, nb_tx);
		// we wanna check every loop not only when there are packets
		if (pending_probe) {
			assert(options.is_probing.load());
			struct timespec ts { };
			struct pkt_payload_stat *stat;
			int status = 0;
			if (options.s_hwtimestamp) {
				// the hw tx timestamp of the PROBE_RESP may
				// not be ready yet; retry on later iterations
				if ((status = rte_eth_timesync_read_tx_timestamp(
					 options.portid, &ts)) == 0) {
					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
					    "locore_main <thread %d>: obtained hw tx timestamp %lu.\n",
					    tinfo->tid,
					    (ts.tv_sec * S2NS + ts.tv_nsec));
				} else {
					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
					    "locore_main <thread %d>: failed to obtain hw tx timestamp: %d.\n",
					    tinfo->tid, status);
				}
			}
			if (status == 0) {
				// now we have everything we need
				if (alloc_pkt_hdr(mempool_get(tinfo->node_id),
					PKT_TYPE_STAT, &options.s_probe_info.cspec, 0,
					&pkt_buf, &tx_data) != 0) {
					rte_exit(EXIT_FAILURE,
					    "failed to alloc pkt_buf\n");
				}
				// populate stats
				stat = (struct pkt_payload_stat *)tx_data->payload;
				stat->epoch = rte_cpu_to_be_32(
				    options.s_probe_info.epoch);
				if (options.s_hwtimestamp) {
					stat->hw_rx = rte_cpu_to_be_64(
					    options.s_probe_info.last_hw_rx);
					stat->hw_tx = rte_cpu_to_be_64(
					    ts.tv_nsec + ts.tv_sec * S2NS);
				} else {
					stat->hw_rx = 0;
					stat->hw_tx = 0;
				}
				stat->sw_rx = rte_cpu_to_be_64(
				    options.s_probe_info.last_sw_rx);
				stat->sw_tx = rte_cpu_to_be_64(
				    options.s_probe_info.last_sw_tx);
				// send the packet
				tx_burst_all(options.portid, tinfo->txqid, &pkt_buf, 1);
				// release flux
				pending_probe = false;
				options.is_probing.store(false);
			}
		}
	}
}
// Print the command-line help for khat to stdout and flush it.
// NOTE(fix): only the options actually accepted by main()'s getopt string
// "hvA:H:Jp:" are listed. The stale memory-load-generator entries
// (-m/-b/-x/-X/-S) were removed because main() rejects them as unknown
// arguments, which made the old help text actively misleading.
static void
usage()
{
	fprintf(stdout,
	    "Usage:\n"
	    "    -v(vv): verbose mode\n"
	    "    -h: seek help\n"
	    "    -A: cpu list for worker threads\n"
	    "    -H: host spec\n"
	    "    -J: enable jumbo frames\n"
	    "    -p: port id\n");
	fflush(stdout);
}
// Log the effective khat configuration at INFO level.
// Called once from main() after argument parsing and library init.
static void
dump_options()
{
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "main: khat configuration:\n"
	    "    verbosity: +%d\n"
	    "    thread count: %d\n"
	    "    ip: 0x%x\n"
	    "    jumbo frame: %d\n"
	    "    port id: %d\n",
	    // verbosity is stored as an offset above the default WARNING level
	    ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING,
	    options.num_threads, options.s_host_spec.ip,
	    options.jumbo_frame_enabled, options.portid);
}
// khat entry point: parse EAL + program arguments, initialize libtopo/libnms,
// register the probe dynamic mbuf field, configure the NIC and mempools via
// dpdk_init(), allocate NUMA-local per-thread state, and launch one worker
// (locore_main) per CPU in -A. The process then idles forever; the cleanup
// code after the idle loop is intentionally unreachable.
int
main(int argc, char *argv[])
{
	bool has_host_spec { false };
	struct mem_conf mconf;
	struct device_conf dconf;
	ntr_init();
	// init dpdk
	int ret = rte_eal_init(argc, argv);
	if (ret < 0) {
		rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n");
	}
	argc -= ret;
	argv += ret;
	// set warning level
	ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
	{
		int c;
		// parse arguments
		while ((c = getopt(argc, argv, "hvA:H:Jp:")) != -1) {
			switch (c) {
			case 'v':
				// each -v raises verbosity by one level
				ntr_set_level(NTR_DEP_USER1,
				    ntr_get_level(NTR_DEP_USER1) + 1);
				break;
			case 'h':
				usage();
				rte_exit(EXIT_SUCCESS, "\n");
			case 'A':
				cpulist_to_cpuset(optarg, &options.cpu_set);
				options.num_threads = CPU_COUNT(
				    &options.cpu_set);
				if (options.num_threads == 0) {
					rte_exit(EXIT_FAILURE,
					    "must run at least one thread\n");
				}
				break;
			case 'H':
				if (str_to_netspec(optarg,
					&options.s_host_spec) != 0) {
					rte_exit(EXIT_FAILURE,
					    "invalid host spec\n");
				}
				has_host_spec = true;
				break;
			case 'J':
				options.jumbo_frame_enabled = true;
				options.port_mtu = MAX_JUMBO_MTU;
				break;
			case 'p':
				options.portid = atoi(optarg);
				break;
			default:
				usage();
				// BUGFIX: an unknown argument is an error, not
				// success (rat.cc already exits EXIT_FAILURE
				// here); also terminate the message with \n.
				rte_exit(EXIT_FAILURE,
				    "unknown argument: %c\n", c);
			}
		}
	}
	if (!has_host_spec) {
		rte_exit(EXIT_FAILURE, "Must specify host spec\n");
	}
	// init libtopo
	if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
	    0) {
		rte_exit(EXIT_FAILURE, "libtopo init failed!\n");
	}
	// init libnms
	if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
		rte_exit(EXIT_FAILURE, "libnms init failed!\n");
	}
	dump_options();
	// register the per-mbuf "probe valid" dynamic field used by the
	// rx/tx timestamp callbacks
	struct rte_mbuf_dynfield rte_mbuf_dynfield_probe_flag = {
		.name = "rte_mbuf_dynfield_probe_valid",
		.size = sizeof(bool),
		.align = __alignof__(uint32_t),
		.flags = 0
	};
	options.probe_state_offset = rte_mbuf_dynfield_register(
	    &rte_mbuf_dynfield_probe_flag);
	if (options.probe_state_offset == -1) {
		rte_exit(EXIT_FAILURE, "failed to register dynamic field: %d\n",
		    rte_errno);
	}
	// configure memory and port
	struct port_conf pconf;
	portconf_get(options.portid, &pconf);
	if (!pconf.timesync) {
		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
		    "main: timesync disabled. hw timestamp unavailable.\n ");
		options.s_hwtimestamp = false;
	}
	dconf.mtu = options.port_mtu;
	CPU_COPY(&options.cpu_set, &dconf.core_affinity);
	dconf.portid = options.portid;
	dconf.rss_hf = pconf.rss_hf;
	dconf.rx_offloads = pconf.rxoffload;
	dconf.tx_offloads = pconf.txoffload;
	dconf.timesync = pconf.timesync;
	dconf.rx_fn = rx_add_timestamp;
	dconf.rx_user = nullptr;
	dconf.rx_ring_sz = 2048;
	dconf.tx_fn = tx_add_timestamp;
	dconf.tx_user = nullptr;
	dconf.tx_ring_sz = 2048;
	mconf.cache_size = 512;
	mconf.priv_size = 0;
	// scale pool size with ring sizes and cores, split across sockets
	mconf.num_elements = (dconf.rx_ring_sz + dconf.tx_ring_sz) *
	    rte_lcore_count() / rte_socket_count();
	// leave headroom so jumbo frames fit in a single mbuf
	mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_JUMBO_MTU -
	    MAX_STANDARD_MTU;
	mconf.max_pools = -1;
	dpdk_init(&dconf, &mconf);
	if (rte_eth_macaddr_get(options.portid,
		&options.s_host_spec.mac_addr) != 0) {
		rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
		    options.portid);
	}
	// init threads: allocate each thread's state on its own NUMA node
	uint32_t cpu_idx = CPU_FFS(&options.cpu_set);
	uint32_t tid = 0;
	while (cpu_idx != 0) {
		uint32_t lcore_id = cpu_idx - 1;
		uint32_t node_id = rte_lcore_to_socket_id(lcore_id);
		auto *tinfo = (struct thread_info *)nms_malloc(node_id,
		    sizeof(struct thread_info));
		tinfo->cache_lines = nms_malloc(node_id,
		    CACHELINE_SIZE * options.thread_cacheline_cnt);
		tinfo->load_buffer = nms_malloc(node_id,
		    THREAD_LOAD_BUFFER_SZ);
		tinfo->tid = tid;
		tinfo->lcore_id = lcore_id;
		tinfo->node_id = node_id;
		// queue ids are assigned 1:1 with thread ids
		tinfo->rxqid = tid;
		tinfo->txqid = tid;
		options.s_thr_info.push_back(tinfo);
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
		    "main: thread %d assigned to cpu %d, node %d\n", tinfo->tid,
		    tinfo->lcore_id, topo_core_to_numa(lcore_id));
		tid++;
		CPU_CLR(cpu_idx - 1, &options.cpu_set);
		cpu_idx = CPU_FFS(&options.cpu_set);
	}
	sleep(INIT_DELAY);
	for (int i = 0; i < options.num_threads; i++) {
		struct thread_info *tinfo = options.s_thr_info.at(i);
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
		    "main: launching thread %d on locore %d\n", tinfo->tid,
		    tinfo->lcore_id);
		if (rte_eal_remote_launch(locore_main,
			(void *)options.s_thr_info.at(i),
			tinfo->lcore_id) != 0) {
			rte_exit(EXIT_FAILURE,
			    "failed to launch function on locore %d\n",
			    tinfo->lcore_id);
		}
	}
	// idle forever; workers do all the processing
	while (true) {
		usleep(S2US);
	}
	// shouldn't get here
	// clean up
	for (int i = 0; i < options.num_threads; i++) {
		struct thread_info *tinfo = options.s_thr_info.at(i);
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
		    "main: waiting for locore %d...\n", tinfo->lcore_id);
		if (rte_eal_wait_lcore(tinfo->lcore_id) != 0) {
			rte_exit(EXIT_FAILURE, "failed to wait for locore %d\n",
			    tinfo->lcore_id);
		}
	}
	dpdk_cleanup(&dconf);
	return 0;
}

204
net/libnetsup/dpdk.cc Normal file
View File

@ -0,0 +1,204 @@
#include "net/netsup.hh"
#include <cstdlib>
#include "rte_build_config.h"
#include "rte_common.h"
#include "rte_config.h"
#include "rte_ether.h"
#include "rte_lcore.h"
#include "rte_mempool.h"
#include "rte_mbuf.h"
#include "rte_errno.h"
#include "rte_ethdev.h"
#include "ntr.h"
static struct rte_mempool *g_mempools[MAX_NUMA_NODES] = {nullptr};
static unsigned int g_mempool_sz = 0;
// Create one pktmbuf pool per NUMA socket according to mconf and record it
// in the g_mempools table (indexed by socket id). Exits the process if any
// pool cannot be created.
static void
mempool_init(struct mem_conf *mconf)
{
	char name[64];
	const unsigned int nsockets = rte_socket_count();
	for (unsigned int node = 0; node < nsockets; node++) {
		// one mbuf pool per socket, named after its node id
		snprintf(name, sizeof(name), "net_mempool_%d", node);
		struct rte_mempool *pool = rte_pktmbuf_pool_create(name,
		    mconf->num_elements, mconf->cache_size, mconf->priv_size,
		    mconf->data_room_size, node);
		if (pool == nullptr) {
			rte_exit(EXIT_FAILURE, "cannot create mbuf pool: %d\n",
			    rte_errno);
		}
		g_mempools[node] = pool;
		g_mempool_sz++;
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
		    "mempool_init: created mempool for node %d\n", node);
	}
}
// Return the mbuf pool for NUMA node `nodeid`, or nullptr when the index is
// outside the range of pools created by mempool_init(). The unsigned cast
// also rejects negative node ids in a single comparison.
struct rte_mempool *
mempool_get(int nodeid)
{
	return ((unsigned int)nodeid < g_mempool_sz) ? g_mempools[nodeid]
						     : nullptr;
}
// Configure and start Ethernet port dconf->portid:
//   - one rx and one tx queue per core in dconf->core_affinity, each queue
//     placed on (and fed from the mempool of) that core's NUMA socket
//   - MTU, offloads and RSS hash functions taken from dconf
//   - optional timesync, promiscuous mode, and per-queue rx/tx callbacks
// Any failure terminates the process via rte_exit().
static void
port_init(struct device_conf *dconf)
{
	struct rte_ether_addr addr;
	struct rte_eth_dev_info dev_info {
	};
	struct rte_eth_conf port_conf;
	struct rte_eth_txconf txconf {
	};
	struct rte_eth_rxconf rxconf {
	};
	int ret;
	// one rx/tx queue pair is created per worker core
	int num_threads = CPU_COUNT(&dconf->core_affinity);
	if (rte_eth_dev_count_avail() == 0) {
		rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
	}
	if (!rte_eth_dev_is_valid_port(dconf->portid)) {
		rte_exit(EXIT_FAILURE, "cannot find port %d\n", dconf->portid);
	}
	if ((ret = rte_eth_macaddr_get(dconf->portid, &addr)) != 0) {
		rte_exit(EXIT_FAILURE, "cannot get mac address of port: %d\n", ret);
	}
	ret = rte_eth_dev_info_get(dconf->portid, &dev_info);
	if (ret != 0) {
		rte_exit(EXIT_FAILURE, "failed to get dev info: %d", ret);
	}
	memset(&port_conf, 0, sizeof(struct rte_eth_conf));
	port_conf.rxmode.mtu = dconf->mtu;
	port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_RSS;
	// nullptr key = use the driver's default RSS key
	port_conf.rx_adv_conf.rss_conf.rss_key = nullptr;
	port_conf.rx_adv_conf.rss_conf.rss_hf = dconf->rss_hf;
	port_conf.rxmode.offloads = dconf->rx_offloads;
	port_conf.txmode.offloads = dconf->tx_offloads;
	/* Configure the Ethernet device. */
	ret = rte_eth_dev_configure(dconf->portid, num_threads, num_threads, &port_conf);
	if (ret != 0)
		rte_exit(EXIT_FAILURE, "failed to configure port: %d\n", ret);
	// the driver may round the requested ring sizes; dconf is updated
	// in place so callers see the actual sizes
	ret = rte_eth_dev_adjust_nb_rx_tx_desc(dconf->portid, &dconf->rx_ring_sz, &dconf->tx_ring_sz);
	if (ret != 0)
		rte_exit(EXIT_FAILURE, "failed to set rx tx queue size: %d\n", ret);
	/* Allocate and set up 1 RX queue per thread per Ethernet port. */
	rxconf = dev_info.default_rxconf;
	rxconf.offloads = port_conf.rxmode.offloads;
	rxconf.rx_nseg = 0;
	rxconf.rx_seg = nullptr;
	txconf = dev_info.default_txconf;
	txconf.offloads = port_conf.txmode.offloads;
	int core;
	int qid = 0;
	CPU_FOREACH_ISSET(core, &dconf->core_affinity) {
		// keep each queue's descriptors and mbufs on the core's socket
		int socket = rte_lcore_to_socket_id(core);
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "port_init: setting up rx & tx queue for core %d (socket %d)...\n", core, socket);
		ret = rte_eth_rx_queue_setup(dconf->portid, qid, dconf->rx_ring_sz, socket, &rxconf, mempool_get(socket));
		if (ret < 0)
			rte_exit(EXIT_FAILURE, "failed to setup rx queue for core %d: %d\n", core, ret);
		ret = rte_eth_tx_queue_setup(dconf->portid, qid, dconf->tx_ring_sz, socket, &txconf);
		if (ret < 0)
			rte_exit(EXIT_FAILURE, "failed to setup tx queue for core %d: %d", core, ret);
		qid++;
	}
	// set mtu
	ret = rte_eth_dev_set_mtu(dconf->portid, dconf->mtu);
	if (ret != 0)
		rte_exit(EXIT_FAILURE, "failed to set mtu: %d\n", ret);
	ret = rte_eth_dev_start(dconf->portid);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "failed to start port: %d\n", ret);
	if (dconf->timesync) {
		ret = rte_eth_timesync_enable(dconf->portid);
		if (ret != 0)
			rte_exit(EXIT_FAILURE, "failed to enable timesync: %d\n", ret);
	}
	/* Enable RX in promiscuous mode for the Ethernet device. */
	ret = rte_eth_promiscuous_enable(dconf->portid);
	if (ret != 0)
		rte_exit(EXIT_FAILURE, "failed to enable promiscuous mode: %d\n", ret);
	// attach optional per-queue callbacks (e.g. software timestamping)
	for (int i = 0; i < num_threads; i++) {
		if (dconf->tx_fn != nullptr) {
			if (rte_eth_add_tx_callback(dconf->portid, i, dconf->tx_fn, dconf->tx_user) == nullptr) {
				rte_exit(EXIT_FAILURE, "failed to attach callback to tx queue %d\n", i);
			}
		}
		if (dconf->rx_fn != nullptr) {
			if (rte_eth_add_rx_callback(dconf->portid, i, dconf->rx_fn, dconf->rx_user) == nullptr) {
				rte_exit(EXIT_FAILURE, "failed to attach callback to rx queue %d\n", i);
			}
		}
	}
	// sync_port_clock(portid);
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "port_init: configured port %d on socket %d with mac addr %x:%x:%x:%x:%x:%x\n",
	    dconf->portid, rte_eth_dev_socket_id(dconf->portid),
	    addr.addr_bytes[0],
	    addr.addr_bytes[1],
	    addr.addr_bytes[2],
	    addr.addr_bytes[3],
	    addr.addr_bytes[4],
	    addr.addr_bytes[5]);
}
// One-shot bring-up of the networking stack: validate the NUMA topology,
// then create the per-socket mempools and configure/start the port.
void
dpdk_init(struct device_conf *dconf, struct mem_conf *mconf)
{
	const int nsockets = (int)rte_socket_count();
	if (rte_socket_count() > (int)MAX_NUMA_NODES) {
		rte_exit(EXIT_FAILURE, "too many numa nodes\n");
	}
	// g_mempools is indexed directly by socket id, so ids must form a
	// contiguous 0..n-1 range (a 1-1 index <-> id mapping)
	for (int idx = 0; idx < nsockets; idx++) {
		int sid = rte_socket_id_by_idx(idx);
		if (sid != idx) {
			rte_exit(EXIT_FAILURE, "socket %d has id %d instead.\n", idx, sid);
		}
	}
	mempool_init(mconf);
	port_init(dconf);
}
void
dpdk_cleanup(struct device_conf * dconf)
{
rte_eth_dev_stop(dconf->portid);
rte_eth_dev_close(dconf->portid);
for (int i = 0; i < (int)rte_socket_count(); i++) {
rte_mempool_free(g_mempools[i]);
}
}

66
net/libnetsup/portconf.cc Normal file
View File

@ -0,0 +1,66 @@
#include "rte_ethdev.h"
#include "net/netsup.hh"
#include <cstdlib>
// Per-driver port capability table: for each known PMD, the rx/tx offloads,
// RSS hash functions and timesync support used when configuring the port.
// Matched against rte_eth_dev_info's driver_name by portconf_get().
static struct port_conf port_confs[] = {
	{
		.driver_name = "net_cxgbe",
		.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM,
		.txoffload = RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
		.rss_hf = RTE_ETH_RSS_UDP | RTE_ETH_RSS_FRAG_IPV4,
		.timesync = false
	},
	{
		.driver_name = "net_i40e",
		.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM,
		.txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
		.rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD,
		.timesync = false
	},
	{
		.driver_name = "net_ice",
		.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM | RTE_ETH_RX_OFFLOAD_TIMESTAMP,
		.txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
		.rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD,
		.timesync = false
	},
	{
		.driver_name = "net_ixgbe",
		.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM,
		.txoffload = RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
		.rss_hf = RTE_ETH_RSS_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP,
		.timesync = true
	}
};
// Fallback configuration returned when the driver is not in the table.
static struct port_conf default_conf = {
	.driver_name = "default",
	.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM | RTE_ETH_RX_OFFLOAD_TIMESTAMP,
	.txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
	.rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD,
	.timesync = true
};
// Number of entries in port_confs.
static const int port_size = sizeof(port_confs) / sizeof(port_confs[0]);
int
portconf_get(int portid, struct port_conf * out)
{
struct rte_eth_dev_info dev_info {};
if (rte_eth_dev_info_get(portid, &dev_info) != 0) {
rte_exit(EXIT_FAILURE, "failed to obtain device info for port %d\n", portid);
}
for(int i = 0; i < port_size; i++) {
struct port_conf * conf = &port_confs[i];
if (strcmp(conf->driver_name, dev_info.driver_name) == 0) {
memcpy(out, conf, sizeof(struct port_conf));
return 0;
}
}
fprintf(stdout, "portconf_get: unable to find matching conf for port %d:%s, returning default conf.\n", portid, dev_info.driver_name);
memcpy(out, &default_conf, sizeof(struct port_conf));
return -1;
}

909
net/rat.cc Normal file
View File

@ -0,0 +1,909 @@
#include <atomic>
#include <cstddef>
#include <cstdlib>
#include <list>
#include <map>
#include <mutex>
#include <random>
#include <vector>
#include <sys/endian.h>
#include <topo.h>
#include <rte_byteorder.h>
#include <rte_common.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_ether.h>
#include <rte_launch.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include <unistd.h>
#include "ntr.h"
#include "gen.hh"
#include "net/netsup.hh"
#include "net/pkt.hh"
#include "nms.h"
constexpr static unsigned int BURST_SIZE = 32;
// Pack a thread id (top 8 bits) and a per-thread sequence number (low 24
// bits) into a single epoch word — the inverse of epoch_get_id() /
// epoch_get_epoch().
// BUGFIX: both fields are now masked so a sequence number larger than
// 0xFFFFFF (or an id larger than 0xFF) can no longer bleed into the other
// field's bits; previously `(id << 24) | epoch` let epoch bits 24+ corrupt
// the id.
static unsigned int
epoch_mk(unsigned int id, unsigned int epoch)
{
	return ((id & 0xFFu) << 24) | (epoch & 0x00FFFFFFu);
}
// Extract the thread id stored in the top 8 bits of an epoch word.
static unsigned int
epoch_get_id(unsigned int epoch)
{
	const unsigned int ID_SHIFT = 24;
	unsigned int id = epoch >> ID_SHIFT;
	return id;
}
// Extract the 24-bit per-thread sequence number from an epoch word,
// discarding the id bits.
static unsigned int
epoch_get_epoch(unsigned int epoch)
{
	// modulo 2^24 is equivalent to masking with 0x00FFFFFF
	return epoch % 0x01000000u;
}
// A single tracked epoch: the packed id+sequence word (see epoch_mk) and
// the local timestamp in ns when it was sent or received.
struct epoch_info {
	unsigned int epoch;
	uint64_t ts;
};
// Per-worker state for rat: one instance per lcore, allocated by the main
// thread and handed to locore_main(). rx/tx queue ids are 1:1 with the
// thread id.
struct thread_info {
	unsigned int id { 0 };
	unsigned int lcore_id { 0 };
	unsigned int rxqid { 0 };
	unsigned int txqid { 0 };
	int socket_id; // NUMA socket for mempool_get()
	// this field is read by the stat collecting thread
	std::atomic<int> recved_pkts { 0 };
	std::atomic<int> lost_pkts { 0 };
	Generator *ia_gen { nullptr }; // inter-arrival time distribution
	Generator *load_gen0 { nullptr }; // workload arg0 distribution
	Generator *load_gen1 { nullptr }; // workload arg1 distribution
	std::mutex
	    mtx; // this lock protects data shared between worker threads, i.e.:
	std::list<struct epoch_info *> recved_epochs;
	thread_info() = default;
};
// Global state machine for the run (stored in options.s_state).
constexpr static int STATE_SYNC = 0; // waiting for SYNC
constexpr static int STATE_SYNC_ACK = 1; // Waiting for sending SYNC_ACK
constexpr static int STATE_RUNNING = 2; // Running
constexpr static int STATE_FIN = 3; // FIN received
constexpr static int WORKLOAD_MAX_ARGS = 2;
// Run-wide configuration and shared runtime state. Plain fields come from
// the command line; s_-prefixed fields are populated or mutated at runtime
// and some are shared between threads (atomics).
struct options_t {
	unsigned int run_time { 5 };
	// parameters
	int slave_mode { 0 };
	uint32_t rage_quit_time { UINT32_MAX }; // ms without a response before aborting
	char ia_gen[256] { "fixed:0" }; // inter-arrival generator spec string
	char load_gen[WORKLOAD_MAX_ARGS][256] = {{"fixed:0"}, {"fixed:0"}};
	uint32_t workload_type {LOAD_TYPE_CPU};
	uint32_t target_qps { 0 };
	uint32_t depth { 1 }; // max packets in flight per thread
	struct net_spec server_spec { };
	cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 1 thread @ core 2
	uint32_t pkt_loss_delay_ms { UINT32_MAX }; // epoch timeout before counting a loss
	bool jumbo_frame_enabled { false };
	int pkt_pad_sz { 0 };
	int port_mtu { MAX_STANDARD_MTU };
	int portid { 0 };
	// states
	unsigned int s_num_threads { 1 }; // 1 thread
	struct net_spec s_host_spec { };
	struct net_spec s_master_spec { };
	struct conn_spec s_master_cspec {
		.src = &s_host_spec, .src_port = DEFAULT_RAT_PORT,
		.dst = &s_master_spec, .dst_port = DEFAULT_RAT_PORT,
	};
	std::vector<struct thread_info *> s_thr_info;
	std::atomic<int> s_state { STATE_RUNNING }; // default non master mode
	// states for qps
	std::atomic<uint64_t> s_ts_begin { 0 };
};
static struct options_t options;
// Aggregate the per-thread received/lost counters into run totals. Any of
// the out-pointers may be null to skip that statistic. qps is averaged over
// the window from s_ts_begin to `now` (both nanosecond timestamps).
static inline void
calc_stats(uint64_t now, uint32_t *qps, uint32_t *recved_pkt,
    uint32_t *total_loss)
{
	uint32_t total_recv = 0;
	uint32_t total_lost = 0;
	for (auto *thr : options.s_thr_info) {
		total_recv += thr->recved_pkts.load();
		total_lost += thr->lost_pkts.load();
	}
	if (recved_pkt != nullptr) {
		*recved_pkt = total_recv;
	}
	if (total_loss != nullptr) {
		*total_loss = total_lost;
	}
	if (qps != nullptr) {
		double elapsed_sec = (double)(now -
		    options.s_ts_begin.load()) / (double)S2NS;
		*qps = (uint32_t)((double)total_recv / elapsed_sec);
	}
}
// Slave-mode handshake loop: poll the rx queue until a SYNC packet arrives
// from the master ("cat"), record the master's net spec, reply with
// SYNC_ACK, and advance the global state SYNC -> SYNC_ACK -> RUNNING so all
// worker threads start. Returns once s_state leaves STATE_SYNC.
static void
proto_loop(struct thread_info *tinfo)
{
	struct rte_mbuf *tx_buf;
	struct rte_mbuf *rx_bufs[BURST_SIZE];
	struct pkt_hdr *pkt_data;
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "proto_loop <thread %d>: waiting for SYNC from cat\n", tinfo->id);
	while (options.s_state.load() == STATE_SYNC) {
		const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
		    tinfo->rxqid, rx_bufs, BURST_SIZE);
		if (nb_rx > 0) {
			for (int i = 0; i < nb_rx; i++) {
				struct pkt_hdr *each = check_valid_packet(
				    rx_bufs[i], &options.s_host_spec.mac_addr);
				if (each != nullptr) {
					uint16_t type = rte_be_to_cpu_16(
					    each->type);
					if (type == PKT_TYPE_SYNC) {
						int expected = STATE_SYNC;
						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_INFO,
						    "proto_loop <thread %d>: received SYNC from cat\n",
						    tinfo->id);
						// only one thread may claim
						// the handshake via cmpxchg
						if (!options.s_state
							 .compare_exchange_strong(
							     expected,
							     STATE_SYNC_ACK)) {
							// someone barged in,
							// listen to that guy
							ntr(NTR_DEP_USER1,
							    NTR_LEVEL_WARNING,
							    "proto_loop <thread %d>: failed to cmpxchg sync_recv.\n",
							    tinfo->id);
						} else {
							// remember who the
							// master is for all
							// later replies
							pkt_hdr_to_netspec(each,
							    &options
								 .s_master_spec,
							    nullptr, nullptr,
							    nullptr);
							if (alloc_pkt_hdr(
								mempool_get(
								    tinfo
									->socket_id),
								PKT_TYPE_SYNC_ACK,
								&options
								     .s_master_cspec,
								0, &tx_buf,
								&pkt_data) !=
							    0) {
								rte_exit(
								    EXIT_FAILURE,
								    "failed to alloc pkt hdr\n");
							}
							tx_burst_all(
							    options.portid,
							    tinfo->txqid,
							    &tx_buf, 1);
							expected =
							    STATE_SYNC_ACK;
							// we've done our job,
							// set off the threads
							if (!options.s_state
								 .compare_exchange_strong(
								     expected,
								     STATE_RUNNING)) {
								rte_exit(
								    EXIT_FAILURE,
								    "state unexpectedly changed\n");
							}
							ntr(NTR_DEP_USER1,
							    NTR_LEVEL_INFO,
							    "proto_loop <thread %d>: sent SYNC_ACK to cat\n",
							    tinfo->id);
						}
					} else {
						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_DEBUG,
						    "proto_loop <thread %d>: ignoring invalid packet %p type %d.\n",
						    tinfo->id,
						    (void *)rx_bufs[i], type);
					}
				} else {
					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
					    "proto_loop <thread %d>: ignoring invalid packet %p.\n",
					    tinfo->id, (void *)rx_bufs[i]);
					//dump_pkt(rx_bufs[i]);
				}
				rte_pktmbuf_free(rx_bufs[i]);
			}
		}
	}
	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
	    "proto_loop <thread %d>: exiting loop...\n", tinfo->id);
}
// Main load-generation loop for one worker thread. Each iteration:
//   1. drains the rx queue: LOAD_RESP epochs are routed (under that
//      thread's mutex) to the queue of the thread that sent them, a FIN
//      from the master is answered with a FIN_ACK carrying the final stats;
//   2. drains epochs other threads routed to us and matches them against
//      sent_epochs, counting received packets;
//   3. expires unanswered epochs older than pkt_loss_delay_ms, counting
//      losses;
//   4. sends new LOAD packets paced by ia_gen, bounded by the in-flight
//      depth and BURST_SIZE, randomizing ports for RSS spread;
//   5. rage-quits the process when nothing has been received for
//      rage_quit_time ms.
// Runs until s_state leaves STATE_RUNNING.
static void
pkt_loop(struct thread_info *tinfo)
{
	struct rte_mbuf *tx_bufs[BURST_SIZE];
	struct rte_mbuf *rx_bufs[BURST_SIZE];
	std::vector<struct epoch_info *> recved_epochs;
	// epochs sent but not yet acknowledged, keyed by the packed word
	std::map<unsigned int, struct epoch_info *> sent_epochs;
	uint64_t cur_epoch = 0;
	uint64_t next_ts;
	uint64_t last_recv_ts = 0;
	struct conn_spec srv_cspec;
	rdport_generator src_port_gen(MIN_RANDOM_PORT);
	rdport_generator dst_port_gen(MIN_RANDOM_PORT);
	srv_cspec.src = &options.s_host_spec;
	srv_cspec.dst = &options.server_spec;
	next_ts = topo_uptime_ns();
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "pkt_loop <thread %d>: entering\n",
	    tinfo->id);
	while (options.s_state.load() == STATE_RUNNING) {
		uint64_t now = topo_uptime_ns();
		// always pop incoming packets
		const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
		    tinfo->rxqid, rx_bufs, BURST_SIZE);
		if (nb_rx > 0) {
			for (int i = 0; i < nb_rx; i++) {
				struct pkt_hdr *each = check_valid_packet(
				    rx_bufs[i], &options.s_host_spec.mac_addr);
				if (each == nullptr) {
					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
					    "pkt_loop <thread %d>: ignoring invalid packet %p.\n",
					    tinfo->id, (void *)rx_bufs[i]);
					rte_pktmbuf_free(rx_bufs[i]);
					continue;
				}
				uint16_t type = rte_be_to_cpu_16(each->type);
				NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, each,
				    "locore_main <thread %d>: ", tinfo->id);
				struct pkt_payload_epoch *pld_epoch;
				struct epoch_info *einfo;
				uint32_t epoch;
				uint32_t id;
				struct thread_info *other_t;
				int int_expected = STATE_RUNNING;
				switch (type) {
				case PKT_TYPE_LOAD_RESP:
					pld_epoch = (struct pkt_payload_epoch *)
							each->payload;
					epoch = rte_be_to_cpu_32(
					    pld_epoch->epoch);
					id = epoch_get_id(epoch);
					// printf("Load resp size : %d\n",
					// rx_bufs[i]->data_len);
					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
					    "pkt_loop <thread %d>: packet %p epoch 0x%x id %d.\n",
					    tinfo->id, (void *)rx_bufs[i],
					    epoch, id);
					if (id >= options.s_num_threads) {
						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_WARNING,
						    "pkt_loop <thread %d>: packet %p invalid id %d.\n",
						    tinfo->id,
						    (void *)rx_bufs[i], id);
						break;
					}
					// hand the epoch to the thread that
					// sent it (RSS may deliver it here)
					einfo = new struct epoch_info;
					einfo->epoch = epoch;
					einfo->ts = now;
					other_t = options.s_thr_info.at(id);
					other_t->mtx.lock();
					other_t->recved_epochs.push_back(einfo);
					other_t->mtx.unlock();
					break;
				case PKT_TYPE_FIN:
					// only accept a FIN from the master's
					// MAC address
					if (rte_is_same_ether_addr(
						&each->eth_hdr.src_addr,
						&options.s_master_spec
						     .mac_addr)) {
						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_DEBUG,
						    "pkt_loop <thread %d>: recved FIN from cat.\n",
						    tinfo->id);
						// master told us to stop!
						if (!options.s_state
							 .compare_exchange_strong(
							     int_expected,
							     STATE_FIN)) {
							ntr(NTR_DEP_USER1,
							    NTR_LEVEL_WARNING,
							    "pkt_loop <thread %d>: failed to cmpxchg state.\n",
							    tinfo->id);
						}
						uint32_t qps;
						uint32_t total_recv;
						uint32_t total_loss;
						calc_stats(now, &qps,
						    &total_recv, &total_loss);
						struct pkt_hdr *pkt_hdr;
						if (alloc_pkt_hdr(
							mempool_get(
							    tinfo->socket_id),
							PKT_TYPE_FIN_ACK,
							&options.s_master_cspec,
							0, &tx_bufs[0],
							&pkt_hdr) != 0) {
							rte_exit(EXIT_FAILURE,
							    "failed to allocate pkt hdr\n");
						}
						auto pld_qps =
						    (struct pkt_payload_qps *)
							pkt_hdr->payload;
						pld_qps->qps = rte_cpu_to_be_32(
						    qps);
						pld_qps->recved_pkts =
						    rte_cpu_to_be_32(
							total_recv);
						pld_qps->lost_pkts =
						    rte_cpu_to_be_32(
							total_loss);
						tx_burst_all(options.portid,
						    tinfo->txqid, &tx_bufs[0],
						    1);
						options.s_state.store(
						    STATE_FIN);
						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_DEBUG,
						    "pkt_loop <thread %d>: sent FIN_ACK to cat. QPS = %d.\n",
						    tinfo->id, qps);
					} else {
						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_WARNING,
						    "pkt_loop <thread %d>: invalid FIN packet from a different cat.\n",
						    tinfo->id);
					}
					break;
				default:
					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
					    "pkt_loop: ignoring packet %p with unknown type %d.\n",
					    (void *)rx_bufs[i], type);
				}
				rte_pktmbuf_free(rx_bufs[i]);
			}
		}
		// dequeue receved epochs
		struct epoch_info *einfo;
		tinfo->mtx.lock();
		while (!tinfo->recved_epochs.empty()) {
			// only dequeue, process later
			einfo = tinfo->recved_epochs.front();
			tinfo->recved_epochs.pop_front();
			// XXX: might call into the allocator
			// otherwise we need to have an array and do batching
			// => complex code and don't think it's worth it
			recved_epochs.push_back(einfo);
		}
		tinfo->mtx.unlock();
		if (!recved_epochs.empty())
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "pkt_loop <thread %d>: dequeued %lu received epochs\n",
			    tinfo->id, recved_epochs.size());
		// process epochs
		while (!recved_epochs.empty()) {
			einfo = recved_epochs.back();
			recved_epochs.pop_back();
			auto it = sent_epochs.find(einfo->epoch);
			if (it != sent_epochs.end()) {
				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
				    "pkt_loop <thread %d>: received epoch 0x%x\n",
				    tinfo->id, epoch_get_epoch(einfo->epoch));
				if (einfo->ts > last_recv_ts) {
					last_recv_ts = einfo->ts;
				}
				delete it->second;
				sent_epochs.erase(it);
				tinfo->recved_pkts.fetch_add(1);
			} else {
				// we recved an epoch we never sent
				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
				    "pkt_loop <thread %d>: received epoch 0x%x but never sent it. Packet loss?\n",
				    tinfo->id, einfo->epoch);
			}
			delete einfo;
		}
		// handle packet loss
		for (auto it = sent_epochs.begin(); it != sent_epochs.end();) {
			einfo = it->second;
			if (now - einfo->ts >
			    options.pkt_loss_delay_ms * MS2NS) {
				// timed out
				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
				    "pkt_loop <thread %d>: epoch 0x%x is lost after not receiving for too long\n",
				    tinfo->id, einfo->epoch);
				delete it->second;
				it = sent_epochs.erase(it);
				tinfo->lost_pkts.fetch_add(1);
			} else {
				++it;
			}
		}
		// check to send the next packet
		uint32_t total_send = 0;
		while (now >= next_ts && sent_epochs.size() < options.depth &&
		    total_send < BURST_SIZE) {
			struct pkt_payload_load *pld_load;
			struct pkt_hdr *pkt_data;
			next_ts += (int)(tinfo->ia_gen->generate() * S2NS);
			// change dst port for every packet for RSS
			srv_cspec.dst_port = dst_port_gen.next();
			srv_cspec.src_port = src_port_gen.next();
			if (alloc_pkt_hdr(mempool_get(tinfo->socket_id),
				PKT_TYPE_LOAD, &srv_cspec, options.pkt_pad_sz,
				&tx_bufs[total_send], &pkt_data) != 0) {
				rte_exit(EXIT_FAILURE,
				    "failed to allocate pkt hdr\n");
			}
			pld_load = (struct pkt_payload_load *)pkt_data->payload;
			pld_load->type = rte_cpu_to_be_32(options.workload_type);
			pld_load->arg0 = rte_cpu_to_be_32((uint32_t)tinfo->load_gen0->generate());
			pld_load->arg1 = rte_cpu_to_be_32((uint32_t)tinfo->load_gen1->generate());
			unsigned int epoch = epoch_mk(tinfo->id, cur_epoch);
			pld_load->epoch = rte_cpu_to_be_32(epoch);
			cur_epoch++;
			einfo = new struct epoch_info;
			einfo->epoch = epoch;
			einfo->ts = now;
			sent_epochs.insert({ epoch, einfo });
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "pkt_loop <thread %d>: sending packet %p with epoch 0x%x\n",
			    tinfo->id, (void *)tx_bufs[total_send], epoch);
			total_send++;
		}
		tx_burst_all(options.portid, tinfo->txqid, tx_bufs, total_send);
		// check rage quit only when we have sent a packet
		if (last_recv_ts == 0) {
			last_recv_ts = topo_uptime_ns();
		}
		if (topo_uptime_ns() >
		    options.rage_quit_time * MS2NS + last_recv_ts) {
			rte_exit(EXIT_FAILURE,
			    "rat: thread %d waiting too long for resp. I F QUIT!\n",
			    tinfo->id);
		}
	}
	// clean up
	for (auto it = sent_epochs.begin(); it != sent_epochs.end();) {
		delete it->second;
		++it;
	}
	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
	    "pkt_loop <thread %d>: exiting loop...\n", tinfo->id);
}
// Worker entry point, run on each lcore via rte_eal_remote_launch(). In
// slave mode it first performs the SYNC handshake (proto_loop), then all
// threads spin until the global state reaches RUNNING, record the start
// timestamp, and enter the packet loop until FIN.
static int
locore_main(void *tif)
{
	auto tinfo = (struct thread_info *)tif;
	uint32_t core_id = rte_lcore_id();
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "locore_main <thread %d>: running on core %d rxqid %d txqid %d...\n", tinfo->id,
	    core_id, tinfo->rxqid, tinfo->txqid);
	// warn when the NIC is on a different NUMA node than this core
	if (rte_eth_dev_socket_id(options.portid) > 0 &&
	    rte_eth_dev_socket_id(options.portid) != (int)rte_socket_id()) {
		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
		    "locore_main <thread %d>: WARNING, port %d is on remote NUMA node to "
		    "polling thread.\n\tPerformance will "
		    "not be optimal.\n",
		    tinfo->id, options.portid);
	}
	if (options.slave_mode == 1) {
		// perform rat protocol
		proto_loop(tinfo);
	}
	// wait for the primary thread sending SYNC_ACK
	while (options.s_state.load() != STATE_RUNNING) {
	}
	// store the current timestamp
	options.s_ts_begin.store(topo_uptime_ns());
	pkt_loop(tinfo);
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main <thread %d>: exited\n",
	    tinfo->id);
	return 0;
}
// Log the effective rat configuration at INFO level. Called once from
// main() after argument parsing.
// BUGFIX: the rage-quit line used "%ul", which printf parses as %u followed
// by a literal 'l' — corrected to "%u".
static void
dump_options()
{
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "Configuration:\n"
	    "    verbosity = +%d\n"
	    "    run time = %d\n"
	    "    num threads = %d\n"
	    "    rage quit time = %u\n"
	    "    slave mode = %d\n"
	    "    interarrival dist = %s\n"
	    "    workload type = %d\n"
	    "    workload arg0 = %s\n"
	    "    workload arg1 = %s\n"
	    "    qps = %d\n"
	    "    host IP = 0x%x\n"
	    "    depth = %u\n"
	    "    packet loss time threshold = %u\n"
	    "    jumbo frame = %d\n"
	    "    packet pad size = %d\n"
	    "    portid = %d\n",
	    ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.run_time,
	    options.s_num_threads, options.rage_quit_time, options.slave_mode,
	    options.ia_gen, options.workload_type, options.load_gen[0], options.load_gen[1], options.target_qps,
	    options.s_host_spec.ip, options.depth, options.pkt_loss_delay_ms,
	    options.jumbo_frame_enabled, options.pkt_pad_sz, options.portid);
}
// Print the command-line help for rat to stdout. Note that -w is positional
// when repeated: the first occurrence is the workload type, the next two are
// the arg0/arg1 distributions (see the 'w' case in main()).
static void
usage()
{
	fprintf(stdout,
	    "Usage:\n"
	    "    -v(vv): verbose mode\n"
	    "    -h: display the information\n"
	    "    -t: run time\n"
	    "    -s: server net spec\n"
	    "    -S: slave(rat) mode\n"
	    "    -A: affinity mask\n"
	    "    -i: inter-arrival time distribution\n"
	    "    -w: workload type\n"
	    "    -w (repeated): workload arg0 distribution\n"
	    "    -w (repeated): workload arg1 distribution\n"
	    "    -r: rage quit time (in ms)\n"
	    "    -q: target QPS\n"
	    "    -H: host net spec\n"
	    "    -D: max number of packets in flight\n"
	    "    -l: packet loss time threshold\n"
	    "    -J: enable jumbo frame\n"
	    "    -P: pad load packets to this size\n"
	    "    -p: portid\n");
}
int
main(int argc, char *argv[])
{
struct thread_info *tinfo;
bool has_host_spec = false;
ntr_init();
// init dpdk
int ret = rte_eal_init(argc, argv);
if (ret < 0) {
rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n");
}
argc -= ret;
argv += ret;
// set warning level
ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
{
int c;
int num_of_ws = 0;
// parse arguments
while ((c = getopt(argc, argv,
"vht:s:SA:i:w:r:q:H:D:l:JP:p:")) != -1) {
switch (c) {
case 'v':
ntr_set_level(NTR_DEP_USER1,
ntr_get_level(NTR_DEP_USER1) + 1);
break;
case 'h':
usage();
rte_exit(EXIT_SUCCESS, "\n");
case 't':
options.run_time = strtol(optarg, nullptr, 10);
break;
case 's':
if (str_to_netspec(optarg,
&options.server_spec) != 0) {
rte_exit(EXIT_FAILURE,
"invalid server net spec\n");
}
break;
case 'S':
options.slave_mode = 1;
options.s_state =
STATE_SYNC; // set state to wait for SYNC
break;
case 'A':
cpulist_to_cpuset(optarg, &options.cpu_set);
options.s_num_threads = CPU_COUNT(
&options.cpu_set);
if (options.s_num_threads == 0) {
rte_exit(EXIT_FAILURE,
"invalid cpu mask %s\n", optarg);
}
break;
case 'i':
strncpy(options.ia_gen, optarg,
sizeof(options.ia_gen) - 1);
break;
case 'w':
if (num_of_ws == 0) {
options.workload_type = strtol(optarg, NULL, 10);
if (options.workload_type >= LOAD_TYPE_MAX) {
rte_exit(EXIT_FAILURE,
"invalid workload type %s\n", optarg);
}
} else if (num_of_ws <= WORKLOAD_MAX_ARGS) {
strncpy(options.load_gen[num_of_ws - 1], optarg, 255);
}
num_of_ws++;
break;
case 'r':
options.rage_quit_time = strtol(optarg, nullptr,
10);
break;
case 'q':
options.target_qps = strtol(optarg, nullptr,
10);
break;
case 'H':
has_host_spec = true;
if (str_to_netspec(optarg,
&options.s_host_spec) != 0) {
rte_exit(EXIT_FAILURE,
"invalid host net spec.\n");
}
break;
case 'D':
options.depth = strtol(optarg, nullptr, 10);
if (options.depth == 0) {
options.depth = UINT32_MAX;
}
break;
case 'l':
options.pkt_loss_delay_ms = strtol(optarg,
nullptr, 10);
if (options.pkt_loss_delay_ms == 0) {
options.pkt_loss_delay_ms = UINT32_MAX;
}
break;
case 'J':
options.jumbo_frame_enabled = true;
options.port_mtu = MAX_JUMBO_MTU;
break;
case 'P':
options.pkt_pad_sz = strtol(optarg, nullptr,
10);
break;
case 'p':
options.portid = strtol(optarg, nullptr, 10);
break;
default:
usage();
rte_exit(EXIT_FAILURE, "unknown argument: %c\n",
c);
}
}
}
if (options.pkt_pad_sz != 0 &&
options.pkt_pad_sz > mtu_to_pkt_size(options.port_mtu)) {
rte_exit(EXIT_FAILURE, "pkt_pad_sz is too large for mtu %d\n",
options.port_mtu);
}
if (!has_host_spec) {
rte_exit(EXIT_FAILURE, "Must specify host IP.\n");
}
// init libtopo
if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
0) {
rte_exit(EXIT_FAILURE, "libtopo init failed!\n");
}
if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
0) {
rte_exit(EXIT_FAILURE, "libnms init failed!\n");
}
dump_options();
// configure memory and port
struct port_conf pconf;
struct device_conf dconf;
struct mem_conf mconf;
portconf_get(options.portid, &pconf);
if (!pconf.timesync) {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
"main: timesync disabled. hw timestamp unavailable.\n ");
}
dconf.mtu = options.port_mtu;
CPU_COPY(&options.cpu_set, &dconf.core_affinity);
dconf.portid = options.portid;
dconf.rss_hf = pconf.rss_hf;
dconf.rx_offloads = pconf.rxoffload;
dconf.tx_offloads = pconf.txoffload;
dconf.timesync = pconf.timesync;
dconf.rx_fn = nullptr;
dconf.rx_user = nullptr;
dconf.rx_ring_sz = 2048;
dconf.tx_fn = nullptr;
dconf.tx_user = nullptr;
dconf.tx_ring_sz = 2048;
mconf.cache_size = 512;
mconf.priv_size = 0;
mconf.num_elements = (dconf.rx_ring_sz + dconf.tx_ring_sz) *
rte_lcore_count() / rte_socket_count();
mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_JUMBO_MTU -
MAX_STANDARD_MTU;
mconf.max_pools = -1;
dpdk_init(&dconf, &mconf);
if (rte_eth_macaddr_get(options.portid,
&options.s_host_spec.mac_addr) != 0) {
rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
options.portid);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n",
options.portid, options.s_host_spec.mac_addr.addr_bytes[0],
options.s_host_spec.mac_addr.addr_bytes[1],
options.s_host_spec.mac_addr.addr_bytes[2],
options.s_host_spec.mac_addr.addr_bytes[3],
options.s_host_spec.mac_addr.addr_bytes[4],
options.s_host_spec.mac_addr.addr_bytes[5]);
unsigned int cpuset_idx = CPU_FFS(&options.cpu_set);
unsigned int tid = 0;
while (cpuset_idx != 0) {
unsigned int lcore_id = cpuset_idx - 1;
tinfo = new thread_info;
tinfo->ia_gen = createGenerator(options.ia_gen);
tinfo->load_gen0 = createGenerator(options.load_gen[0]);
tinfo->load_gen1 = createGenerator(options.load_gen[1]);
if (tinfo->ia_gen == nullptr || tinfo->load_gen0 == nullptr || tinfo->load_gen1 == nullptr) {
rte_exit(EXIT_FAILURE,
"invalid ia_gen or ld_gen string\n");
}
tinfo->ia_gen->set_lambda((double)options.target_qps /
(double)(options.s_num_threads));
tinfo->id = tid;
tinfo->lcore_id = lcore_id;
tinfo->socket_id = rte_lcore_to_socket_id(lcore_id);
tinfo->rxqid = tid;
tinfo->txqid = tid;
options.s_thr_info.push_back(tinfo);
tid++;
CPU_CLR(lcore_id, &options.cpu_set);
cpuset_idx = CPU_FFS(&options.cpu_set);
}
sleep(INIT_DELAY);
for (unsigned int i = 0; i < options.s_num_threads; i++) {
tinfo = options.s_thr_info.at(i);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"main: launching thread %d on locore %d\n", tinfo->id,
tinfo->lcore_id);
if (rte_eal_remote_launch(locore_main,
(void *)options.s_thr_info.at(i),
tinfo->lcore_id) != 0) {
rte_exit(EXIT_FAILURE,
"failed to launch function on locore %d\n",
tinfo->lcore_id);
}
}
// poor man's timer
uint32_t second = 0;
// this loop exit is signaled by SYNC_FIN in slave mode and by itself in
// non slave mode
while (options.s_state.load() != STATE_FIN) {
if (options.slave_mode != 1) {
if (second >= options.run_time) {
options.s_state.store(STATE_FIN);
break;
}
usleep(1 * S2US);
second++;
}
}
for (unsigned int i = 0; i < options.s_num_threads; i++) {
tinfo = options.s_thr_info.at(i);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"main: waiting for locore %d...\n", tinfo->lcore_id);
if (rte_eal_wait_lcore(tinfo->lcore_id) != 0) {
rte_exit(EXIT_FAILURE, "failed to wait for locore %d\n",
tinfo->lcore_id);
}
}
uint32_t qps;
uint32_t total_recv;
uint32_t total_loss;
calc_stats(topo_uptime_ns(), &qps, &total_recv, &total_loss);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "qps = %d, recv = %d, loss = %d\n",
qps, total_recv, total_loss);
for (auto each : options.s_thr_info) {
delete each->load_gen0;
delete each->load_gen1;
delete each->ia_gen;
delete each;
}
// clean up
dpdk_cleanup(&dconf);
return 0;
}

50
scripts/cc_pin.py Normal file
View File

@ -0,0 +1,50 @@
import os
import sys
import getopt
import subprocess

# Pin each cxgbe (t6nex) NIC queue irq to its own CPU core, starting at
# `base` and advancing `stride` cores per irq (FreeBSD `cpuset -x`).
#   -b  first core to use (default 0)
#   -s  core stride between consecutive irqs (default 2)
#   -d  t6nex device number (default 0)
#   -p  port number on that device (default 0)
options = getopt.getopt(sys.argv[1:], 'b:s:d:p:')[0]
base = 0
stride = 2
num = 0
port = 0
for opt, arg in options:
    if opt == '-b':
        base = int(arg)
    elif opt == '-s':
        stride = int(arg)
    elif opt == '-d':
        num = int(arg)
    elif opt == '-p':
        port = int(arg)

result = subprocess.run("sysctl -a", shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
lines = result.stdout.decode().split('\n')

# Keep only the sysctl lines that describe irqs of the selected nexus/port.
cclines = []
for line in lines:
    if ("irq" in line) and (f"t6nex{num}" in line) and (f"{port}a" in line):
        cclines.append(line)
if len(cclines) == 0:
    print(f"No t6nex {num}a lines from sysctl.\n")
    exit(1)

# Each matching line starts with "irq<N>:"; extract N.
irqs = []
for line in cclines:
    eles = line.split(' ')
    irq = eles[0]
    if irq.startswith("irq") and irq.endswith(":"):
        irqs.append(int(irq[3:-1]))
    else:
        # BUGFIX: message was f"Unknown line format: f{line}" — the stray
        # 'f' before the brace was printed literally.
        print(f"Unknown line format: {line}")
print(f"Detected {len(irqs)} irqs:\n{str(irqs)}")

for irq in irqs:
    print(f"Setting irq{irq}'s affinity to core {base}...")
    subprocess.run(f"cpuset -l {base} -x {irq}", check=True, shell=True)
    base = base + stride
exit(0)

9
scripts/copy-mount.sh Executable file
View File

@ -0,0 +1,9 @@
#!/bin/sh
# Push the mount helper scripts to every lab box, preserving the original
# host-by-host order (mount.sh then mount_small.sh per host).
for host in icelake1-int milan1-int icelake2-int milan2-int; do
    for script in mount.sh mount_small.sh; do
        scp -P77 "$script" "oscar@${host}.rcs.uwaterloo.ca:~/"
    done
done

230
scripts/dpdk.py Executable file
View File

@ -0,0 +1,230 @@
from cgi import test
from site import abs_paths
import subprocess as sp
import time
import select
import os
import datetime
import pwd
import sys
import getopt
import numpy as np
import re
import libpar as par
import libtc as tc
import libmechspec as mechspec
import netexp
# When True, only probe the maximum QPS point and skip the QPS sweep.
only_max_qps = True
# [[counter names], counting mode (0 = sampling, 1 = counting)]
# The single "" entry means "run once with PMC disabled" (see main(): an
# empty element leaves conf.enable_pmc False).
pmc_counters = [
    "",
    # [["mem_load_l3_miss_retired.local_dram"], 1],
    # [["mem_load_l3_miss_retired.remote_dram"], 1],
    # [["mem_load_l3_miss_retired.remote_hitm"], 1],
    # [["mem_load_l3_miss_retired.remote_fwd"], 1]
    # [["mem_trans_retired.load_latency_gt_8"], 0],
    # [["mem_trans_retired.load_latency_gt_16"], 0],
    # [["mem_trans_retired.load_latency_gt_32"], 0],
    # [["mem_trans_retired.load_latency_gt_64"], 0],
    # [["mem_trans_retired.load_latency_gt_128"], 0],
    # [["mem_trans_retired.load_latency_gt_256"], 0],
    # [["mem_trans_retired.load_latency_gt_512"], 0],
    #[["mem_trans_retired.load_latency_gt_8", ""], 0],
]
# pkt_pad
clt_pkt_pads = [
    0,
    # 256,
    # 512,
    # 1024,
    # 2048,
    # 4096,
    # 8192
]
# Per-pad-size pipeline depth used by the client (larger packets -> shallower).
clt_pkt_pads_depth = {}
clt_pkt_pads_depth[0] = 8
clt_pkt_pads_depth[256] = 6
clt_pkt_pads_depth[512] = 6
clt_pkt_pads_depth[1024] = 4
clt_pkt_pads_depth[1518] = 4
clt_pkt_pads_depth[2048] = 2
clt_pkt_pads_depth[4096] = 2
clt_pkt_pads_depth[8192] = 1
clt_pkt_pads_depth[9018] = 1
# clt_load
# Each entry: [workload type, workload arg0 generator, workload arg1 generator].
clt_wrkld = [
    [0, "fixed:0", "fixed:0"],
    # [0, "uniform:1000", "fixed:0"],
    # [0, "uniform:100", "fixed:0"],
    # [0, "uniform:10", "fixed:0"],
    # [1, "uniform:480", "uniform:1024"],
    # [1, "uniform:480", "uniform:256"],
    # [1, "uniform:480", "uniform:64"]
]
# paths
file_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.join(file_dir,"..")
# [srv_affinity, OPTIONAL( memgen_affinity, iteration, buffer_size, target_dom )]
server_affinity = [
    ["1,3,5,7,9,11,13,15,17,19,21,23"],
    ["25,27,29,31,33,35,37,39,41,43,45,47"],
    #["1,3,5,7,9,11,13,15,17,19,21,23", "26,28,30,32,34,36,38,40,42,44,46", -1, 512*1024*1024, 0],
    #["25,27,29,31,33,35,37,39,41,43,45,47", "2,4,6,8,10,12,14,16,18,20,22", -1, 512*1024*1024, 1],
    # "65,67,69,71,73,75,77,79,81,83,85,87,89,91,93,95,97,99,101,103,105,107,109,111,113,115,117,119,121,123,125,127",
    # "1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63",
    # "1,3,5,7,9,11,13,15",
    # "17,19,21,23,25,27,29,31",
    # "33,35,37,39,41,43,45,47",
    # "49,51,53,55,57,59,61,63"
]
def flush_netresult(conf : netexp.NetExpConf, result : netexp.NetExpResult):
    # Persist one run's artifacts into the current test output dir
    # (<odir>/<qps>.txt sample, <odir>/<qps>.pmc counters) and log a summary.
    sample_out = tc.get_odir() + "/" + str(result.parser.qps) + ".txt"
    with open(sample_out, "w") as f:
        f.write(result.sample)
    if conf.enable_pmc:
        pmc_out = tc.get_odir() + "/" + str(result.parser.qps) + ".pmc"
        if conf.pmc_mode != 0:
            # counting mode: pmc_parser holds parsed text
            with open(pmc_out, "w") as f:
                f.write(result.pmc_parser.raw)
        else:
            # sampling mode: pmc_parser is [raw bytes, processed text]
            with open(pmc_out, "wb") as f:
                f.write(result.pmc_parser[0])
            with open(pmc_out + "_parsed", "w") as g:
                g.write(result.pmc_parser[1])
    # Loss percentages are computed over recv+loss for master and slave.
    tc.log_print("=== Summary - qps: " + str(result.parser.qps) + " master loss: " + str(float(result.parser.master_loss) / float(result.parser.master_recv + result.parser.master_loss) * 100.00) + "% slave loss: " + str(float(result.parser.slave_loss) / float(result.parser.slave_recv + result.parser.slave_loss) * 100.0) + "%" )
    tc.log_print("=== Server HW:")
    tc.log_print(par.mutilate_data.build_mut_output(result.parser.srv_hwlat, [result.parser.qps]) + "\n")
    tc.log_print("=== Server SW:")
    tc.log_print(par.mutilate_data.build_mut_output(result.parser.srv_swlat, [result.parser.qps]) + "\n")
    tc.log_print("=== Client HW:")
    tc.log_print(par.mutilate_data.build_mut_output(result.parser.clt_hwlat, [result.parser.qps]) + "\n")
    tc.log_print("=== Client SW:")
    tc.log_print(par.mutilate_data.build_mut_output(result.parser.clt_swlat, [result.parser.qps]) + "\n")
    if conf.enable_pmc:
        if conf.pmc_mode != 0:
            tc.log_print("=== PMC:")
            tc.log_print("counter: " + result.pmc_parser.counter + " count: " + str(result.pmc_parser.count) + " cores: " + str(result.pmc_parser.cores))
def main():
    """Drive the full experiment matrix: for every server affinity, packet
    pad, workload, and PMC configuration, find max QPS (and optionally sweep
    QPS up to it), flushing results after every run.

    Flags: -s stop all remote processes, -c client-only mode,
    -S remote bench setup, -D remote dpdk setup.
    """
    tc.set_ssh_param("-o StrictHostKeyChecking=no -p77")
    tc.set_ssh_user("oscar")
    output_dirname = "run"
    conf = netexp.NetExpConf()
    conf.srv_mechspec = mechspec.LAB.SKYLAKE1_10G
    conf.clt_mechspecs = [mechspec.LAB.SKYLAKE3_10G, mechspec.LAB.SKYLAKE5_10G]
    conf.mst_mechspec = mechspec.LAB.SKYLAKE2_10G
    conf.finalize_mechspecs()
    conf.root_dir = "/numam.d/build/bin"
    # server fixed configs
    conf.srv_port = 0
    # client fixed configs
    conf.clt_ia = "exponential"
    conf.clt_affinity = "1,3,5,7,9,11,13,15,17,19,21,23"
    conf.clt_port = 0
    conf.clt_pkt_loss_lat = 5000
    conf.clt_rage_quit_lat = 5000
    # master fixed configs
    conf.mst_port = 0
    conf.mst_warmup = 5
    conf.mst_duration = 20
    conf.mst_qps = 100
    conf.mst_ia = "exponential"
    conf.mst_pkt_loss_lat = 5000
    conf.mst_pkt_loss_max = 100
    conf.mst_affinity = "2"
    # pmc stuff
    conf.pmc_sampling_rate = 4096
    conf.pmc_counting_interval = 0.1
    options = getopt.getopt(sys.argv[1:], 'scSD')[0]
    for opt, arg in options:
        if opt in ('-s'):
            netexp.stop_all(conf)
            return
        elif opt in ('-c'):
            conf.enable_client_only = True
        elif opt in ('-S'):
            netexp.setup(conf, bench = True, dpdk = False)
            return
        elif opt in ('-D'):
            netexp.setup(conf, bench = False, dpdk = True)
            return
    tc.init("~/results.d/numam_neo/" + output_dirname + "_" + datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
    # keep a copy of this script next to the results for provenance
    cpcmd = "cp " + __file__ + " " + tc.get_odir() + "/"
    tc.log_print(cpcmd)
    sp.check_call(cpcmd, shell=True)
    for eaff in server_affinity:
        conf.srv_affinity = eaff[0]
        conf.enable_memgen = False
        if len(eaff) > 1:
            conf.enable_memgen = True
            conf.memgen_affinity = eaff[1]
            conf.memgen_iteration = eaff[2]
            conf.memgen_size = eaff[3]
            conf.memgen_tgtdom = eaff[4]
        for epad in clt_pkt_pads:
            # BUGFIX: was hard-coded `conf.clt_pkt_pad = 0`, which ignored the
            # loop variable and ran every iteration with pad 0.
            conf.clt_pkt_pad = epad
            conf.clt_pkt_depth = clt_pkt_pads_depth[conf.clt_pkt_pad]
            for eload in clt_wrkld:
                conf.clt_wrkld = eload[0]
                conf.clt_wrkarg0 = eload[1]
                conf.clt_wrkarg1 = eload[2]
                for epmc in pmc_counters:
                    conf.enable_pmc = False
                    if len(epmc) > 0:
                        conf.enable_pmc = True
                        conf.pmc_counters = epmc[0]
                        conf.pmc_mode = epmc[1]
                    test_name = "affinity" + eaff[0] + "_pad" + str(epad) + "_load" + str(eload[0]) + "," + str(eload[1]) + "," + str(eload[2])
                    if (conf.enable_memgen):
                        test_name += "_memload" + str(eaff[1]) + "," + str(eaff[2]) + "," + str(eaff[3]) + "," + str(eaff[4])
                    if (conf.enable_pmc):
                        test_name += "_pmc" + str(epmc[1]) + "_" + conf.get_pmc_str()
                    tc.begin(test_name)
                    # first run unthrottled to discover the max QPS
                    conf.clt_qps = 0
                    tc.log_print("============ " + test_name + " QPS: MAX ============")
                    result : netexp.NetExpResult = netexp.run(conf)
                    flush_netresult(conf, result)
                    max_qps = result.parser.qps
                    if conf.enable_client_only:
                        return
                    if only_max_qps:
                        continue
                    # sweep QPS in 10 steps up to just below max
                    finish = (int)(max_qps - max(conf.mst_qps, 0.01 * max_qps))
                    step = (int)(finish / 10)
                    cur_qps = step
                    while cur_qps <= finish:
                        tc.log_print("============ " + test_name + " QPS: " + str(cur_qps) + " ============")
                        conf.clt_qps = cur_qps
                        result : netexp.NetExpResult = netexp.run(conf)
                        # BUGFIX: was flush_netresult(result) — missing the
                        # required `conf` argument (TypeError at runtime).
                        flush_netresult(conf, result)
                        cur_qps += step
                    tc.log_print("")
                    tc.end()
    netexp.stop_all(conf)
main()

132
scripts/graph.py Executable file
View File

@ -0,0 +1,132 @@
#!/usr/bin/env python3.6
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import ticker
import numpy as np
import sys
import re
import os
import json
import libpar as par
import getopt
import math
import concurrent.futures as CF
def process_dir(rootdir):
    """Parse every .txt sample file directly under rootdir with khat_parser.

    Files with at most one line are skipped; files that fail to parse are
    reported and ignored. Returns the list of populated parsers.
    """
    ret = []
    print("Processing directory " + rootdir + " ...")
    for subdir in os.listdir(rootdir):
        each_dir = os.path.join(rootdir, subdir)
        if os.path.isfile(each_dir) and each_dir.endswith(".txt"):
            try:
                # FIX: read the file once (it was previously opened twice —
                # once for the line count and again for the content).
                with open(each_dir, 'r') as f:
                    output = f.read()
                if len(output.splitlines()) <= 1:
                    print("Skipping empty file - " + each_dir)
                    continue
                parser = par.khat_parser()
                parser.parse(output)
                print("Processed raw data - " + each_dir)
                ret.append(parser)
            except Exception:
                # FIX: was a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit.
                print("Unrecognized format - " + subdir)
    print("")
    return ret
# Marker and color palettes, one entry consumed per affinity curve.
# NOTE(review): marker_map has 22 entries but color_map only 18 — with more
# than 18 affinities generate_graph would run off the end; verify.
marker_map = ["o", "P", "s", "v", "*", "+", "^", "1", "2", "d", "X", "o", "P", "s", "v", "*", "+", "^", "1", "2", "d", "X"]
color_map = ["xkcd:black", "xkcd:red", "xkcd:blue", "xkcd:green", "xkcd:cyan", "xkcd:purple", "xkcd:orange", "xkcd:salmon", "xkcd:lightgreen", "xkcd:indigo", "xkcd:brown", "xkcd:bubblegum", "xkcd:lavender", "xkcd:maroon", "xkcd:fern", "xkcd:sky", "xkcd:orchid", "xkcd:sienna"]
# Labels indexed the same way as khat_parser.get_stat_arr():
# 0=server hw, 1=server sw, 2=client hw, 3=client sw.
parser_idx_labels = ["srv_hw", "srv_sw", "clt_hw", "clt_sw"]
def add_curve(eax, label : str, qps_arr : [], lat_arr : [], marker : str, color : str):
    """Plot one latency-vs-QPS curve on axis `eax`, sorted by QPS."""
    frame = pd.DataFrame({'qps': qps_arr, 'lat': lat_arr}).sort_values('qps')
    eax.plot('qps', 'lat', data = frame, label=label, marker=marker, color=color, markersize=8)
# adds curves (avg and 99th percentile) for a specific parser idx
def add_curves(rax, label : str, parsers : [], parser_idx : int, marker : str, color : str):
    """rax[0] gets the mean curve, rax[1] the 99th-percentile curve."""
    qps_vals = []
    mean_vals = []
    tail_vals = []
    for p in parsers:
        qps_vals.append(p.qps)
        lat = list(p.get_stat_arr(parser_idx))
        mean_vals.append(np.mean(lat))
        tail_vals.append(np.percentile(lat, 99))
    add_curve(rax[0], label, qps_vals, mean_vals, marker, color)
    add_curve(rax[1], label, qps_vals, tail_vals, marker, color)
# generate the graphs for a parser index
def generate_graph(aff_to_parser : {}, parser_idx : int, fn : str):
    """Render two stacked log-scale subplots (average and p99 latency vs QPS),
    one curve per affinity in `aff_to_parser`, and save them to `fn`."""
    fig, rax = plt.subplots(2, 1)
    for axis, title in ((rax[0], "Average"), (rax[1], "99th percentile")):
        axis.set_yscale("log")
        axis.set_title(title)
        axis.set_xlabel("QPS")
        axis.set_ylabel("Latency (ns)")
        axis.xaxis.get_major_formatter().set_scientific(False)
        axis.yaxis.set_minor_formatter(ticker.ScalarFormatter())
    print("Generating graph => " + fn + "...")
    for idx, aff in enumerate(aff_to_parser):
        # each affinity gets a different marker type
        # FIX: wrap around the palettes instead of raising IndexError when
        # there are more affinities than palette entries (color_map has
        # fewer entries than marker_map).
        marker_type = marker_map[idx % len(marker_map)]
        color_type = color_map[idx % len(color_map)]
        print("    Processing affinity " + aff + "...")
        add_curves(rax, aff, aff_to_parser[aff], parser_idx, marker_type, color_type)
    rax[0].legend()
    rax[1].legend()
    fig.set_size_inches(23.4, 16.5)
    plt.savefig(fn, dpi=150)
    plt.close()
def main():
    """Walk the -d data directory; each subdirectory becomes one affinity
    series, then emit one graph per latency category."""
    datdir = None
    for opt, arg in getopt.getopt(sys.argv[1:], 'd:')[0]:
        if opt == '-d':
            datdir = arg
    if datdir is None:
        raise Exception("Must specify -d parameter")
    dat = {}
    for subdir in os.listdir(datdir):
        each_dir = os.path.join(datdir, subdir)
        if not os.path.isfile(each_dir):
            dat[subdir] = process_dir(each_dir)
    for i, lbl in enumerate(parser_idx_labels):
        generate_graph(dat, i, datdir + "/" + lbl)

if __name__ == "__main__":
    main()

105
scripts/histo.py Normal file
View File

@ -0,0 +1,105 @@
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import numpy as np
import sys
import re
import os
import json
import getopt
import math
import concurrent.futures as CF
import libpar as par
num_bins = 250
extra_pct = []

def saveplot(fp : str, data : [], title : str):
    """Save a histogram of `data` to <fp>_<title>_.png (A4 landscape, 160 dpi)."""
    plt.hist(data, num_bins)
    plt.title(title)
    plt.xlabel("Delay")
    plt.ylabel("Frequency")
    fig = plt.gcf()
    fig.set_size_inches(11.69, 8.27)
    out = fp + "_" + title + "_" + ".png"
    fig.savefig(out, dpi=160)
    plt.clf()
    print("Generated - " + out)
# Process pool created at import time. NOTE(review): nothing in the visible
# code submits work to it (process_dir calls process_file directly); main()
# only shuts it down — presumably left over from a parallel variant; verify.
executor = CF.ProcessPoolExecutor(max_workers=int(os.cpu_count()))
def clean_data(dat: []):
    """Return the samples of `dat` at or below its 99th percentile
    (drops the top-1% outliers)."""
    arr = np.array(dat)
    cutoff = np.percentile(arr, 99)
    return [v for v in arr if v <= cutoff]
def process_file(each_dir):
    # Parse one khat sample file, emit four delay histograms (server/client x
    # hw/sw) next to it, and write a mutilate-style stats summary. All delays
    # are tx-rx timestamp differences; top-1% outliers are trimmed first.
    try:
        print("Processing " + each_dir + " ...")
        with open(each_dir, 'r') as f:
            parser = par.khat_parser()
            parser.parse(f.read())
        sh = []  # server hardware delay per request
        ss = []  # server software delay per request
        ch = []  # client hardware delay per request
        cs = []  # client software delay per request
        for pt in parser.datapt:
            sh.append(pt.s_htx - pt.s_hrx)
            ss.append(pt.s_stx - pt.s_srx)
            ch.append(pt.c_hrx - pt.c_htx)
            cs.append(pt.c_srx - pt.c_stx)
        # drop >99th-percentile outliers before plotting
        sh = clean_data(sh)
        ss = clean_data(ss)
        ch = clean_data(ch)
        cs = clean_data(cs)
        saveplot(each_dir, sh, "server_hw_delay")
        saveplot(each_dir, ss, "server_sw_delay")
        saveplot(each_dir, ch, "client_hw_delay")
        saveplot(each_dir, cs, "client_sw_delay")
        # output median, etc.
        with open(each_dir + "_" + "stats.txt", 'w') as f:
            f.write("===================== SERVER HW ====================\n")
            f.write(par.mutilate_data.build_mut_output(sh, [len(sh)]))
            f.write("\n===================== SERVER SW ====================\n")
            f.write(par.mutilate_data.build_mut_output(ss, [len(ss)]))
            f.write("\n===================== CLIENT HW ====================\n")
            f.write(par.mutilate_data.build_mut_output(ch, [len(ch)]))
            f.write("\n===================== CLIENT SW ====================\n")
            f.write(par.mutilate_data.build_mut_output(cs, [len(cs)]))
    except Exception:
        # best-effort: report and continue with the next file
        print("Unexpected error:", sys.exc_info())
def process_dir(rootdir):
    """Recursively process every .txt/.sample file under rootdir."""
    for entry in os.listdir(rootdir):
        path = os.path.join(rootdir, entry)
        if not os.path.isfile(path):
            process_dir(path)
        elif path.endswith(".txt") or path.endswith(".sample"):
            process_file(path)
def main():
    """Entry point: -d selects the data directory to histogram."""
    datdir = None
    for opt, arg in getopt.getopt(sys.argv[1:], 'd:')[0]:
        if opt == '-d':
            datdir = arg
    if datdir is None:
        raise Exception("Must specify -d parameter")
    process_dir(datdir)
    executor.shutdown()

if __name__ == "__main__":
    main()

View File

@ -0,0 +1,25 @@
class NetSpec:
    """One NIC endpoint: hostname, IP, MAC, and the combined "ip@mac" spec
    string consumed by the benchmark tools."""
    def __init__(self, fqdn, ip, mac) -> None:
        self.fqdn = fqdn
        self.ip = ip
        self.mac = mac
        self.netspec = f"{ip}@{mac}"
class LabNetSpecs:
    # Static inventory of the lab machines' NIC endpoints on the
    # 192.168.123.0/24 test network. Empty mac strings are endpoints whose
    # MAC has not been recorded yet.
    def __init__(self) -> None:
        self.SKYLAKE1_10G = NetSpec(fqdn = "skylake1.rcs.uwaterloo.ca",ip = "192.168.123.11", mac = "3c:15:fb:62:9b:28")
        self.SKYLAKE2_10G = NetSpec(fqdn = "skylake2.rcs.uwaterloo.ca",ip = "192.168.123.12", mac = "3c:15:fb:c9:f3:36")
        self.SKYLAKE3_10G = NetSpec(fqdn = "skylake3.rcs.uwaterloo.ca",ip = "192.168.123.13", mac = "3c:15:fb:c9:f3:4b")
        self.SKYLAKE4_10G = NetSpec(fqdn = "skylake4.rcs.uwaterloo.ca",ip = "192.168.123.14", mac = "")
        self.SKYLAKE5_10G = NetSpec(fqdn = "skylake5.rcs.uwaterloo.ca",ip = "192.168.123.15", mac = "3c:15:fb:c9:f3:28")
        self.SKYLAKE6_10G = NetSpec(fqdn = "skylake6.rcs.uwaterloo.ca",ip = "192.168.123.16", mac = "3c:15:fb:62:9b:2f")
        self.SKYLAKE7_10G = NetSpec(fqdn = "skylake7.rcs.uwaterloo.ca",ip = "192.168.123.17", mac = "3c:15:fb:c9:f3:44")
        self.SKYLAKE8_10G = NetSpec(fqdn = "skylake8.rcs.uwaterloo.ca",ip = "192.168.123.18", mac = "3c:15:fb:62:9c:be")
        self.MILAN1_100G = NetSpec(fqdn = "milan1-int.rcs.uwaterloo.ca",ip = "192.168.123.19", mac = "")
        self.MILAN1_10G = NetSpec(fqdn = "milan1-int.rcs.uwaterloo.ca",ip = "192.168.123.19", mac = "a0:42:3f:4d:cb:bc")
        self.ICELAKE2_100G = NetSpec(fqdn = "icelake2-int.rcs.uwaterloo.ca",ip = "192.168.123.20", mac = "")
        self.ICELAKE2_10G = NetSpec(fqdn = "icelake2-int.rcs.uwaterloo.ca",ip = "192.168.123.20", mac = "")
# Module-level singleton; callers use e.g. mechspec.LAB.SKYLAKE1_10G.
LAB = LabNetSpecs()

196
scripts/libs/libpar.py Normal file
View File

@ -0,0 +1,196 @@
import json
import numpy as np
class iperf_json_parser:
    """Sums egress bandwidth (end.sum_sent.bits_per_second) across a set of
    iperf3 JSON outputs; keeps the decoded objects in self.jsonobjs."""
    def __init__(self, inputs):
        self.jsonobjs = []
        self.aggregate_egress_bps = 0
        for raw in inputs:
            obj = json.loads(raw)
            self.jsonobjs.append(obj)
            self.aggregate_egress_bps += obj['end']['sum_sent']['bits_per_second']
class memloadgen_parser:
    """Averages the per-line samples of memloadgen output over line range
    [min, max); max is clamped to the line count. Result in self.bps
    (presumably bytes/sec — confirm against memloadgen's output format)."""
    def __init__(self, input, min, max):
        rows = input.split('\n')
        if max > len(rows):
            max = len(rows)
        if len(rows) <= min:
            raise Exception("Not enough lines!")
        if min > max:
            min = max
        self.bps = np.mean([int(rows[idx]) for idx in range(min, max)])
class pmc_parser:
    """Parses pmcstat counting-mode text: a '# .../.../counter col...' spec
    line, then numeric rows; the last row's columns are summed into
    self.count. self.cores is the number of columns in the spec line."""
    def __init__(self, input):
        self.raw = input
        rows = input.split('\n')
        if len(rows) < 2:
            raise Exception("Invalid pmc file format")
        spec = rows[0].strip()
        if spec[0] != '#':
            raise Exception("Invalid pmc file spec line: \"" + rows[0] + "\"")
        cols = spec.split(' ')
        self.cores = len(cols) - 1
        parts = cols[1].split('/')
        if len(parts) != 3:
            raise Exception("Invalid pmc file spec line: \"" + rows[0] + "\"")
        self.counter = parts[2].strip()
        # total of the final sample row across all columns
        self.count = sum(int(tok) for tok in rows[-1].split(' ') if len(tok) > 0)
class khat_parser:
    """Parses khat sample output: one 5-field headline
    (qps, master recv/loss, slave recv/loss) followed by 8-field rows of
    per-request timestamps. Latency deltas are accumulated into four lists
    addressable by index via get_stat_arr()."""
    class pt:
        # One request's eight timestamps (client/server x hw/sw x tx/rx).
        def __init__(self):
            self.s_htx = 0
            self.s_hrx = 0
            self.s_stx = 0
            self.s_srx = 0
            self.c_htx = 0
            self.c_hrx = 0
            self.c_stx = 0
            self.c_srx = 0
            self.master_total = 0
            self.master_loss = 0
            self.slave_total = 0
            self.slave_loss = 0
            self.qps = 0
    def __init__(self):
        self.datapt = []
        self.srv_hwlat = []
        self.srv_swlat = []
        self.clt_hwlat = []
        self.clt_swlat = []
        # index order: 0=srv hw, 1=srv sw, 2=clt hw, 3=clt sw
        self.lat_idx_arr = [self.srv_hwlat, self.srv_swlat, self.clt_hwlat, self.clt_swlat]
    def get_stat_arr(self, idx : int):
        return self.lat_idx_arr[idx]
    def parse(self, output : str):
        for idx, row in enumerate(output.splitlines()):
            cells = row.split(',')
            if idx == 0:
                # the first line is qps
                if len(cells) != 5:
                    raise Exception("Invalid headline:" + row)
                self.qps = int(cells[0])
                self.master_recv = int(cells[1])
                self.master_loss = int(cells[2])
                self.slave_recv = int(cells[3])
                self.slave_loss = int(cells[4])
                continue
            if len(cells) != 8:
                raise Exception("Invalid line:" + row)
            rec = self.pt()
            (rec.c_srx, rec.c_stx, rec.c_hrx, rec.c_htx,
             rec.s_srx, rec.s_stx, rec.s_hrx, rec.s_htx) = [int(c) for c in cells]
            self.datapt.append(rec)
            self.srv_hwlat.append(rec.s_htx - rec.s_hrx)
            self.srv_swlat.append(rec.s_stx - rec.s_srx)
            self.clt_hwlat.append(rec.c_hrx - rec.c_htx)
            self.clt_swlat.append(rec.c_srx - rec.c_stx)
class mutilate_data:
    """Container + parsers for mutilate-style latency summaries."""
    def __init__(self):
        self.dat = {}   # percentile name -> latency value
        self.qps = 0
    def to_string(self):
        ret = "Throughput: " + str(self.qps) + "\n" + json.dumps(self.dat)
        return ret
    @staticmethod
    def parse_mut_output(output):
        """Parse mutilate's console output ("Total QPS" line + "read" row,
        current 10-column or legacy 9-column format)."""
        ret = mutilate_data()
        succ_qps = False
        succ_read = False
        table = [None, "avg", "std", "min", "5th", "10th", "50th", "90th", "95th", "99th"]
        table_legacy = [None, "avg", "std", "min", "5th", "10th", "90th", "95th", "99th"]
        for line in output.splitlines():
            if line.find("Total QPS") != -1:
                spl = line.split()
                if len(spl) == 7:
                    ret.qps = float(spl[3])
                    succ_qps = True
                else:
                    break
            elif line.find("read") != -1:
                spl = line.split()
                if len(spl) == 10:
                    for i in range(1, len(spl)):
                        ret.dat[table[i]] = float(spl[i])
                    succ_read = True
                elif len(spl) == 9:
                    # legacy format has no 50th-percentile column
                    for i in range(1, len(spl)):
                        ret.dat[table_legacy[i]] = float(spl[i])
                    succ_read = True
                else:
                    break
        if not (succ_qps and succ_read):
            raise Exception("Failed to parse data")
        return ret
    @staticmethod
    def parse_mut_sample(fn):
        """Read a two-column (qps, latency) sample file; returns (qps, lat) lists."""
        f = open(fn, "r")
        qps = []
        lat = []
        lines = f.readlines()
        for line in lines:
            entry = line.split()
            if len(entry) != 2:
                raise Exception("Unrecognized line: " + line)
            qps.append(float(entry[0]))
            lat.append(float(entry[1]))
        f.close()
        return qps, lat
    # generate mutilate output format
    @staticmethod
    def build_mut_output(lat_arr, qps_arr):
        """Format lat_arr's stats (mean/std/min/percentiles) and mean(qps_arr)
        as a mutilate-style text report."""
        output = '{0: <10}'.format('#type') + '{0: >10}'.format('avg') + '{0: >10}'.format('std') + \
            '{0: >10}'.format('min') + '{0: >10}'.format('5th') + '{0: >10}'.format('10th') + \
            '{0: >10}'.format('50th') + '{0: >10}'.format('90th') + '{0: >10}'.format('95th') + '{0: >10}'.format('99th') + "\n"
        output += '{0: <10}'.format('read') + '{0: >10}'.format("{:.1f}".format(np.mean(lat_arr))) + ' ' + \
            '{0: >10}'.format("{:.1f}".format(np.std(lat_arr))) + ' ' + \
            '{0: >10}'.format("{:.1f}".format(np.min(lat_arr))) + ' ' + \
            '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 5))) + ' ' + \
            '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 10))) + ' ' + \
            '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 50))) + ' ' + \
            '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 90))) + ' ' + \
            '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 95))) + ' ' + \
            '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 99))) + ' ' + "\n"
        # BUGFIX: the previous line ended with a stray "\" continuation that
        # fused it with the statement below into a SyntaxError.
        output += "\n" + "Total QPS = " + "{:.1f}".format(np.mean(qps_arr)) + " (0 / 0s)"
        return output

189
scripts/libs/libtc.py Normal file
View File

@ -0,0 +1,189 @@
import subprocess as sp
import time
import select
import os
import pwd
import sys
import datetime
import random
import re
from threading import Thread
# Run log file handle; None until init() opens it.
tc_logfile = None

def log_print(info):
    """Echo `info` to stdout and, once a logfile is open, append it there."""
    print(info)
    if tc_logfile is not None:
        tc_logfile.write(info + "\n")
        tc_logfile.flush()
# Test-harness state: output root, current test name, and running test id.
tc_output_dir = ""
tc_cur_test = ""
tc_test_id = 0

def init(odir = "./results.d/"):
    """Create the output directory tree and open the run log."""
    global tc_output_dir
    global tc_logfile
    tc_output_dir = os.path.expanduser(odir)
    os.system("mkdir -p " + tc_output_dir)
    tc_logfile = open(tc_output_dir + "/log.txt", "w+")
def begin(name):
    """Start a named test: bump the test id, create its output dir, log a banner."""
    global tc_test_id
    global tc_cur_test
    tc_cur_test = name
    tc_test_id += 1
    os.system("mkdir -p " + get_odir())
    log_print(f"\n===== Test #{tc_test_id} - {tc_cur_test} started =====")
def end():
    """Finish the current test: log a banner and clear the test name."""
    global tc_cur_test
    log_print(f"\n===== Test #{tc_test_id} - {tc_cur_test} completed =====")
    tc_cur_test = ""
def get_odir():
    """Output directory of the currently running test."""
    return "{}/{}".format(tc_output_dir, tc_cur_test)
# Scheduler selectors and feature bits (one byte each in the packed flag).
SCHED_QUEUE = 1
SCHED_CPU = 2
SCHED_BEST = 4
SCHED_FEAT_WS = 1

def make_sched_flag(sched, args, feat = 0, fargs = 0):
    """Pack scheduler / args / feature / feature-args into one 32-bit flag,
    one byte per field (sched lowest)."""
    packed = sched & 0xFF
    packed |= (args & 0xFF) << 8
    packed |= (feat & 0xFF) << 16
    packed |= (fargs & 0xFF) << 24
    return packed
# Tunable object ids for make_tune_flag.
TUNE_RTSHARE = 2
TUNE_TFREQ = 1

def make_tune_flag(obj, val):
    """Pack a tunable id (low 16 bits) and its value (high 16 bits)."""
    return ((val & 0xFFFF) << 16) | (obj & 0xFFFF)
def get_username():
    """Login name of the current real uid."""
    return pwd.getpwuid(os.getuid()).pw_name
# Extra flags passed to every ssh/scp invocation (e.g. "-p77").
ssh_param = ""

def set_ssh_param(para):
    """Set the extra ssh/scp command-line flags."""
    global ssh_param
    ssh_param = para

def get_ssh_param():
    """Current extra ssh/scp flags."""
    return ssh_param

# Remote username; None means use the current local user.
ssh_user = None

def set_ssh_user(user):
    """Set the remote ssh username (None = current user)."""
    global ssh_user
    ssh_user = user

def get_ssh_user():
    """Current remote ssh username, or None."""
    return ssh_user
def remote_exec(srv : list[str], cmd : str, blocking=True, check=True) -> list[sp.Popen]:
    # Run `cmd` via ssh on every host in `srv` (using the module-level
    # ssh_param / ssh_user). Returns the list of Popen handles; when
    # `blocking`, waits for all of them and (when `check`) raises on any
    # non-zero exit. NOTE(review): original annotation said -> sp.Popen but a
    # list is returned; corrected here.
    sub = []
    for s in srv:
        # shell=True so ssh_param flags are split by the shell; cmd is
        # double-quoted for the remote side.
        p = sp.Popen(["ssh " + ssh_param + " " + ((ssh_user + "@") if ssh_user != None else "") + s + " \"" + cmd + "\""], shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
        sub.append(p)
    if blocking:
        for p in sub:
            p.wait()
            if check and p.returncode != 0:
                raise Exception("Command failed " + cmd)
    return sub
def check_stderr(p, sel, exclude = []):# -> tuple[bool, list[str]]:
    """Drain up to 10 pending stderr lines from process `p` (readiness via
    poll object `sel`). Returns (ok, lines): ok is False iff any non-empty
    line matched none of the `exclude` substrings."""
    err = []
    budget = 10
    while sel.poll(1) and budget > 0:
        err.append(p.stderr.readline().decode().strip())
        budget -= 1
    good = True
    for line in err:
        line = line.strip()
        if len(line) == 0:
            continue
        # BUGFIX: previously `good` was reset per line, so an earlier
        # non-excluded error was forgotten whenever a later line matched an
        # exclude pattern. Once bad, stay bad.
        if not any(exc in line for exc in exclude):
            good = False
    return good, err
# stderr threads
# Background watchers that poll each remote process's stderr and latch a
# global failure flag. Shared mutable state, guarded only by the GIL:
errthr_objs = []        # Thread objects created by errthr_create()
errthr_sigstop = False  # set True by errthr_stop() to end the watch loops
errthr_failed = False   # latched True when any watcher sees an error

def errthr_get_failed():
    # True once any watcher has reported a non-excluded stderr line.
    return errthr_failed

def thr_check_stderr(p : sp.Popen, name: str, exclude):
    # Watch loop body: poll p's stderr until errthr_sigstop; on the first
    # failure, latch the global flag and log the offending lines, then keep
    # draining (local_failed stops re-reporting).
    global errthr_failed
    sel = select.poll()
    sel.register(p.stderr, select.POLLIN)
    local_failed = False
    while(not errthr_sigstop):
        if (not local_failed):
            status, err = check_stderr(p, sel, exclude=exclude)
            if not status:
                errthr_failed = True
                local_failed = True
                log_print("Error detected in \"" + name + "\":")
                for e in err:
                    log_print(" \"" + e + "\"")
                log_print("")
        # jittered sleep so watchers don't poll in lockstep
        time.sleep(random.uniform(0.001, 0.1))

def errthr_start():
    # Reset the stop/failed flags and start every created watcher as a daemon.
    global errthr_sigstop
    global errthr_failed
    errthr_sigstop = False
    errthr_failed = False
    for thr in errthr_objs:
        thr.daemon = True
        thr.start()

def errthr_create(cp, name, exclude = None):
    # Queue one watcher per process in cp (name[i] labels cp[i] in logs).
    global errthr_objs
    for i in range(len(cp)):
        errthr_objs.append(Thread(target = thr_check_stderr, args=(cp[i], name[i], exclude)))

def errthr_stop():
    # Signal all watchers to exit, join them, and clear the registry.
    global errthr_objs
    global errthr_sigstop
    errthr_sigstop = True
    for thr in errthr_objs:
        thr.join()
    errthr_objs.clear()
def parse_hostfile(fp):
    """Read "name addr" pairs (first two whitespace-split columns per line)
    from file `fp` into a dict; lines with fewer than two columns are skipped."""
    ret = {}
    with open(fp, "r") as fh:
        rows = [r.strip() for r in fh.readlines()]
    for row in rows:
        cols = row.split(" ")
        if len(cols) >= 2:
            ret[cols[0]] = cols[1]
            log_print("Parsed: hostname \"" + cols[0] + "\" -> \"" + cols[1] + "\"")
    return ret
def process_hostnames(names, hosts):
    """Map each name through the `hosts` dict, passing unknown names through."""
    return [hosts.get(n, n) for n in names]
def get_cpuset_core(threads):
    """Command prefix pinning to cores 0..(2*threads - 1), trailing space included."""
    return f"cpuset -l 0-{threads * 2 - 1} "

340
scripts/netexp.py Normal file
View File

@ -0,0 +1,340 @@
import time
import subprocess as sp
import os
import libpar as par
import libtc as tc
import libmechspec as mechspec
class NetExpResult:
    """Artifacts of one experiment run, filled in by __keep_result()."""
    def __init__(self):
        self.parser = None      # khat_parser over the sample file
        self.pmc_parser = None  # pmc_parser (counting) or [raw bytes, processed text] (sampling)
        self.sample = None      # raw sample file text
class NetExpConf:
    """Every knob for one network experiment: server, clients, master, the
    optional memory load generator, and PMC counter collection."""
    def __init__(self):
        # general
        self.root_dir = ""
        self.enable_client_only = False
        # memloadgen
        self.enable_memgen = False
        self.memgen_affinity = ""
        self.memgen_iteration = -1
        self.memgen_size = 512 * 1024 * 1024
        self.memgen_tgtdom = 1
        # server
        self.srv_affinity = ""
        self.srv_mechspec = None
        self.srv_port = 0
        # clients
        self.clt_qps = 0
        self.clt_mechspecs = []
        self.clt_affinity = "1"
        self.clt_wrkld = 0
        self.clt_wrkarg0 = "fixed:0"
        self.clt_wrkarg1 = "fixed:0"
        self.clt_pkt_loss_lat = 1000
        self.clt_rage_quit_lat = 1000
        self.clt_port = 0
        self.clt_pkt_pad = 0
        self.clt_pkt_depth = 1
        self.clt_ia = "exponential"
        # master
        self.mst_mechspec = None
        self.mst_affinity = "2"
        self.mst_qps = 100
        self.mst_port = 0
        self.mst_pkt_loss_lat = 1000
        self.mst_pkt_loss_max = 1000
        self.mst_duration = 10
        self.mst_warmup = 5
        self.mst_ia = "exponential"
        # pmc
        self.enable_pmc = False
        self.pmc_counters = []
        self.pmc_mode = 0  # 0 = sampling, otherwise counting
        self.pmc_sampling_rate = 8192
        self.pmc_counting_interval = 0.1

    def __build_fqdn_arr(self, ns):
        # keep fqdns, dropping None placeholders
        return [entry.fqdn for entry in ns if entry is not None]

    def get_pmc_str(self):
        """Counter names comma-joined ("" when the list is empty)."""
        return ",".join(self.pmc_counters)

    def calc_client_qps(self):
        """Per-client QPS share after subtracting the master's QPS;
        0 means run unthrottled."""
        if self.clt_qps == 0:
            return 0
        return int((self.clt_qps - self.mst_qps) / len(self.clt_mechspecs))

    def finalize_mechspecs(self):
        """Cache the client/server/master fqdn lists once mechspecs are set."""
        self.clt_fqdns = self.__build_fqdn_arr(self.clt_mechspecs)
        self.srv_fqdns = self.__build_fqdn_arr([self.srv_mechspec])
        self.mst_fqdns = self.__build_fqdn_arr([self.mst_mechspec])
# Temp filenames produced by the remote tools and fetched back over scp.
__SAMPLE_FN = "sample.txt.tmp"
__PMC_FN = "pmc.txt.tmp"

def __keep_result(conf : NetExpConf):
    # Collect one run's outputs: scp the sample file from the master, parse
    # it, and (when PMC is on) fetch the counter data from the server —
    # post-processing sampling-mode data with pmcstat first. Temp files are
    # removed after parsing. Returns a populated NetExpResult.
    result = NetExpResult()
    target_scp_fn = tc.get_odir() + "/" + __SAMPLE_FN
    scpcmd = "scp -P77 " + tc.get_ssh_user() + "@" + conf.mst_mechspec.fqdn + ":" + conf.root_dir + "/" + __SAMPLE_FN + " " + target_scp_fn
    tc.log_print(scpcmd)
    sp.check_call(scpcmd, shell=True)
    result.parser = par.khat_parser()
    with open(target_scp_fn, "r") as f:
        result.sample = f.read()
    result.parser.parse(result.sample)
    rmcmd = "rm " + target_scp_fn
    tc.log_print(rmcmd)
    sp.check_call(rmcmd, shell=True)
    if conf.enable_pmc:
        target_pmc_fn = tc.get_odir() + "/" + __PMC_FN
        pmcscpcmd = "scp -P77 " + tc.get_ssh_user() + "@" + conf.srv_mechspec.fqdn + ":" + conf.root_dir + "/" + __PMC_FN + " " + target_pmc_fn
        tc.log_print(pmcscpcmd)
        sp.check_call(pmcscpcmd, shell=True)
        if conf.pmc_mode == 0:
            # sampling mode: convert the raw pmcstat log on the server, then
            # fetch the processed text alongside the raw file
            pmcproccmd = "sudo pmcstat -R " + conf.root_dir + "/" + __PMC_FN + " -m " + conf.root_dir + "/" + __PMC_FN + ".proc"
            tc.log_print(pmcproccmd)
            tc.remote_exec(conf.srv_fqdns, pmcproccmd)
            pmcscpcmd = "scp -P77 " + tc.get_ssh_user() + "@" + conf.srv_mechspec.fqdn + ":" + conf.root_dir + "/" + __PMC_FN + ".proc" + " " + target_pmc_fn + ".proc"
            tc.log_print(pmcscpcmd)
            sp.check_call(pmcscpcmd, shell=True)
        if conf.pmc_mode != 0:
            # counting mode: parse the text output directly
            with open(target_pmc_fn, "r") as f:
                result.pmc_parser = par.pmc_parser(f.read())
        else:
            # sampling mode: keep [raw bytes, processed text]
            with open(target_pmc_fn, "rb") as f:
                with open(target_pmc_fn + ".proc", "r") as g:
                    result.pmc_parser = [f.read(), g.read()]
            rmcmd = "rm " + target_pmc_fn + ".proc"
            tc.log_print(rmcmd)
            sp.check_call(rmcmd, shell=True)
        rmcmd = "rm " + target_pmc_fn
        tc.log_print(rmcmd)
        sp.check_call(rmcmd, shell=True)
    return result
def stop_all(conf : NetExpConf):
    """Force-kill every benchmark process on all hosts; failures are ignored
    (check=False) since the processes may not be running."""
    kill_bench = "sudo killall -9 rat; sudo killall -9 cat; sudo killall -9 khat; sudo killall -9 memloadgen"
    # stop clients
    tc.log_print("Stopping clients...")
    tc.remote_exec(conf.clt_fqdns, kill_bench, check=False)
    # stop master
    tc.log_print("Stopping master...")
    tc.remote_exec(conf.mst_fqdns, kill_bench, check=False)
    if not conf.enable_client_only:
        # stop server (and its PMC collector, which only runs with the server)
        tc.log_print("Stopping server...")
        tc.remote_exec(conf.srv_fqdns, kill_bench, check=False)
        if conf.enable_pmc:
            tc.log_print("Stopping server PMC...")
            tc.remote_exec(conf.srv_fqdns, "sudo killall -9 pmcstat", check=False)
def __run_setup_cmd(conf : NetExpConf, cmd : str, desc : str):
    """Run `cmd` on every host (server, clients, master) in parallel and
    report per-host success/failure on stdout."""
    hosts = []
    hosts.extend(conf.srv_fqdns)
    hosts.extend(conf.clt_fqdns)
    hosts.extend(conf.mst_fqdns)
    procs : list[tuple[str, sp.Popen]] = []
    # launch non-blocking on every host first, then reap them all
    for host in hosts:
        tc.log_print(f"Running \'{desc}\' on {host}...")
        procs.append((host, tc.remote_exec([host], cmd, blocking=False, check=False)[0]))
    for host, proc in procs:
        _, stderr = proc.communicate()
        if proc.returncode != 0:
            print(f"{host} \'{desc}\' failed. stderr:\n{stderr.decode()}\n")
        else:
            print(f"{host} \'{desc}\' succeeded")
# One-time machine provisioning. When `dpdk` is set, rebuild and install
# libtopo and the numam DPDK fork from source on every host; when `bench` is
# set, rsync this source tree to every host and compile the benchmark
# binaries (khat/cat/rat/memloadgen). The shell payloads below are sent
# verbatim over ssh.
def setup(conf : NetExpConf, bench : bool, dpdk : bool):
    libtopo_path = "/libtopo"
    dpdk_path = "/dpdk"
    bench_path = "/numam.d"
    if dpdk:
        # build & install libtopo from source
        setup_cmd = f'''sudo rm -rf {libtopo_path}; sudo rm -rf /usr/local/include/libtopo;
sudo rm -rf /usr/local/lib/libtopo;
sudo mkdir -p {libtopo_path};
sudo chmod 777 {libtopo_path};
cd {libtopo_path};
git clone https://git.quacker.org/d/libtopo;
cd libtopo;
mkdir build;
cd build;
cmake ../;
sudo make install'''
        __run_setup_cmd(conf, setup_cmd, "dpdk - libtopo")
        # build & install the patched DPDK (migration branch)
        setup_cmd = f'''sudo pkg install -y meson pkgconf py39-pyelftools;
sudo rm -rf {dpdk_path}
sudo mkdir -p {dpdk_path};
sudo chmod 777 {dpdk_path};
cd {dpdk_path};
git clone https://git.quacker.org/d/numam-dpdk;
cd numam-dpdk;
git checkout migration;
CC=gcc CXX=g++ meson -Denable_kmods=true build;
cd build;
sudo ninja install'''
        __run_setup_cmd(conf, setup_cmd, "dpdk - dpdk")
    if bench:
        # wipe the remote tree, then rsync the local source to each host
        setup_cmd = f'''sudo rm -rf {bench_path};
sudo mkdir -p {bench_path};
sudo chmod 777 {bench_path}'''
        __run_setup_cmd(conf, setup_cmd, "bench - remove")
        all = []
        all.extend(conf.srv_fqdns)
        all.extend(conf.clt_fqdns)
        all.extend(conf.mst_fqdns)
        # sync the repository root (parent of this scripts/ directory)
        dir = f"{os.path.dirname(__file__)}/../"
        for clt in all:
            print("Syncing files to " + clt + "...")
            rsync_cmd = f"rsync -az --no-perms --rsync-path=\"sudo rsync\" --omit-dir-times -e \"ssh -p77\" {dir} {tc.get_ssh_user()}@{clt}:{bench_path}/"
            sp.check_call(rsync_cmd, shell=True)
        # compile the benchmark binaries on every host
        setup_cmd = f'''cd {bench_path};
sudo rm -rf build;
mkdir build;
cd build;
cmake ../;
make -j8 khat cat rat memloadgen'''
        __run_setup_cmd(conf, setup_cmd, "bench - compile")
# Run one experiment: start (optionally) PMC + server + memloadgen, then the
# clients, then the master; monitor stderr of every process and wait until
# the master exits (success) or a failure/timeout occurs. Retries the whole
# sequence until the master completes, then returns the parsed result.
def run(conf : NetExpConf):
    stop_all(conf)
    while True:
        server_cmd = "sudo "
        if conf.enable_pmc:
            # pmc_mode != 0 -> counting (-C), 0 -> sampling (-S)
            if conf.pmc_mode != 0:
                pmc_cmd = "sudo pmcstat -C -w " + str(conf.pmc_counting_interval) + " -s " + conf.get_pmc_str() + " -o " + conf.root_dir + "/" + __PMC_FN
            else:
                pmc_cmd = "sudo pmcstat -n " + str(conf.pmc_sampling_rate) + " -S " + conf.get_pmc_str() + " -O " + conf.root_dir + "/" + __PMC_FN
            tc.log_print("Starting server PMC...")
            tc.log_print(pmc_cmd)
            spmc = tc.remote_exec(conf.srv_fqdns, pmc_cmd, blocking=False)
        server_cmd += conf.root_dir + "/khat --log-level lib.eal:err -- -A " + conf.srv_affinity + \
            " -H " + conf.srv_mechspec.netspec + " -p " + str(conf.srv_port)
        # jumbo frames needed when padded packets exceed standard MTU frames
        if int(conf.clt_pkt_pad) > 1518:
            server_cmd += " -J "
        if conf.enable_client_only:
            ssrv = None
            tc.log_print(server_cmd)
        else:
            # start server
            tc.log_print("Starting server...")
            tc.log_print(server_cmd)
            ssrv = tc.remote_exec(conf.srv_fqdns, server_cmd, blocking=False)
        if conf.enable_memgen:
            memgen_cmd = "sudo " + conf.root_dir + "/memloadgen -b " + str(conf.memgen_size) + " -s " + conf.memgen_affinity + \
                " -i " + str(conf.memgen_iteration) + " -d " + str(conf.memgen_tgtdom)
            tc.log_print("Starting memloadgen...")
            tc.log_print(memgen_cmd)
            smem = tc.remote_exec(conf.srv_fqdns, memgen_cmd, blocking=False)
        # start clients
        tc.log_print("Starting clients...")
        sclt = []
        sclt_name = []
        for i in range(len(conf.clt_fqdns)):
            client_cmd = "sudo " + conf.root_dir + "/rat --log-level lib.eal:err -- -S -A " + conf.clt_affinity + \
                " -i " + conf.clt_ia + \
                " -q " + str(conf.calc_client_qps()) + \
                " -H " + conf.clt_mechspecs[i].netspec + \
                " -s " + conf.srv_mechspec.netspec + \
                " -r " + str(conf.clt_rage_quit_lat) + \
                " -l " + str(conf.clt_pkt_loss_lat) + \
                " -w " + str(conf.clt_wrkld) + \
                " -w " + str(conf.clt_wrkarg0) + \
                " -w " + str(conf.clt_wrkarg1) + \
                " -P " + str(conf.clt_pkt_pad) + \
                " -D " + str(conf.clt_pkt_depth) + \
                " -p " + str(conf.clt_port)
            if int(conf.clt_pkt_pad) > 1518:
                client_cmd += " -J "
            tc.log_print(client_cmd)
            sclt.append(tc.remote_exec([conf.clt_fqdns[i]], client_cmd, blocking=False)[0])
            sclt_name.append(conf.clt_fqdns[i])
        # give server/clients time to come up before the master starts
        time.sleep(5)
        # start master
        tc.remote_exec  # NOTE(review): stray no-op expression -- kept as-is
        tc.log_print("Starting master...")
        master_cmd = "sudo " + conf.root_dir + "/cat --log-level lib.eal:err -- " + \
            " -s " + conf.srv_mechspec.netspec + \
            " -o " + conf.root_dir + "/" + __SAMPLE_FN + \
            " -t " + str(conf.mst_duration) + \
            " -T " + str(conf.mst_warmup) + \
            " -i " + conf.mst_ia + \
            " -q " + str(conf.mst_qps) + \
            " -l " + str(conf.mst_pkt_loss_lat) + \
            " -L " + str(conf.mst_pkt_loss_max) + \
            " -A " + conf.mst_affinity + \
            " -H " + conf.mst_mechspec.netspec + \
            " -p " + str(conf.mst_port)
        for clt in conf.clt_mechspecs:
            master_cmd += " -S " + clt.netspec
        tc.log_print(master_cmd)
        # NOTE(review): `sp` shadows the module-level `subprocess as sp` alias
        # for the remainder of this function -- kept as-is
        sp = tc.remote_exec(conf.mst_fqdns, master_cmd, blocking=False)
        p = sp[0]
        # launch stderr monitoring thread
        exclude = ["Pseudo-terminal", "ice_", "i40e_"]
        tc.errthr_create([p], conf.mst_fqdns, exclude)
        if not conf.enable_client_only:
            tc.errthr_create(ssrv, conf.srv_fqdns, exclude)
        tc.errthr_create(sclt, sclt_name, exclude)
        if conf.enable_memgen:
            tc.errthr_create(smem, ["memloadgen"], exclude)
        if conf.enable_pmc:
            tc.errthr_create(spmc, ["pmcstat"], exclude)
        tc.errthr_start()
        success = False
        cur = 0
        # selec = select.poll()
        # selec.register(p.stdout, select.POLLIN)
        while True:
            # either failed or timeout
            # we use failure detection to save time for long durations
            if tc.errthr_get_failed() or cur >= (conf.mst_warmup + conf.mst_duration) * 3:
                break
            # while selec.poll(1):
            #     print(p.stdout.readline())
            if p.poll() != None:
                success = True
                break
            time.sleep(1)
            cur = cur + 1
        stop_all(conf)
        tc.errthr_stop()
        tc.log_print("Cooling down...")
        time.sleep(5)
        if success:
            return __keep_result(conf)

112
scripts/storage/parse.py Normal file
View File

@ -0,0 +1,112 @@
#!/usr/bin/env python3.6
import numpy as np
import sys
import re
import os
import json
import getopt
import math
import concurrent.futures as CF
# Output columns: (CSV header label, DatObj attribute name, format spec).
columns = [
    ("Req per second", "rps", ".2f"),
    ("Bytes per second", "bps", ".2f"),
    ("Average Latency", "lat_avg", ".2f"),
    ("50th Latency", "lat_50", ".0f"),
    ("95th Latency", "lat_95", ".0f"),
    ("99th Latency", "lat_99", ".0f"),
    ("Latency stddev", "lat_std", ".2f")
]
# Benchmark run time in seconds, used to derive request/byte rates.
TIME = 30
# Size of each IO request in bytes.
REQ_SZ = 4096
class DatObj:
    """Summary statistics over one benchmark's raw latency samples."""

    def __init__(self, raw : list, time : int, req_sz : int):
        self.raw = raw
        # throughput derived from sample count, run time and request size
        self.rps = len(raw) / time
        self.bps = self.rps * req_sz
        # latency statistics (same unit as the raw samples)
        self.lat_avg = np.average(self.raw)
        self.lat_50 = np.percentile(self.raw, 50)
        self.lat_95 = np.percentile(self.raw, 95)
        self.lat_99 = np.percentile(self.raw, 99)
        self.lat_std = np.std(self.raw)
def parse_file(lines : list, time : int, req_sz : int) -> DatObj :
    """Parse raw latency lines (one integer per line) into a DatObj.

    Lines are stripped before the emptiness check: previously a line holding
    only a newline (e.g. a trailing blank line from readlines()) passed the
    `len(line) > 0` test and crashed int().
    """
    raw = []
    for line in lines:
        stripped = line.strip()
        if len(stripped) > 0:
            raw.append(int(stripped))
    return DatObj(raw, time, req_sz)
def output_col():
    """Build the CSV header row: "Benchmark" plus, for each metric, the
    baseline column, the NUMA column and a percent-change column."""
    cells = ["Benchmark"]
    for label, _, _ in columns:
        cells.append(label)
        cells.append(label + " (NUMA)")
        cells.append("% change")
    return ",".join(cells)
def get_attr_or_none(obj, attr):
    """Return getattr(obj, attr), or None when obj itself is None."""
    # `is None` instead of the original `!= None` equality test (PEP 8);
    # identity check is also immune to custom __eq__ implementations.
    return None if obj is None else getattr(obj, attr)
def output_objs(name: str, obj : DatObj, obj_numa : DatObj):
    """Render one CSV row: the benchmark name, then for each metric the
    baseline value, the NUMA value, and the percent change relative to the
    baseline ("N/A" wherever a side is missing)."""
    ret = name
    for _, attr, fmt in columns:
        val = get_attr_or_none(obj, attr)
        val_numa = get_attr_or_none(obj_numa, attr)
        # idiom fix: identity comparison with None (was ==/!= None)
        ret = ret + "," + (format(val, fmt) if val is not None else "N/A")
        ret = ret + "," + (format(val_numa, fmt) if val_numa is not None else "N/A")
        if val is None or val_numa is None:
            ret = ret + "," + "N/A"
        else:
            ret = ret + "," + format((val_numa - val) / val * 100, ".2f") + "%"
    return ret
def process_file(f : str, obj_map):
    """Parse a single result file and store its DatObj in obj_map, keyed by
    the file's basename (which doubles as the benchmark name)."""
    with open(f, "r") as fp:
        content = fp.readlines()
    bench_name = os.path.basename(f)
    obj_map[bench_name] = parse_file(content, TIME, REQ_SZ)
    print("Processed file " + f + ". Benchmark name: " + bench_name)
def process_dir(path : str, obj_map):
    """Process every regular file directly under `path`, skipping anything
    whose (absolute) path contains ".sh"."""
    for entry in os.listdir(path):
        full_path = os.path.abspath(os.path.join(path, entry))
        if ".sh" in full_path:
            continue
        if os.path.isfile(full_path):
            process_file(full_path, obj_map)
# Entry point: parse -d <data directory>, summarize every result file in it,
# and write results.csv pairing each benchmark with its "_numa" variant.
def main():
    datdir = None
    options = getopt.getopt(sys.argv[1:], 'd:')[0]
    for opt, arg in options:
        if opt in ('-d'):
            datdir = arg
    if datdir == None:
        raise Exception("Must specify -d parameter")
    obj_map = dict()
    process_dir(datdir, obj_map)
    with open("results.csv", "w") as f:
        f.write(output_col())
        f.write("\n")
        for bench in obj_map:
            # "_numa" runs are emitted next to their baseline, not as rows
            if bench.endswith("_numa"):
                continue
            f.write(output_objs(bench, obj_map[bench], obj_map.get(bench+"_numa")))
            f.write("\n")
# Run the CLI entry point when executed as a script.
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,19 @@
# birb_posix benchmark sweep on /dev/nvd0. Each access pattern runs twice:
# local worker placement (-a 0x555555) and the high-core placement whose
# output is suffixed _numa (-a 0x555555000000). 35 s total, 5 s warmup,
# 4 KiB requests, queue depth 3, fixed inter-arrival, unlimited rate (-q 0).
# rand_read
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P R,100 -Q 3 -o rand_read
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P R,100 -Q 3 -o rand_read_numa
# rand_write
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P R,0 -Q 3 -o rand_write
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P R,0 -Q 3 -o rand_write_numa
# mono_read
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P M,100 -Q 3 -o mono_read
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P M,100 -Q 3 -o mono_read_numa
# mono_write
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P M,0 -Q 3 -o mono_write
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P M,0 -Q 3 -o mono_write_numa
# mixed
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P R,70 -Q 3 -o mixed_read
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P R,70 -Q 3 -o mixed_read_numa

View File

@ -0,0 +1,19 @@
# birb (SPDK bdev backend, -k bdev) benchmark sweep on Nvme0n1. Each access
# pattern runs twice: local placement (-m 0xAAAAAA / -a 0x555555) and the
# high-core placement suffixed _numa (-m 0xAAAAAA000000 / -a 0x555555000000).
# 35 s total, 5 s warmup, 4 KiB requests, queue depth 3, unlimited rate.
# rand_read
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,100 -Q 3 -o rand_read -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,100 -Q 3 -o rand_read_numa -k bdev
# rand_write
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,0 -Q 3 -o rand_write -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,0 -Q 3 -o rand_write_numa -k bdev
# mono_read
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P M,100 -Q 3 -o mono_read -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P M,100 -Q 3 -o mono_read_numa -k bdev
# mono_write
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P M,0 -Q 3 -o mono_write -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P M,0 -Q 3 -o mono_write_numa -k bdev
# mixed
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,70 -Q 3 -o mixed_read -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,70 -Q 3 -o mixed_read_numa -k bdev

View File

@ -0,0 +1,19 @@
# birb (SPDK bdev backend, -k bdev) benchmark sweep on Nvme0n1 -- same matrix
# as the sibling script: each pattern runs with local placement and with the
# high-core placement suffixed _numa. 35 s total, 5 s warmup, 4 KiB requests,
# queue depth 3, unlimited rate (-q 0).
# rand_read
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,100 -Q 3 -o rand_read -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,100 -Q 3 -o rand_read_numa -k bdev
# rand_write
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,0 -Q 3 -o rand_write -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,0 -Q 3 -o rand_write_numa -k bdev
# mono_read
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P M,100 -Q 3 -o mono_read -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P M,100 -Q 3 -o mono_read_numa -k bdev
# mono_write
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P M,0 -Q 3 -o mono_write -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P M,0 -Q 3 -o mono_write_numa -k bdev
# mixed
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,70 -Q 3 -o mixed_read -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,70 -Q 3 -o mixed_read_numa -k bdev

797
storage/birb.cc Normal file
View File

@ -0,0 +1,797 @@
#include <sys/endian.h>
#include <sys/errno.h>
#include <sys/types.h>
#include <x86/_stdint.h>
#include <getopt.h>
#include <pthread.h>
#include <pthread_np.h>
#include <threads.h>
#include <unistd.h>
#include <cerrno>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <chrono>
#include <list>
#include <set>
#include "rte_lcore.h"
#include "spdk/cpuset.h"
#include "spdk/stdinc.h"
#include "spdk/thread.h"
#include "spdk/env.h"
#include "spdk/event.h"
#include "spdk/log.h"
#include "spdk/string.h"
#include "gen.hh"
#include "ntr.h"
#include "defs.hh"
#include "nm.hh"
#include "storage/io_gen.hh"
#include "storage/drivers/driver.hh"
#include "storage/drivers/bdev.hh"
#include "storage/drivers/nvme.hh"
/* Current timestamp in nanoseconds since the high_resolution_clock epoch. */
static inline uint64_t get_cur_ts_nano()
{
	using std::chrono::duration_cast;
	using std::chrono::high_resolution_clock;
	using std::chrono::nanoseconds;

	const auto since_epoch = high_resolution_clock::now().time_since_epoch();
	return duration_cast<nanoseconds>(since_epoch).count();
}
/*
 * Global benchmark options, populated from the command line by parse_arg().
 */
static constexpr unsigned long MAX_SPEC_LEN = 32;
static constexpr unsigned long MAX_DEV_NAME_LEN = 32;
static constexpr unsigned long MAX_OUTPUT_FILE_LEN = 256;
struct options_t {
	// args
	int verbosity = NTR_LEVEL_DEFAULT;
	int num_threads = 1;			// derived: popcount of cpumask (-a)
	unsigned long cpumask = 1;		// worker core bitmask (-a, hex)
	char pattern_spec[MAX_SPEC_LEN] = "R,100";	// "<R|M>,<read pct>" (-P)
	char ia_spec[MAX_SPEC_LEN] = "fixed";		// inter-arrival dist (-I)
	unsigned int time = 5;			// total run time, seconds (-t)
	unsigned int warmup = 2;		// warmup time, seconds (-w)
	unsigned int queue_depth = 1;		// per-thread outstanding IOs (-Q)
	char dev_name[MAX_DEV_NAME_LEN] = "Malloc0";	// device name (-D)
	char driver_name[MAX_DEV_NAME_LEN] = "bdev";	// backend driver (-k)
	unsigned int read_pct = 0;		// derived from pattern_spec
	io_generator_address_mode addr_mode = IOGEN_ADDR_UNIFORM_RANDOM;	// derived from pattern_spec
	char output_file[MAX_OUTPUT_FILE_LEN] = "output.txt";	// latency output (-o)
	unsigned long req_size = 4096;		// request size in bytes (-b)
	unsigned long rps = 0;			// target requests per second (-q)
};
/* Counters updated by the main thread's SPDK message callbacks. */
struct main_thread_cb_vars {
	uint32_t worker_thread_init_cnt;	// workers that completed init
	uint32_t worker_thread_stop_cnt;	// workers that stopped
};
/* State updated by each worker thread's SPDK message callbacks. */
struct worker_thread_cb_vars {
	uint32_t worker_start;	// set to 1 by cb_notify_worker_start
	uint32_t worker_stop;	// set to 1 by cb_notify_worker_stop
	struct thread_context * ctx;
	std::list<struct io_request *> * free_ios;	// reusable io_request pool
};
/* Per-thread pointer to that thread's cb-vars struct (main or worker). */
static __thread void * cb_vars;
static struct options_t options;
/* Start/end timestamps (ns) of one completed IO, kept for latency output. */
struct io_record {
	uint64_t start_ts;
	uint64_t end_ts;
};
/* One in-flight IO slot: issue timestamp, opcode, and its two buffers. */
struct io_request {
	uint64_t start_ts;
	io_generator_opcode op;
	char * user_buf;	// regular allocation; read data is copied here
	char * dma_buf;		// DMA-able buffer handed to the driver
};
/*
 * Per-worker-thread state. The fields above the marker comment are filled in
 * by the main thread before the worker starts.
 */
struct thread_context {
	unsigned int tid;
	unsigned int coreid;	// CPU core this worker is pinned to
	unsigned int sockid;	// NUMA socket of that core
	pthread_t sys_thread;
	struct spdk_thread * main_thread;	// target for spdk_thread_send_msg
	birb_driver * driver;
	unsigned long start_region_offset;	// this thread's device slice start
	unsigned long start_region_length;	// this thread's device slice length
	/* modified by worker threads */
	struct spdk_thread * sp_thread;
	std::list<io_record *> *io_records;	// completed IO latency records
	uint64_t overhead_avg;	// running average of main-loop iteration gap (ns)
	uint32_t overhead_cnt;
	uint64_t overhead_max;
	uint64_t overhead_min;
};
/* Log the fully-resolved option set at INFO level. */
static void dump_options()
{
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: Options:\n"
	    "        dev name: %s\n"
	    "        driver name: %s\n"
	    "        worker threads: 0x%lx\n"
	    "        number of threads: %d\n"
	    "        IO request size: %lu\n"
	    "        IO requests per second: %lu\n"
	    "        IO pattern: %s\n"
	    "        IO queue depth: %d\n"
	    "        IO addressing mode: %d\n"
	    "        read percent: %u\n"
	    "        inter-arrival dist: %s\n"
	    "        run time: %d\n"
	    "        warmup time: %d\n"
	    "        output file: %s\n",
	    options.dev_name,
	    options.driver_name,
	    options.cpumask,
	    options.num_threads,
	    options.req_size,
	    options.rps,
	    options.pattern_spec,
	    options.queue_depth,
	    options.addr_mode,
	    options.read_pct,
	    options.ia_spec,
	    options.time,
	    options.warmup,
	    options.output_file
	);
}
/* Print command-line help to stdout. */
static void usage()
{
	fprintf(stdout,
	    "   -V(VV): verbose mode\n"
	    "   -D: dev name\n"
	    "   -k: driver to use (default bdev)\n"
	    "   -a: worker threads spec (0x3 = spawn 2 threads on core 1 & 2)\n"
	    "   -b: IO request size\n"
	    "   -q: IO requests per second\n"
	    "   -P: IO request pattern\n"
	    "   -Q: IO request queue depth\n"
	    "   -I: inter-arrival time distribution\n"
	    "   -t: total run time\n"
	    "   -w: warm up time\n"
	    "   -o: latency response output file\n");
}
/*
 * Handle a single getopt option character. Returns 0 on success, EINVAL on
 * an unknown option (the caller should then print usage and exit).
 *
 * Fix: every strncpy() is now bounded to size-1 with an explicit NUL so an
 * over-long argument can no longer leave the buffer unterminated
 * (CERT STR32-C).
 *
 * NOTE(review): most cases read the global `optarg` directly instead of the
 * `arg` parameter -- preserved as-is since callers pass optarg anyway.
 */
static int parse_arg(int c, char *arg)
{
	switch (c) {
	case 'V':
		ntr_set_level(NTR_DEP_USER1,
		    ntr_get_level(NTR_DEP_USER1) + 1);
		break;
	case 'D':
		strncpy(options.dev_name, arg, MAX_DEV_NAME_LEN - 1);
		options.dev_name[MAX_DEV_NAME_LEN - 1] = '\0';
		break;
	case 'k':
		strncpy(options.driver_name, arg, MAX_DEV_NAME_LEN - 1);
		options.driver_name[MAX_DEV_NAME_LEN - 1] = '\0';
		break;
	case 'a':
		options.cpumask = strtoull(optarg, nullptr, 16);
		options.num_threads = cmask_get_num_cpus(
		    options.cpumask);
		if (options.num_threads == 0) {
			fprintf(stderr,
			    "must run at least one thread\n");
			return EINVAL;
		}
		break;
	case 'b':
		options.req_size = strtoull(
		    optarg, nullptr, 10);
		break;
	case 'q':
		options.rps = strtoull(
		    optarg, nullptr, 10);
		break;
	case 'Q':
		options.queue_depth = strtoull(
		    optarg, nullptr, 10);
		break;
	case 'P':
		strncpy(options.pattern_spec, optarg, MAX_SPEC_LEN - 1);
		options.pattern_spec[MAX_SPEC_LEN - 1] = '\0';
		break;
	case 'I':
		strncpy(options.ia_spec, optarg, MAX_SPEC_LEN - 1);
		options.ia_spec[MAX_SPEC_LEN - 1] = '\0';
		break;
	case 't':
		options.time = strtoull(
		    optarg, nullptr, 10);
		break;
	case 'w':
		options.warmup = strtoull(
		    optarg, nullptr, 10);
		break;
	case 'o':
		strncpy(options.output_file, optarg, MAX_OUTPUT_FILE_LEN - 1);
		options.output_file[MAX_OUTPUT_FILE_LEN - 1] = '\0';
		break;
	case 'h':
	default:
		return EINVAL;
	}
	return 0;
}
/*
 * Instantiate the storage backend selected by name ("bdev" or "nvme").
 * `context` carries the device name string. Returns nullptr for an
 * unrecognized driver name; caller owns the result (birb_destroy_driver).
 */
static birb_driver *
birb_create_driver(const char * driver_name, void * context)
{
	const char * dev = reinterpret_cast<const char *>(context);

	if (strcmp(driver_name, "bdev") == 0)
		return new birb_bdev_driver(dev);
	if (strcmp(driver_name, "nvme") == 0)
		return new birb_nvme_driver(dev);
	return nullptr;
}
/*
 * Create the per-thread driver context matching the driver's runtime type.
 * Returns nullptr for an unrecognized type; caller owns the result
 * (birb_destroy_thread_context).
 */
static birb_driver_thread_context *
birb_create_thread_context(birb_driver * driver)
{
	switch (driver->get_type()) {
	case birb_driver::BIRB_DRV_BDEV:
		return new birb_bdev_thread_context(dynamic_cast<birb_bdev_driver *>(driver));
	case birb_driver::BIRB_DRV_NVME:
		return new birb_nvme_thread_context(dynamic_cast<birb_nvme_driver *>(driver));
	default:
		return nullptr;
	}
}
/* Destroy a driver created by birb_create_driver(). */
static void
birb_destroy_driver(birb_driver * drv)
{
	delete drv;
}
/* Destroy a context created by birb_create_thread_context(). */
static void
birb_destroy_thread_context(birb_driver_thread_context * ctx)
{
	delete ctx;
}
/*
 * IO completion callback (runs on the issuing worker's SPDK thread, so
 * cb_vars resolves to that worker's worker_thread_cb_vars). On success,
 * records the IO's start/end timestamps and, for reads, copies the DMA
 * buffer out to the user buffer. The io_request is always returned to the
 * worker's free pool for reuse.
 */
static void
worker_io_complete(bool success, void *cb_arg)
{
	auto vars = (struct worker_thread_cb_vars *)cb_vars;
	auto req = (struct io_request *)cb_arg;
	uint64_t end_ts = get_cur_ts_nano();
	if (!success) {
		// XXX: print warning for errors for now
		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "thread %d <worker_io_complete>: io request failed\n", vars->ctx->tid);
	} else {
		auto rec = new struct io_record;
		rec->start_ts = req->start_ts;
		rec->end_ts = end_ts;
		vars->ctx->io_records->push_back(rec);
		if (req->op == IOGEN_READ) {
			// surface the read data to the user-side buffer
			memcpy(req->user_buf, req->dma_buf, options.req_size);
		}
		ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d <worker_io_complete>: completed io request type %d\n", vars->ctx->tid, req->op);
	}
	vars->free_ios->push_back(req);
}
/* Worker -> main: one more worker finished initialization. */
static void
cb_notify_main_init(void * arg)
{
	auto * ctx = (struct thread_context *)arg;
	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "cb_notify_main_init: from thread %d to main.\n", ctx->tid);
	auto * vars = (struct main_thread_cb_vars *) cb_vars;
	vars->worker_thread_init_cnt++;
}
/* Worker -> main: one more worker stopped cleanly. */
static void
cb_notify_main_stop(void * arg)
{
	auto * ctx = (struct thread_context *)arg;
	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "cb_notify_main_stop: from thread %d to main.\n", ctx->tid);
	auto * vars = (struct main_thread_cb_vars *) cb_vars;
	vars->worker_thread_stop_cnt++;
}
/* Main -> worker: begin issuing IOs. */
static void
cb_notify_worker_start(void * arg)
{
	auto * ctx = (struct thread_context *)arg;
	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "cb_notify_worker_start: from main to thread %d.\n", ctx->tid);
	auto * vars = (struct worker_thread_cb_vars *) cb_vars;
	vars->worker_start = 1;
}
/* Main -> worker: stop issuing, drain outstanding IOs and exit. */
static void
cb_notify_worker_stop(void * arg)
{
	auto * ctx = (struct thread_context *)arg;
	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "cb_notify_worker_stop: from main to thread %d.\n", ctx->tid);
	auto * vars = (struct worker_thread_cb_vars *) cb_vars;
	vars->worker_stop = 1;
}
/* Zero the main thread's callback counters before workers are spawned. */
static void
main_thread_cb_vars_init(struct main_thread_cb_vars * vars)
{
	vars->worker_thread_init_cnt = 0;
	vars->worker_thread_stop_cnt = 0;
}
/* Initialize a worker's callback state: not started, not stopped. */
static void
worker_thread_cb_vars_init(struct worker_thread_cb_vars * vars, struct thread_context * ctx,
    std::list<struct io_request *> * free_ios)
{
	vars->worker_start = 0;
	vars->worker_stop = 0;
	vars->ctx = ctx;
	vars->free_ios = free_ios;
}
static void *
worker_thread_main(void * arg)
{
int rc = 0;
constexpr static unsigned int SPDK_THREAD_NAME_SZ = 16;
struct worker_thread_cb_vars vars;
auto *ctx = (struct thread_context *)arg;
birb_driver_thread_context * driver_thread_ctx;
std::list<struct io_request *> free_ios;
char spdk_thread_name[SPDK_THREAD_NAME_SZ];
struct spdk_cpuset * cpuset;
Generator * ia_gen = nullptr;
io_generator * io_gen = nullptr;
struct io_generator_ctx io_ctx;
uint64_t next_ts;
uint64_t a_offset;
uint64_t last_loop_ts = 0;
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: init...\n", ctx->tid);
ctx->overhead_avg = 0;
ctx->overhead_cnt = 0;
ctx->overhead_max = 0;
ctx->overhead_min = UINT64_MAX;
// create spdk thread
cpuset = spdk_cpuset_alloc();
if (cpuset == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to alloc cpuset\n");
rc = ENOMEM;
goto cleanup;
}
spdk_cpuset_zero(cpuset);
spdk_cpuset_set_cpu(cpuset, ctx->coreid, true);
snprintf(spdk_thread_name, SPDK_THREAD_NAME_SZ, "birb_worker_%u", ctx->tid);
ctx->sp_thread = spdk_thread_create(spdk_thread_name, cpuset);
if (ctx->sp_thread == nullptr) {
rc = ENOMEM;
goto cleanup;
}
spdk_set_thread(ctx->sp_thread);
// create thread context
driver_thread_ctx = birb_create_thread_context(ctx->driver);
if (driver_thread_ctx == nullptr || driver_thread_ctx->get_status() != birb_driver::BIRB_SUCCESS) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not create thread context!\n", ctx->tid);
rc = EINVAL;
goto cleanup;
}
// create io request objects
for (unsigned int i = 0; i < options.queue_depth; i++) {
auto dma_buf = (char *)spdk_dma_zmalloc_socket(options.req_size, ctx->driver->get_align(), NULL, ctx->sockid);
auto user_buf = (char *)nm_malloc(ctx->sockid, options.req_size);
if (dma_buf == nullptr || user_buf == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate buffers!\n", ctx->tid);
rc = ENOMEM;
goto cleanup;
}
auto io_req = new struct io_request;
io_req->dma_buf = dma_buf;
io_req->user_buf = user_buf;
free_ios.push_back(io_req);
}
// init thread local states
worker_thread_cb_vars_init(&vars, ctx, &free_ios);
cb_vars = &vars;
ia_gen = createGenerator(options.ia_spec);
if (ia_gen == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate ia generator!\n", ctx->tid);
rc = EINVAL;
goto cleanup;
}
ia_gen->set_lambda((double)options.rps / (double)(options.num_threads));
io_gen = new io_generator(options.req_size, ctx->start_region_length, options.read_pct, options.addr_mode);
if (io_gen == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate ia generator!\n", ctx->tid);
rc = EINVAL;
goto cleanup;
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: init complete.\n", ctx->tid);
if ((rc = spdk_thread_send_msg(ctx->main_thread, cb_notify_main_init, ctx)) != 0) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not send message %d\n", ctx->tid, rc);
goto cleanup;
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: waiting for start...\n", ctx->tid);
while (vars.worker_start != 1) {
spdk_thread_poll(spdk_get_thread(), 0, 0);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: started...\n", ctx->tid);
/* random delay 0-100 us */
usleep(nm_get_uptime_ns() % 100);
next_ts = get_cur_ts_nano();
while (true) {
uint64_t cur_loop_ts = get_cur_ts_nano();
if (last_loop_ts > 0) {
uint64_t overhead = cur_loop_ts - last_loop_ts;
if (ctx->overhead_max < overhead) {
ctx->overhead_max = overhead;
}
if (ctx->overhead_min > overhead) {
ctx->overhead_min = overhead;
}
ctx->overhead_avg = ctx->overhead_avg * ctx->overhead_cnt + overhead;
ctx->overhead_cnt++;
ctx->overhead_avg /= ctx->overhead_cnt;
}
last_loop_ts = cur_loop_ts;
spdk_thread_poll(spdk_get_thread(), 0, 0);
driver_thread_ctx->poll();
if (vars.worker_stop != 0) {
if (free_ios.size() >= options.queue_depth) {
break;
}
} else {
if (!free_ios.empty()) {
auto io_req = free_ios.front();
uint64_t cur_ts = get_cur_ts_nano();
if (cur_ts >= next_ts) {
io_gen->issue(&io_ctx, io_req->dma_buf);
a_offset = io_ctx.offset + ctx->start_region_offset;
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: issuing IO type %d at offset 0x%lx size 0x%lx...\n", ctx->tid, io_ctx.op, a_offset, io_ctx.size);
io_req->start_ts = cur_ts;
io_req->op = io_ctx.op;
if(io_ctx.op == IOGEN_READ) {
rc = driver_thread_ctx->read(a_offset, io_ctx.size, io_req->dma_buf, worker_io_complete, io_req);
} else {
rc = driver_thread_ctx->write(a_offset, io_ctx.size, io_req->dma_buf, worker_io_complete, io_req);
}
if (rc != 0) {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "thread %d: failed to issue io %d, retrying...", ctx->tid, rc);
} else {
free_ios.pop_front();
next_ts = next_ts + ia_gen->generate() * S2NS;
}
}
}
}
}
cleanup:
while (!free_ios.empty()) {
auto req = free_ios.front();
free_ios.pop_front();
spdk_dma_free(req->dma_buf);
nm_free(ctx->sockid, req->user_buf);
}
if (ia_gen != nullptr) {
delete ia_gen;
}
if (io_gen != nullptr) {
delete io_gen;
}
if (cpuset != nullptr) {
spdk_cpuset_free(cpuset);
}
if (driver_thread_ctx != nullptr) {
birb_destroy_thread_context(driver_thread_ctx);
}
if (rc == 0) {
if ((rc = spdk_thread_send_msg(ctx->main_thread, cb_notify_main_stop, ctx)) != 0) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not send message %d\n", ctx->tid, rc);
}
}
spdk_thread_exit(ctx->sp_thread);
while (!spdk_thread_is_exited(ctx->sp_thread)) {
spdk_thread_poll(ctx->sp_thread, 0, 0);
};
if (ctx->sp_thread != nullptr) {
spdk_set_thread(nullptr);
spdk_thread_destroy(ctx->sp_thread);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: stopped...\n", ctx->tid);
if (rc != 0) {
spdk_app_stop(rc);
}
return nullptr;
}
/*
 * Parse a pattern spec of the form "<R|M>,<read pct>" into its address mode
 * ("M" = monotonically increasing, anything else = uniform random) and read
 * percentage. Mutates `pattern` (strtok).
 *
 * Fix: guard against strtok() returning NULL -- the original passed NULL to
 * strcmp()/strtoull() on a malformed spec such as "" or "R" (UB). Missing
 * fields now fall back to uniform-random / 0% reads.
 */
static void
parse_pattern(char * pattern, unsigned int * read_pct, io_generator_address_mode * addr_mode)
{
	*addr_mode = IOGEN_ADDR_UNIFORM_RANDOM;
	*read_pct = 0;

	char * token = strtok(pattern, ",");
	if (token == nullptr) {
		return;
	}
	if (strcmp(token, "M") == 0) {
		*addr_mode = IOGEN_ADDR_MONOTONIC_INCREASING;
	}
	token = strtok(nullptr, ",");
	if (token != nullptr) {
		*read_pct = strtoull(token, nullptr, 10);
	}
}
/*
 * SPDK application entry point (runs on the SPDK app thread).
 *
 * Opens the device driver, spawns one pinned worker thread per CPU in
 * options.cpumask (each owning a disjoint slice of the device), runs the
 * timed benchmark, then collects per-thread latency records into the
 * output file.
 */
static void
birb_main(void * arg1 UNUSED)
{
    int rc = 0;
    std::list<struct thread_context *> worker_threads;
    std::ofstream output_file;
    struct main_thread_cb_vars vars;
    birb_driver * drv = nullptr;
    unsigned long record_cutoff_time = 0;   /* ns timestamp when warmup ended */
    unsigned long current_s = 0;
    unsigned int total_reqs = 0;
    unsigned int tid = 0;
    unsigned long per_thread_cap = 0;       /* bytes of device per worker */
    int cur_core;
    /* initialize driver */
    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: initializing device driver for device %s\n", options.dev_name);
    drv = birb_create_driver(options.driver_name, options.dev_name);
    if (drv == nullptr || drv->get_status() != birb_driver::BIRB_SUCCESS) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to create device driver.\n");
        rc = EINVAL;
        goto end;
    }
    /* each worker owns an equal, disjoint region of the device */
    per_thread_cap = drv->get_capacity() / options.num_threads;
    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: initialized device with capacity %zu bytes ~= %zu MB\n", drv->get_capacity(), drv->get_capacity() / 1024 / 1024);
    /* misc init */
    main_thread_cb_vars_init(&vars);
    cb_vars = &vars;
    parse_pattern(options.pattern_spec, &options.read_pct, &options.addr_mode);
    dump_options();
    output_file.open(options.output_file, std::ofstream::out);
    if (!output_file) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to open output file %s\n", options.output_file);
        rc = EINVAL;
        goto end;
    }
    cur_core = cmask_get_next_cpu(&options.cpumask);
    while(cur_core != NEXT_CPU_NULL) {
        auto * ctx = new struct thread_context;
        /* BUG FIX: test the pointer before memset'ing through it. The old
         * order dereferenced first (operator new throws rather than
         * returning NULL, so this is defensive, but the order was wrong). */
        if (ctx == NULL) {
            ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to alloc thread ctx.\n");
            spdk_app_stop(ENOMEM);
            return;
        }
        memset(ctx, 0, sizeof(struct thread_context));
        ctx->tid = tid++;
        ctx->driver = drv;
        ctx->main_thread = spdk_get_thread();
        ctx->sockid = rte_lcore_to_socket_id(cur_core);
        ctx->coreid = cur_core;
        ctx->io_records = new std::list<struct io_record *>();
        ctx->start_region_length = per_thread_cap;
        ctx->start_region_offset = per_thread_cap * ctx->tid;
        // create sys thread pinned to cur_core
        pthread_attr_t attr;
        cpuset_t scpuset;
        CPU_ZERO(&scpuset);
        CPU_SET(cur_core, &scpuset);
        pthread_attr_init(&attr);
        pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &scpuset);
        rc = pthread_create(&ctx->sys_thread, &attr, worker_thread_main, ctx);
        if (rc != 0) {
            ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to create sys thread: %d\n", rc);
            rc = EINVAL;
            goto end;
        }
        worker_threads.push_back(ctx);
        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: created worker thread %d on core %d socket %d offset 0x%lx length %ld\n", ctx->tid, cur_core, ctx->sockid,
            ctx->start_region_offset,
            ctx->start_region_length);
        cur_core = cmask_get_next_cpu(&options.cpumask);
    }
    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: waiting for worker thread init...\n");
    /* poll this SPDK thread until every worker has reported in */
    while(vars.worker_thread_init_cnt < (uint32_t)options.num_threads) {
        spdk_thread_poll(spdk_get_thread(), 0, 0);
    }
    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: starting worker threads...\n");
    for (struct thread_context * tctx : worker_threads) {
        rc = spdk_thread_send_msg(tctx->sp_thread, cb_notify_worker_start, tctx);
        if (rc != 0) {
            ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to send message %d\n", rc);
            goto end;
        }
    }
    /* main event loop: sleep in 1s ticks; mark the end of warmup once */
    while(current_s < options.time) {
        if (current_s >= options.warmup && record_cutoff_time == 0) {
            record_cutoff_time = get_cur_ts_nano();
        }
        usleep(1 * S2US);
        current_s++;
    }
    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: stopping worker threads...\n");
    for (struct thread_context * tctx : worker_threads) {
        rc = spdk_thread_send_msg(tctx->sp_thread, cb_notify_worker_stop, tctx);
        if (rc != 0) {
            ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to send message %d\n", rc);
            goto end;
        }
    }
    while(vars.worker_thread_stop_cnt < (uint32_t)options.num_threads) {
        spdk_thread_poll(spdk_get_thread(), 0, 0);
    }
    /* keep stats: only samples taken after the warmup cutoff are counted */
    for (struct thread_context * tctx : worker_threads) {
        uint64_t last_ts = 0;
        uint64_t processed = 0;
        for (struct io_record * r : *tctx->io_records) {
            if (r->start_ts >= record_cutoff_time) {
                if (r->end_ts > last_ts) {
                    last_ts = r->end_ts;
                }
                processed++;
                output_file << r->end_ts - r->start_ts << std::endl;
                total_reqs++;
            }
        }
        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: thread %d processed requests: %lu, last request %lu. Overhead - avg %lu min %lu max %lu\n",
            tctx->tid, processed, last_ts, tctx->overhead_avg, tctx->overhead_min, tctx->overhead_max);
    }
    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: total requests: %u, bytes per second: %lu\n",
        total_reqs, total_reqs * options.req_size / (options.time - options.warmup));
end:
    if (drv != nullptr) {
        birb_destroy_driver(drv);
    }
    output_file.close();
    for (struct thread_context * tctx : worker_threads) {
        for (struct io_record * r : *tctx->io_records) {
            delete r;
        }
        delete tctx->io_records;
        delete tctx;
    }
    /* XXX(review): exit() here makes the spdk_app_stop() below unreachable,
     * skipping SPDK's orderly shutdown (spdk_app_fini in main never runs).
     * Kept as-is since it looks deliberate — confirm intent. */
    exit(0);
    spdk_app_stop(rc);
    return;
}
/*
 * Process entry point: parse SPDK built-in plus custom options, then hand
 * control to the SPDK framework, which invokes birb_main() once running.
 * Returns the SPDK app's exit code.
 */
int
main(int argc, char **argv)
{
    struct spdk_app_opts opts = {};
    int rc;

    ntr_init();
    ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_INFO);

    /* Set default values in opts structure. */
    spdk_app_opts_init(&opts, sizeof(opts));
    opts.name = "birb";

    /* Parse built-in SPDK command line parameters as well as ours. */
    rc = spdk_app_parse_args(argc, argv, &opts, "VD:k:a:b:q:Q:P:I:t:w:o:", NULL, parse_arg,
        usage);
    if (rc != SPDK_APP_PARSE_ARGS_SUCCESS) {
        exit(rc);
    }

    nm_init(options.verbosity);

    /* spdk_app_start() blocks until spdk_app_stop() is called; on an
     * initialization error it returns nonzero without running birb_main(). */
    rc = spdk_app_start(&opts, birb_main, NULL);
    if (rc) {
        SPDK_ERRLOG("ERROR starting application\n");
    }

    /* Gracefully close out all of the SPDK subsystems. */
    spdk_app_fini();
    return rc;
}

585
storage/birb_posix.cc Normal file
View File

@ -0,0 +1,585 @@
#include <sys/endian.h>
#include <sys/errno.h>
#include <sys/signal.h>
#include <sys/types.h>
#include <fcntl.h>
#include <getopt.h>
#include <pthread.h>
#include <pthread_np.h>
#include <threads.h>
#include <unistd.h>
#include <aio.h>
#include <getopt.h>
#include <sys/ioctl.h>
#include <sys/disk.h>
#include <cerrno>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <chrono>
#include <list>
#include <set>
#include "gen.hh"
#include "ntr.h"
#include "defs.hh"
#include "nm.hh"
#include "storage/io_gen.hh"
/* Current wall-clock timestamp in nanoseconds (high_resolution_clock). */
static inline uint64_t get_cur_ts_nano()
{
    auto now = std::chrono::high_resolution_clock::now();
    auto since_epoch = now.time_since_epoch();
    return std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch).count();
}
/*
* We'll use this struct to gather housekeeping hello_context to pass between
* our events and callbacks.
*/
/* Bounds for the fixed-size option string buffers below. */
static constexpr unsigned long MAX_SPEC_LEN = 32;
static constexpr unsigned long MAX_DEV_NAME_LEN = 32;
static constexpr unsigned long MAX_OUTPUT_FILE_LEN = 256;
/* Global run configuration, populated from the command line in main(). */
struct options_t {
// args
int verbosity = NTR_LEVEL_DEFAULT;
int num_threads = 1;                        // derived from the -a cpu mask
unsigned long cpumask = 1;                  // bitmask of worker cores (-a)
char pattern_spec[MAX_SPEC_LEN] = "R,100";  // "<M|R>,<read pct>" (-P)
char ia_spec[MAX_SPEC_LEN] = "fixed";       // inter-arrival distribution (-I)
unsigned int time = 5;                      // total run time in seconds (-t)
unsigned int warmup = 2;                    // seconds excluded from stats (-w)
unsigned int queue_depth = 1;               // in-flight IOs per worker (-Q)
char dev_name[MAX_DEV_NAME_LEN] = "Malloc0";
char driver_name[MAX_DEV_NAME_LEN] = "bdev";
unsigned int read_pct = 0;                  // filled in by parse_pattern()
io_generator_address_mode addr_mode = IOGEN_ADDR_UNIFORM_RANDOM; // filled in by parse_pattern()
char output_file[MAX_OUTPUT_FILE_LEN] = "output.txt";
unsigned long req_size = 4096;              // bytes per IO (-b)
unsigned long rps = 0;                      // target requests/sec, all threads (-q)
};
/* Main <-> worker handshake: counters/flags the worker threads spin on. */
std::atomic<int> worker_thread_init_cnt(0);
std::atomic<int> worker_thread_stop_cnt(0);
std::atomic<int> worker_start(0);
std::atomic<int> worker_stop(0);
static struct options_t options;
/* One completed IO: start/end timestamps in nanoseconds. */
struct io_record {
uint64_t start_ts;
uint64_t end_ts;
};
/* One reusable aio request slot plus its buffers. */
struct io_request {
uint64_t start_ts;
io_generator_opcode op;
char * user_buf;   // destination of the post-read memcpy
char * dma_buf;    // buffer handed to the kernel via the aiocb
struct aiocb aio;
};
/* Per-worker state; allocated and zeroed by the main thread. */
struct thread_context {
unsigned int tid;
unsigned int coreid;
unsigned int sockid;
pthread_t sys_thread;
int disk_fd;
unsigned long start_region_offset;   // this worker's disjoint slice of the disk
unsigned long start_region_length;
/* modified by worker threads */
std::list<io_record *> *io_records;
uint64_t overhead_avg;   // running average of main-loop iteration time (ns)
uint32_t overhead_cnt;
uint64_t overhead_max;
uint64_t overhead_min;
};
/* Log the effective run configuration (call after parse_pattern()). */
static void dump_options()
{
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: Options:\n"
"        dev name: %s\n"
"        driver name: %s\n"
"        worker threads: 0x%lx\n"
"        number of threads: %d\n"
"        IO request size: %lu\n"
"        IO requests per second: %lu\n"
"        IO pattern: %s\n"
"        IO queue depth: %d\n"
"        IO addressing mode: %d\n"
"        read percent: %u\n"
"        inter-arrival dist: %s\n"
"        run time: %d\n"
"        warmup time: %d\n"
"        output file: %s\n",
options.dev_name,
options.driver_name,
options.cpumask,
options.num_threads,
options.req_size,
options.rps,
options.pattern_spec,
options.queue_depth,
options.addr_mode,
options.read_pct,
options.ia_spec,
options.time,
options.warmup,
options.output_file
);
}
/* Print the command line help to stdout (matches the getopt string in main). */
static void usage()
{
fprintf(stdout,
"    -V(VV): verbose mode\n"
"    -D: dev name\n"
"    -k: driver to use (default bdev)\n"
"    -a: worker threads spec (0x3 = spawn 2 threads on core 1 & 2)\n"
"    -b: IO request size\n"
"    -q: IO requests per second\n"
"    -P: IO request pattern\n"
"    -Q: IO request queue depth\n"
"    -I: inter-arrival time distribution\n"
"    -t: total run time\n"
"    -w: warm up time\n"
"    -o: latency response output file\n");
}
/*
 * Worker thread body: drives up to options.queue_depth concurrent POSIX aio
 * requests against this worker's slice of the disk, paced by the
 * inter-arrival generator, until the main thread sets worker_stop.
 * Latency samples go into ctx->io_records (read by main only after the
 * stop handshake, so no locking is needed). Returns nullptr.
 */
static void *
worker_thread_main(void * arg)
{
    int rc = 0;
    auto *ctx = (struct thread_context *)arg;
    std::list<struct io_request *> free_ios;   /* idle, reusable request slots */
    std::list<struct io_request *> prog_ios;   /* requests currently in flight */
    Generator * ia_gen = nullptr;
    io_generator * io_gen = nullptr;
    struct io_generator_ctx io_ctx;
    uint64_t next_ts;
    uint64_t a_offset;
    uint64_t last_loop_ts = 0;
    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: init...\n", ctx->tid);
    // create io request objects
    for (unsigned int i = 0; i < options.queue_depth; i++) {
        auto buf = (char *)nm_malloc(ctx->sockid, options.req_size);
        auto user_buf = (char *)nm_malloc(ctx->sockid, options.req_size);
        if (buf == nullptr || user_buf == nullptr) {
            ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate buffers!\n", ctx->tid);
            rc = ENOMEM;
            goto cleanup;
        }
        auto io_req = new struct io_request;
        io_req->dma_buf = buf;
        io_req->user_buf = user_buf;
        io_req->aio.aio_fildes = ctx->disk_fd;
        io_req->aio.aio_nbytes = options.req_size;
        io_req->aio.aio_buf = buf;
        io_req->aio.aio_sigevent.sigev_notify = SIGEV_NONE;  /* we poll via aio_error() */
        io_req->aio.aio_reqprio = 0;
        free_ios.push_back(io_req);
    }
    // init thread local states
    ia_gen = createGenerator(options.ia_spec);
    if (ia_gen == nullptr) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate ia generator!\n", ctx->tid);
        rc = EINVAL;
        goto cleanup;
    }
    /* split the global request rate evenly across workers */
    ia_gen->set_lambda((double)options.rps / (double)(options.num_threads));
    io_gen = new io_generator(options.req_size, ctx->start_region_length, options.read_pct, options.addr_mode);
    if (io_gen == nullptr) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate ia generator!\n", ctx->tid);
        rc = EINVAL;
        goto cleanup;
    }
    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: init complete.\n", ctx->tid);
    worker_thread_init_cnt.fetch_add(1);
    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: waiting for start...\n", ctx->tid);
    /* busy-wait on the start flag set by the main thread */
    while (worker_start.load() == 0) {}
    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: started...\n", ctx->tid);
    /* random delay 0-100 us to de-synchronize the workers */
    usleep(nm_get_uptime_ns() % 100);
    next_ts = get_cur_ts_nano();
    while (true) {
        uint64_t cur_ts = get_cur_ts_nano();
        /* track per-iteration loop overhead (avg/min/max) */
        if (last_loop_ts > 0) {
            uint64_t overhead = cur_ts - last_loop_ts;
            if (ctx->overhead_max < overhead) {
                ctx->overhead_max = overhead;
            }
            /* BUG FIX: ctx is zeroed by the main thread, so overhead_min
             * started at 0 and "min > overhead" could never fire; seed it
             * from the first sample instead. */
            if (ctx->overhead_cnt == 0 || ctx->overhead_min > overhead) {
                ctx->overhead_min = overhead;
            }
            ctx->overhead_avg = ctx->overhead_avg * ctx->overhead_cnt + overhead;
            ctx->overhead_cnt++;
            ctx->overhead_avg /= ctx->overhead_cnt;
        }
        last_loop_ts = cur_ts;
        // process io completion
        auto itr = prog_ios.begin();
        while (itr != prog_ios.end()) {
            int err;
            struct io_request * ioreq = *itr;
            if ((err = aio_error(&ioreq->aio)) != EINPROGRESS) {
                if (err == 0) {
                    auto rec = new struct io_record;
                    rec->start_ts = ioreq->start_ts;
                    rec->end_ts = cur_ts;
                    ctx->io_records->push_back(rec);
                    /* include the copy-out cost that reads would incur */
                    if (ioreq->op == IOGEN_READ) {
                        memcpy(ioreq->user_buf, ioreq->dma_buf, options.req_size);
                    }
                    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d <worker_io_complete>: completed io request type %d\n", ctx->tid, ioreq->op);
                } else {
                    ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "thread %d: aio failed with %d...\n", ctx->tid, err);
                }
                if (aio_return(&ioreq->aio) == -1) {
                    ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: aio_return failed with %d...\n", ctx->tid, errno);
                    exit(errno);
                }
                /* cleanup: recycle the slot */
                itr = prog_ios.erase(itr);
                free_ios.push_back(ioreq);
            } else {
                ++itr;
            }
        }
        if (worker_stop.load() == 1) {
            /* drain: only exit once every in-flight request has completed */
            if (free_ios.size() >= options.queue_depth) {
                break;
            }
        } else {
            if (!free_ios.empty()) {
                auto io_req = free_ios.front();
                cur_ts = get_cur_ts_nano();
                if (cur_ts >= next_ts) {
                    io_gen->issue(&io_ctx, io_req->dma_buf);
                    a_offset = io_ctx.offset + ctx->start_region_offset;
                    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: issuing IO type %d at offset 0x%lx size 0x%lx...\n", ctx->tid, io_ctx.op, a_offset, io_ctx.size);
                    io_req->start_ts = cur_ts;
                    io_req->op = io_ctx.op;
                    io_req->aio.aio_offset = a_offset;
                    if(io_ctx.op == IOGEN_READ) {
                        rc = aio_read(&io_req->aio);
                    } else {
                        rc = aio_write(&io_req->aio);
                    }
                    if (rc != 0) {
                        /* submit failed (e.g. kernel queue full): keep the
                         * slot in free_ios and retry on the next iteration */
                        ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "thread %d: failed to issue io %d, retrying...\n", ctx->tid, errno);
                    } else {
                        free_ios.pop_front();
                        prog_ios.push_back(io_req);
                        /* schedule the next issue per the IA distribution */
                        next_ts = next_ts + ia_gen->generate() * S2NS;
                    }
                }
            }
        }
    }
cleanup:
    while (!free_ios.empty()) {
        auto req = free_ios.front();
        free_ios.pop_front();
        nm_free(ctx->sockid, req->dma_buf);
        nm_free(ctx->sockid, req->user_buf);
        /* BUG FIX: the io_request object itself used to leak here */
        delete req;
    }
    if (ia_gen != nullptr) {
        delete ia_gen;
    }
    if (io_gen != nullptr) {
        delete io_gen;
    }
    worker_thread_stop_cnt.fetch_add(1);
    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: stopped...\n", ctx->tid);
    return nullptr;
}
/*
 * Parse a pattern spec of the form "<mode>,<read pct>" (e.g. "R,100",
 * "M,50"). "M" selects monotonically increasing offsets; anything else
 * selects uniform random. The percentage after the comma becomes
 * *read_pct. Mutates `pattern` (strtok).
 */
static void
parse_pattern(char * pattern, unsigned int * read_pct, io_generator_address_mode * addr_mode)
{
    char * token = strtok(pattern, ",");
    if (token != nullptr && strcmp(token, "M") == 0) {
        *addr_mode = IOGEN_ADDR_MONOTONIC_INCREASING;
    } else {
        *addr_mode = IOGEN_ADDR_UNIFORM_RANDOM;
    }
    token = strtok(nullptr, ",");
    /* BUG FIX: a spec without a ",<pct>" part made strtok return NULL,
     * which strtoull would dereference (undefined behavior). Fall back to
     * 0 (the options_t default) instead. */
    *read_pct = (token != nullptr) ? strtoull(token, nullptr, 10) : 0;
}
/*
 * Benchmark driver (raw-disk / POSIX aio backend): opens the device,
 * splits its capacity evenly across the worker threads, runs the timed
 * load, then writes per-request latencies to the output file.
 */
static void
birb_main()
{
int rc = 0;
std::list<struct thread_context *> worker_threads;
std::ofstream output_file;
unsigned long record_cutoff_time = 0;   /* ns timestamp when warmup ended */
unsigned long current_s = 0;
unsigned int total_reqs = 0;
unsigned int tid = 0;
unsigned long per_thread_cap = 0;       /* bytes of disk per worker */
int cur_core;
int disk_fd;
off_t disk_size;
u_int disk_sec_size;
/* initialize driver */
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: initializing device driver for device %s\n", options.dev_name);
/* O_DIRECT: bypass the buffer cache so latencies reflect the device */
disk_fd = open(options.dev_name, O_RDWR | O_DIRECT);
if (disk_fd == -1) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to open device - %d\n", errno);
exit(errno);
}
/* FreeBSD disk ioctls: media size and sector size */
rc = ioctl(disk_fd, DIOCGMEDIASIZE, &disk_size);
if (rc == -1) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to obtain disk size - %d\n", errno);
exit(errno);
}
rc = ioctl(disk_fd, DIOCGSECTORSIZE, &disk_sec_size);
if (rc == -1) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to obtain disk sector size - %d\n", errno);
exit(errno);
}
/* each worker owns an equal, disjoint region of the disk */
per_thread_cap = disk_size / options.num_threads;
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: initialized device with capacity %zu bytes ~= %zu MB, sector %u bytes\n", disk_size, disk_size / 1024 / 1024, disk_sec_size);
parse_pattern(options.pattern_spec, &options.read_pct, &options.addr_mode);
dump_options();
output_file.open(options.output_file, std::ofstream::out);
if (!output_file) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to open output file %s\n", options.output_file);
rc = EINVAL;
goto end;
}
/* spawn one worker per CPU set in the -a mask, pinned to that core */
cur_core = cmask_get_next_cpu(&options.cpumask);
while(cur_core != NEXT_CPU_NULL) {
auto * ctx = new struct thread_context;
/* NOTE(review): memset dereferences ctx before the NULL check below;
 * operator new throws rather than returning NULL, so the check is dead
 * code, but the ordering is backwards. */
memset(ctx, 0, sizeof(struct thread_context));
if (ctx == NULL) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to alloc thread ctx.\n");
exit(ENOMEM);
}
ctx->tid = tid++;
ctx->sockid = nm_get_node_from_core(cur_core);
ctx->coreid = cur_core;
ctx->io_records = new std::list<struct io_record *>();
ctx->start_region_length = per_thread_cap;
ctx->start_region_offset = per_thread_cap * ctx->tid;
ctx->disk_fd = disk_fd;
// create sys thread pinned to cur_core
pthread_attr_t attr;
cpuset_t scpuset;
CPU_ZERO(&scpuset);
CPU_SET(cur_core, &scpuset);
pthread_attr_init(&attr);
pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &scpuset);
rc = pthread_create(&ctx->sys_thread, &attr, worker_thread_main, ctx);
if (rc != 0) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to create sys thread: %d\n", rc);
rc = EINVAL;
goto end;
}
worker_threads.push_back(ctx);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: created worker thread %d on core %d socket %d offset 0x%lx length %ld\n", ctx->tid, cur_core, ctx->sockid,
ctx->start_region_offset,
ctx->start_region_length);
cur_core = cmask_get_next_cpu(&options.cpumask);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: waiting for worker thread init...\n");
/* busy-wait until every worker has finished its setup */
while(worker_thread_init_cnt.load() < options.num_threads) {
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: starting worker threads...\n");
worker_start.store(1);
/* main event loop: sleep in 1s ticks; mark the end of warmup once */
while(current_s < options.time) {
if (current_s >= options.warmup && record_cutoff_time == 0) {
record_cutoff_time = get_cur_ts_nano();
}
usleep(1 * S2US);
current_s++;
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: stopping worker threads...\n");
worker_stop.store(1);
/* busy-wait until every worker has drained and exited its loop */
while(worker_thread_stop_cnt.load() < options.num_threads) {
}
// keep stats: only samples taken after the warmup cutoff are counted
for (struct thread_context * tctx : worker_threads) {
uint64_t last_ts = 0;
uint64_t processed = 0;
for (struct io_record * r : *tctx->io_records) {
if (r->start_ts >= record_cutoff_time) {
if (r->end_ts > last_ts) {
last_ts = r->end_ts;
}
processed++;
output_file << r->end_ts - r->start_ts << std::endl;
total_reqs++;
}
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: thread %d processed requests: %lu, last request %lu. Overhead - avg %lu min %lu max %lu\n",
tctx->tid, processed, last_ts, tctx->overhead_avg, tctx->overhead_min, tctx->overhead_max);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: total requests: %u, bytes per second: %lu\n",
total_reqs, total_reqs * options.req_size / (options.time - options.warmup));
end:
if (disk_fd != -1) {
close(disk_fd);
}
output_file.close();
for (struct thread_context * tctx : worker_threads) {
for (struct io_record * r : *tctx->io_records) {
delete r;
}
delete tctx->io_records;
delete tctx;
}
return;
}
/*
 * Parse command line options into the global `options`, then run the
 * benchmark via birb_main(). See usage() for the option list.
 */
int
main(int argc, char **argv)
{
    ntr_init();
    ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_INFO);
    int c;
    /* BUG FIX: 'h' added to the option string — the 'h' case below was
     * previously unreachable because getopt could never match it. */
    while (( c = getopt(argc, argv, "hVD:k:a:b:q:Q:P:I:t:w:o:")) != -1)
    {
        switch (c) {
        case 'V':
            ntr_set_level(NTR_DEP_USER1,
                ntr_get_level(NTR_DEP_USER1) + 1);
            break;
        case 'D':
            /* BUG FIX: copy at most len-1 bytes so the (static, hence
             * zero-filled) option buffers always stay NUL-terminated;
             * strncpy with the full length does not guarantee that. */
            strncpy(options.dev_name, optarg, MAX_DEV_NAME_LEN - 1);
            break;
        case 'k':
            strncpy(options.driver_name, optarg, MAX_DEV_NAME_LEN - 1);
            break;
        case 'a':
            options.cpumask = strtoull(optarg, nullptr, 16);
            options.num_threads = cmask_get_num_cpus(
                options.cpumask);
            if (options.num_threads == 0) {
                fprintf(stderr,
                    "must run at least one thread\n");
                return EINVAL;
            }
            break;
        case 'b':
            options.req_size = strtoull(
                optarg, nullptr, 10);
            break;
        case 'q':
            options.rps = strtoull(
                optarg, nullptr, 10);
            break;
        case 'Q':
            options.queue_depth = strtoull(
                optarg, nullptr, 10);
            break;
        case 'P':
            strncpy(options.pattern_spec, optarg, MAX_SPEC_LEN - 1);
            break;
        case 'I':
            strncpy(options.ia_spec, optarg, MAX_SPEC_LEN - 1);
            break;
        case 't':
            options.time = strtoull(
                optarg, nullptr, 10);
            break;
        case 'w':
            options.warmup = strtoull(
                optarg, nullptr, 10);
            break;
        case 'o':
            strncpy(options.output_file, optarg, MAX_OUTPUT_FILE_LEN - 1);
            break;
        case 'h':
            usage();
            exit(0);
        default:
            usage();
            exit(EINVAL);
        }
    }
    nm_init(options.verbosity);
    birb_main();
    return 0;
}

95
storage/drivers/bdev.cc Normal file
View File

@ -0,0 +1,95 @@
#include <sys/endian.h>
#include "storage/drivers/bdev.hh"
#include "ntr.h"
#include "spdk/bdev.h"
#include "spdk/thread.h"
size_t
birb_bdev_driver::get_capacity()
{
return block_num * block_sz;
}
/* BIRB_SUCCESS once the constructor opened the bdev; BIRB_FAIL otherwise. */
birb_driver::birb_driver_status
birb_bdev_driver::get_status()
{
    return status;
}
/*
 * Event callback registered with spdk_bdev_open_ext(). This tool does not
 * handle hotplug/resize/media events, so every event is logged and ignored.
 */
void
birb_bdev_driver::bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev * bdev UNUSED,
void * event_ctx UNUSED)
{
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "bdev_event_cb: unsupported bdev event: type %d\n", type);
}
/*
 * Debug helper: log the name of every bdev currently registered with SPDK
 * (useful when the requested device name cannot be opened).
 */
void
birb_bdev_driver::print_all_bdev()
{
struct spdk_bdev * cur = spdk_bdev_first();
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "birb_bdev_driver: all registered block devices: ");
while(cur != NULL) {
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "%s, ", spdk_bdev_get_name(cur));
cur = spdk_bdev_next(cur);
}
}
/*
 * Open the named bdev read-write and cache its geometry. On any failure
 * `status` stays BIRB_FAIL and the object must not be used for IO
 * (callers check get_status()).
 */
birb_bdev_driver::birb_bdev_driver(const char * dev_name) : bdev_desc(nullptr),
bdev(nullptr),
block_sz(0),
block_num(0),
status(BIRB_FAIL)
{
int rc;
/* true = open writable; bdev_event_cb just logs unsupported events */
rc = spdk_bdev_open_ext(dev_name, true, birb_bdev_driver::bdev_event_cb, NULL, &this->bdev_desc);
if (rc != 0) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_bdev_driver: failed to open bdev: %d\n", rc);
return;
}
/* A bdev pointer is valid while the bdev is opened. */
this->bdev = spdk_bdev_desc_get_bdev(this->bdev_desc);
this->block_sz = spdk_bdev_get_block_size(this->bdev);
this->block_num = spdk_bdev_get_num_blocks(this->bdev);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "birb_bdev_driver: bdev block size %zu bytes, blocks count %zu\n", this->block_sz, this->block_num);
this->status = BIRB_SUCCESS;
}
/* Close the bdev descriptor, but only if construction fully succeeded. */
birb_bdev_driver::~birb_bdev_driver()
{
    if (this->status != BIRB_SUCCESS) {
        return;
    }
    spdk_bdev_close(this->bdev_desc);
}
/* Driver kind discriminator: this implementation is the bdev backend. */
birb_driver::birb_driver_type
birb_bdev_driver::get_type()
{
    return BIRB_DRV_BDEV;
}
size_t
birb_bdev_driver::get_align()
{
return spdk_bdev_get_buf_align(this->bdev);
}
struct spdk_bdev *
birb_bdev_driver::get_bdev()
{
return this->bdev;
}
struct spdk_bdev_desc *
birb_bdev_driver::get_bdev_desc()
{
return this->bdev_desc;
}

View File

@ -0,0 +1,72 @@
#include <sys/endian.h>
#include "storage/drivers/bdev.hh"
#include "ntr.h"
#include "spdk/bdev.h"
#include "spdk/thread.h"
/*
 * Per-thread bdev IO context: acquires an SPDK IO channel for the calling
 * thread. On failure `status` stays BIRB_FAIL (callers check get_status()).
 */
birb_bdev_thread_context::birb_bdev_thread_context(birb_bdev_driver * driver) : io_channel(nullptr),
                                                                                status(birb_driver::BIRB_FAIL),
                                                                                driver(driver)
{
    struct spdk_bdev_desc * desc = driver->get_bdev_desc();
    // obtain io channel
    this->io_channel = spdk_bdev_get_io_channel(desc);
    if (io_channel == nullptr) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_bdev_thread_context: could not create bdev I/O channel!\n");
        /* BUG FIX: previously fell through and set BIRB_SUCCESS even when
         * channel creation failed; keep BIRB_FAIL so callers bail out. */
        return;
    }
    this->status = birb_driver::BIRB_SUCCESS;
}
/* BIRB_SUCCESS once the IO channel was acquired; BIRB_FAIL otherwise. */
birb_driver::birb_driver_status
birb_bdev_thread_context::get_status()
{
    return status;
}
/* Release the IO channel if one was successfully acquired. */
birb_bdev_thread_context::~birb_bdev_thread_context()
{
    if (this->io_channel == nullptr) {
        return;
    }
    spdk_put_io_channel(this->io_channel);
}
/*
 * Completion trampoline for spdk_bdev_read/write: releases the SPDK IO,
 * forwards success/failure to the caller-supplied callback, then frees the
 * heap-allocated cb_context created in read()/write().
 */
void
birb_bdev_thread_context::io_callback(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
spdk_bdev_free_io(bdev_io);
auto ctx = reinterpret_cast<struct cb_context *>(cb_arg);
ctx->cb(success, ctx->ctx);
delete ctx;
}
/*
 * Submit an async read of `size` bytes at byte `offset` into `buffer`.
 * `callback(success, context)` fires from io_callback() on completion.
 * Returns spdk_bdev_read()'s code (0 = submitted).
 * NOTE(review): the cb_context leaks if submission returns nonzero —
 * confirm whether callers treat that as fatal.
 */
int
birb_bdev_thread_context::read(size_t offset, size_t size, char * buffer, callback callback, void * context)
{
auto ctx = new struct cb_context;
ctx->cb = callback;
ctx->ctx = context;
return spdk_bdev_read(driver->get_bdev_desc(), this->io_channel, buffer, offset, size, io_callback, reinterpret_cast<void*>(ctx));
}
/*
 * Submit an async write of `size` bytes from `buffer` at byte `offset`.
 * Mirrors read(): completion is delivered via io_callback().
 * NOTE(review): the cb_context leaks if submission returns nonzero.
 */
int
birb_bdev_thread_context::write(size_t offset, size_t size, char * buffer, callback callback, void * context)
{
auto ctx = new struct cb_context;
ctx->cb = callback;
ctx->ctx = context;
return spdk_bdev_write(driver->get_bdev_desc(), this->io_channel, buffer, offset, size, io_callback, reinterpret_cast<void*>(ctx));
}
/* No-op for the bdev backend: completions are driven by SPDK's own
 * thread pollers rather than explicit polling here. */
void
birb_bdev_thread_context::poll()
{
}

135
storage/drivers/nvme.cc Normal file
View File

@ -0,0 +1,135 @@
#include <sys/endian.h>
#include "ntr.h"
#include "spdk/nvme.h"
#include "spdk/thread.h"
#include "storage/drivers/nvme.hh"
size_t
birb_nvme_driver::get_capacity()
{
return spdk_nvme_ns_get_size(this->ns);
}
/* BIRB_SUCCESS once probe/attach fully succeeded; BIRB_FAIL otherwise. */
birb_driver::birb_driver_status
birb_nvme_driver::get_status()
{
    return status;
}
void
birb_nvme_driver::attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts UNUSED)
{
struct spdk_nvme_ns * ns;
auto ctx = reinterpret_cast<struct attach_context *>(cb_ctx);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "birb_nvme_driver: attached to nvme at %s\n", trid->traddr);
for (int nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); nsid != 0;
nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) {
ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
if (ns == nullptr || !spdk_nvme_ns_is_active(ns)) {
continue;
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "birb_nvme_driver: namespace id: %d size: %zu LBA size: %u\n", spdk_nvme_ns_get_id(ns), spdk_nvme_ns_get_size(ns), spdk_nvme_ns_get_sector_size(ns));
/* XXX: use the first namespace */
break;
}
*ctx->ns = ns;
*ctx->ctrlr = ctrlr;
ctx->valid = 1;
}
bool
birb_nvme_driver::probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
struct spdk_nvme_ctrlr_opts *opts UNUSED)
{
printf("birb_nvme_driver: found nvme at %s\n", trid->traddr);
auto ctx = reinterpret_cast<struct attach_context *>(cb_ctx);
if (strcmp(trid->traddr, ctx->dev_name) == 0) {
return true;
}
return false;
}
/*
 * Probe the PCIe bus for the NVMe controller at address `dev_name` and bind
 * to its first active namespace. Zoned (ZNS) namespaces are rejected. On
 * any failure `status` stays BIRB_FAIL (callers check get_status()).
 */
birb_nvme_driver::birb_nvme_driver(const char * dev_name) : status(BIRB_FAIL),
                                                            ctrlr(nullptr),
                                                            ns(nullptr),
                                                            opts()
{
    int rc;
    struct spdk_nvme_transport_id trid;
    struct attach_context ctx;
    ctx.ctrlr = &this->ctrlr;
    ctx.ns = &this->ns;
    ctx.dev_name = dev_name;
    ctx.valid = 0;
    spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE);
    snprintf(trid.subnqn, sizeof(trid.subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);
    /* probe_cb filters by address; attach_cb fills ctx on success */
    rc = spdk_nvme_probe(&trid, reinterpret_cast<void *>(&ctx), probe_cb, attach_cb, nullptr);
    if (rc != 0) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_nvme_driver: failed to probe nvme device: %d\n", rc);
        goto end;
    }
    if (ctx.valid != 1) {
        rc = EINVAL;
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_nvme_driver: could not find device: %s\n", dev_name);
        goto end;
    }
    if (spdk_nvme_ns_get_csi(this->ns) == SPDK_NVME_CSI_ZNS) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_nvme_driver: zoned nvme namespace is unsupported\n");
        spdk_nvme_detach(this->ctrlr);
        /* BUG FIX: clear the handle so ~birb_nvme_driver() does not detach
         * the same controller a second time. */
        this->ctrlr = nullptr;
        goto end;
    } else {
        spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &this->opts, sizeof(this->opts));
        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "birb_nvme_driver: io queue depth: %d io queue requests: %d\n", opts.io_queue_size, opts.io_queue_requests);
        this->status = BIRB_SUCCESS;
    }
end:
    return;
}
/* Detach the controller if one is still attached. */
birb_nvme_driver::~birb_nvme_driver()
{
    if (this->ctrlr == nullptr) {
        return;
    }
    spdk_nvme_detach(this->ctrlr);
}
/* Driver kind discriminator: this implementation is the NVMe backend. */
birb_driver::birb_driver_type
birb_nvme_driver::get_type()
{
    return BIRB_DRV_NVME;
}
/*
 * Buffer alignment requirement: hard-coded to 4 KiB.
 * NOTE(review): unlike the bdev driver this is not queried from the
 * device — confirm 0x1000 is sufficient for all targeted controllers.
 */
size_t
birb_nvme_driver::get_align()
{
return 0x1000;
}
/* Attached controller handle (nullptr if attach failed). */
spdk_nvme_ctrlr *
birb_nvme_driver::get_ctrlr()
{
    return ctrlr;
}
/* The namespace selected at attach time (first active one). */
spdk_nvme_ns *
birb_nvme_driver::get_ns()
{
    return ns;
}
/* Default IO qpair options captured from the controller at attach time. */
spdk_nvme_io_qpair_opts *
birb_nvme_driver::get_io_qpair_opts()
{
    return &opts;
}

View File

@ -0,0 +1,90 @@
#include <sys/endian.h>
#include "storage/drivers/nvme.hh"
#include "ntr.h"
#include "spdk/bdev.h"
#include "spdk/nvme.h"
#include "spdk/nvme_spec.h"
#include "spdk/thread.h"
/*
 * Per-thread NVMe IO context: allocates a dedicated IO qpair on the
 * driver's controller using the controller's default qpair options. On
 * failure `status` stays BIRB_FAIL (callers check get_status()).
 */
birb_nvme_thread_context::birb_nvme_thread_context(birb_nvme_driver * driver) : status(birb_driver::BIRB_FAIL),
driver(driver),
qpair(nullptr)
{
struct spdk_nvme_ctrlr * ctrlr = driver->get_ctrlr();
/* local `qpair` intentionally shadows the member until success */
struct spdk_nvme_qpair * qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, driver->get_io_qpair_opts(), sizeof(struct spdk_nvme_io_qpair_opts));
if (qpair == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_nvme_thread_context: could not allocate qpairs.\n");
} else {
this->qpair = qpair;
status = birb_driver::BIRB_SUCCESS;
}
}
/* BIRB_SUCCESS once the qpair was allocated; BIRB_FAIL otherwise. */
birb_driver::birb_driver_status
birb_nvme_thread_context::get_status()
{
    return status;
}
/* Free the IO qpair if one was successfully allocated. */
birb_nvme_thread_context::~birb_nvme_thread_context()
{
    if (this->qpair == nullptr) {
        return;
    }
    spdk_nvme_ctrlr_free_io_qpair(this->qpair);
}
/*
 * Completion trampoline for NVMe commands: translates the completion
 * status, forwards it to the caller-supplied callback, then frees the
 * heap-allocated cb_context created in read()/write().
 */
void
birb_nvme_thread_context::io_callback(void *arg, const struct spdk_nvme_cpl *completion)
{
bool success = !spdk_nvme_cpl_is_error(completion);
auto ctx = reinterpret_cast<struct cb_context *>(arg);
ctx->cb(success, ctx->ctx);
delete ctx;
}
/*
 * Convert a byte count to the number of LBAs needed to cover it
 * (ceiling division). NOTE(review): assumes size > 0 — a zero size
 * would underflow (size - 1) to a huge value; confirm callers never
 * pass 0 (req_size defaults to 4096).
 */
uint32_t
birb_nvme_thread_context::size_to_lba(size_t size, int lba_size)
{
return (size - 1) / lba_size + 1;
}
/*
 * Convert a byte address to its starting LBA (truncating division).
 * NOTE(review): a byte offset that is not LBA-aligned is silently rounded
 * down — confirm issuers always produce sector-aligned offsets.
 */
uint64_t
birb_nvme_thread_context::addr_to_lba(size_t addr, int lba_size)
{
return addr / lba_size;
}
/*
 * Submit an async NVMe read: byte offset/size are converted to LBAs for
 * the bound namespace. Completion is delivered via io_callback(); poll()
 * must be called to reap it. Returns spdk_nvme_ns_cmd_read()'s code.
 * NOTE(review): the cb_context leaks if submission returns nonzero.
 */
int
birb_nvme_thread_context::read(size_t offset, size_t size, char * buffer, callback callback, void * context)
{
auto ctx = new struct cb_context;
ctx->cb = callback;
ctx->ctx = context;
struct spdk_nvme_ns * ns = this->driver->get_ns();
int lba_size = spdk_nvme_ns_get_sector_size(ns);
return spdk_nvme_ns_cmd_read(ns, this->qpair, buffer, addr_to_lba(offset, lba_size), size_to_lba(size, lba_size), io_callback, reinterpret_cast<void*>(ctx), 0);
}
/*
 * Submit an async NVMe write; mirrors read() (byte -> LBA conversion,
 * completion via io_callback(), reaped by poll()).
 * NOTE(review): the cb_context leaks if submission returns nonzero.
 */
int
birb_nvme_thread_context::write(size_t offset, size_t size, char * buffer, callback callback, void * context)
{
auto ctx = new struct cb_context;
ctx->cb = callback;
ctx->ctx = context;
struct spdk_nvme_ns * ns = this->driver->get_ns();
int lba_size = spdk_nvme_ns_get_sector_size(ns);
return spdk_nvme_ns_cmd_write(ns, this->qpair, buffer, addr_to_lba(offset, lba_size), size_to_lba(size, lba_size), io_callback, reinterpret_cast<void*>(ctx), 0);
}
}
void
birb_nvme_thread_context::poll()
{
spdk_nvme_qpair_process_completions(this->qpair, 0);
}

57
storage/io_gen.cc Normal file
View File

@ -0,0 +1,57 @@
#include <sys/endian.h>
#include <random>
#include "nm.hh"
#include "storage/io_gen.hh"
/*
 * IO workload generator: produces (offset, op) pairs over [0, capacity)
 * in req_size-aligned units. `read_pct` (0-100) is the probability a
 * request is a read; `addr_mode` selects sequential vs uniform random
 * addressing.
 */
io_generator::io_generator(
unsigned long req_size,
unsigned long capacity,
unsigned int read_pct,
io_generator_address_mode addr_mode) : cur_offset(0),
capacity(capacity),
req_size(req_size),
read_pct(read_pct),
addr_mode(addr_mode),
rng(rd()),
dist(std::uniform_int_distribution<int>(0, 99)),
addr_rng(addr_rd()),
addr_dist(std::uniform_int_distribution<uint64_t>(0, capacity - 1))
{
/* Re-seed both engines from uptime, overriding the rd()/addr_rd() seeds
 * in the init list above. NOTE(review): both calls may observe the same
 * tick, correlating the two engines — confirm that is acceptable. */
rng.seed(nm_get_uptime_ns());
addr_rng.seed(nm_get_uptime_ns());
}
/* returns 0 on success */
/*
 * Fill *ctx with the next request (offset, size, opcode); for writes,
 * also fill `buf` with a random byte pattern.
 */
int io_generator::issue(struct io_generator_ctx *ctx, char * buf)
{
ctx->size = req_size;
// determine next IO offset
if (addr_mode == IOGEN_ADDR_MONOTONIC_INCREASING) {
// sequential: wrap to 0 when the next request would run past the end
if (cur_offset + req_size > capacity) {
cur_offset = 0;
}
ctx->offset = cur_offset;
cur_offset = cur_offset + req_size;
} else {
// uniform random, aligned down to a req_size boundary
ctx->offset = (addr_dist(addr_rng) / req_size) * req_size;
// pull the final (partial) slot back in range
// NOTE(review): assumes capacity >= req_size, otherwise this
// subtraction underflows — confirm callers guarantee it
if (ctx->offset + req_size > capacity) {
ctx->offset -= req_size;
}
}
// determine next IO data: read with probability read_pct/100
int op_rng = dist(rng);
if (op_rng < (int)read_pct) {
ctx->op = IOGEN_READ;
} else {
ctx->op = IOGEN_WRITE;
int data = dist(rng);
memset(buf, data, req_size);
}
return 0;
}

32
tests/nms_test.c Normal file
View File

@ -0,0 +1,32 @@
#include "nms.h"
#include <assert.h>
#include <stdio.h>
/*
 * Smoke test for libnms: init twice (must be idempotent), then perform a
 * fixed sequence of large allocations on node 0 and print the pointers.
 */
int main(void)
{
	static const struct {
		size_t nbytes;
		const char *tag;
	} plan[] = {
		{ 1024 * 1024 * 1024, "1G" },   /* 1G */
		{ 511 * 1024 * 1024, "511M" },  /* two 511Ms */
		{ 511 * 1024 * 1024, "511M" },
		{ 1024 * 1024 * 1024, "1G" },   /* another 1G */
	};
	size_t i;

	nms_init(1);
	/* duplicate init */
	nms_init(1);

	for (i = 0; i < sizeof(plan) / sizeof(plan[0]); i++) {
		void *p = nms_malloc(0, plan[i].nbytes);
		assert(p != NULL);
		printf("%s: %p\n", plan[i].tag, p);
	}
	return 0;
}

239
util/memloadgen.cc Normal file
View File

@ -0,0 +1,239 @@
#include <sys/endian.h>
#include <sys/select.h>
#include <sys/signal.h>
#include "gen.hh"
#include <array>
#include <atomic>
#include <cstdlib>
#include <cstring>
#include <list>
#include <iostream>
#include <fstream>
#include "ntr.h"
#include "nms.h"
#include <getopt.h>
#include <pthread.h>
#include <unistd.h>
#include <topo.h>
/* Print the command line help to stdout (matches the getopt string in main). */
static void
usage()
{
fprintf(stdout,
"Usage:\n"
"    -v: verbose mode\n"
"    -b: buffer size\n"
"    -q: bytes per second\n"
"    -d: destination domain index\n"
"    -s: worker threads cpu list\n"
"    -m: pull mode cpu list\n"
"    -S: enable shared buffer\n"
"    -t: time to run\n"
"    -T: transaction size\n"
"    -i: inter arrival time distribution\n"
"    -o: output file path\n"
"    -H: history size for pct adjustment\n"
"    -M: print this string when threads are ready to run\n");
fflush(stdout);
}
/* Default per-second throughput sample output path; overridden by -o. */
static char output_file[256] = "memloadgen_samples.txt";
// memloadgen driver: spawns memory-load worker threads, prints per-second
// throughput, and accepts percentage-based rate adjustments on stdin.
int main(int argc, char * argv[])
{
    ntr_init();
    ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);

    size_t arr_sz = 64 * 1024 * 1024;   // per-thread buffer size (-b)
    uint32_t time = -1;                 // runtime in seconds; UINT32_MAX = run "forever" (-t)
    uint64_t bps = 0;                   // target bytes/second; 0 = unthrottled (-q)
    uint64_t transaction_size = arr_sz; // bytes moved per transaction (-T)
    cpuset_t threads, modes;
    char magic[256] = {0};              // string echoed once threads are ready (-M)
    CPU_ZERO(&threads);
    CPU_ZERO(&modes);
    CPU_SET(0, &threads);
    char ia_dist[32] = "fixed";         // interarrival distribution (-i)
    int history_sz = 5;                 // samples kept for pct adjustment (-H)
    std::list<uint64_t> history;
    int shared_buffer = 0;              // workers share one buffer (-S)
    int rate_ctrl = 0;                  // set once an explicit pct target is applied
    cpuset_t domain_mask;
    CPU_ZERO(&domain_mask);
    CPU_SET(0, &domain_mask);
    {
        int c;
        // parse arguments
        while ((c = getopt(argc, argv, "vhb:d:s:m:So:T:t:q:i:H:M:")) != -1) {
            switch (c) {
            case 'v':
                ntr_set_level(NTR_DEP_USER1, ntr_get_level(NTR_DEP_USER1) + 1);
                break;
            case 'h':
                usage();
                exit(0);
            case 'b':
                arr_sz = strtoull(optarg, nullptr, 10);
                break;
            case 'd':
                cpulist_to_cpuset(optarg, &domain_mask);
                break;
            case 's':
                cpulist_to_cpuset(optarg, &threads);
                break;
            case 'm':
                cpulist_to_cpuset(optarg, &modes);
                break;
            case 'S':
                shared_buffer = 1;
                break;
            case 'o':
                // snprintf (unlike strncpy) guarantees NUL termination
                snprintf(output_file, sizeof(output_file), "%s", optarg);
                break;
            case 't':
                time = strtoul(optarg, nullptr, 10);
                break;
            case 'T':
                transaction_size = strtoul(optarg, nullptr, 10);
                break;
            case 'q':
                bps = (uint64_t)strtoull(optarg, nullptr, 10);
                break;
            case 'i':
                snprintf(ia_dist, sizeof(ia_dist), "%s", optarg);
                break;
            case 'H':
                history_sz = strtol(optarg, nullptr, 10);
                break;
            case 'M':
                snprintf(magic, sizeof(magic), "%s", optarg);
                break;
            default:
                usage();
                exit(0);
            }
        }
    }
    // transaction_size is a divisor below (bps / transaction_size) — reject 0
    if (transaction_size == 0) {
        fprintf(stderr, "transaction size (-T) must be non-zero\n");
        exit(1);
    }
    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "Configuration:\n"
        "        buffer size: %ld\n"
        "        num threads: %d\n"
        "        target domain: %ld\n"
        "        bytes per second: %lu\n"
        "        interarrival distribution: %s\n"
        "        shared buffer: %d\n"
        "        transaction size: %lu\n"
        "        runtime: %d\n"
        "        history: %d\n"
        "        magic: %s\n",
        arr_sz, CPU_COUNT(&threads),
        CPU_FFS(&domain_mask) - 1, bps,
        ia_dist, shared_buffer,
        transaction_size, time, history_sz, magic);
    // init topo
    if (topo_init(ntr_get_level(NTR_DEP_USER1) != NTR_LEVEL_DEFAULT)) {
        fprintf(stderr, "libtopo init failed!\n");
        exit(1);
    }
    // init
    if (nms_init(ntr_get_level(NTR_DEP_USER1) != NTR_LEVEL_DEFAULT)) {
        fprintf(stderr, "libnms init failed!\n");
        exit(1);
    }
    bool success = false;
    memload_generator::memload_generator_options opts;
    opts.buffer_size = arr_sz;
    opts.trans_per_second = bps / transaction_size;
    opts.shared_buffer = shared_buffer;
    opts.transaction_size = transaction_size;
    opts.verbose = ntr_get_level(NTR_DEP_USER1) != NTR_LEVEL_DEFAULT;
    strncpy(opts.ia_dist, ia_dist, sizeof(opts.ia_dist));
    std::ofstream ofile;
    ofile.open(output_file, std::ios::out | std::ios::trunc);
    if (!ofile.is_open()) {
        fprintf(stderr, "failed to open output file %s\n", output_file);
        exit(1);
    }
    auto mgen = new memload_generator(&threads, &modes, &domain_mask, &opts, &success);
    // signal readiness to a controlling process, if requested
    if (strlen(magic) > 0) {
        fprintf(stdout, "%s\n", magic);
        fflush(stdout);
    }
    if (!mgen->start()) {
        fprintf(stderr, "failed to start memloadgen!\n");
        exit(1);
    }
    // zero timeout => select() below is a non-blocking poll of stdin
    struct timeval stval;
    stval.tv_sec = 0;
    stval.tv_usec = 0;
    char pct_line[64] = {0};
    uint64_t prev_ts = topo_uptime_ns();
    uint64_t prev_trans = mgen->get_transactions();
    uint32_t cur_time = 0;
    while (cur_time < time) {
        usleep(S2US);
        uint64_t cur_ts = topo_uptime_ns();
        uint64_t trans = mgen->get_transactions();
        // achieved bytes/second over the last interval (renamed from the
        // original's inner "bps", which shadowed the -q option variable)
        uint64_t cur_bps = (uint64_t)((double)((trans - prev_trans) * transaction_size) / ((double)(cur_ts - prev_ts) / (double)S2NS));
        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "%ldB,%ldM\n", cur_bps, cur_bps / 1024 / 1024);
        ofile << "s," << cur_time << "," << cur_bps << std::endl;
        ofile.flush();
        prev_ts = cur_ts;
        prev_trans = trans;
        cur_time++;
        // only accept pct adjustments while running uncapped; once a target
        // is applied (rate_ctrl = 1) the history is no longer meaningful
        if (rate_ctrl == 0) {
            // keep history
            history.emplace_back(cur_bps);
            if ((int)history.size() > history_sz) {
                history.pop_front();
            }
            fd_set fdset;
            FD_ZERO(&fdset);
            FD_SET(STDIN_FILENO, &fdset);
            int ret = select(STDIN_FILENO + 1, &fdset, NULL, NULL, &stval);
            if (ret < 0) {
                if (errno != EINTR) {
                    fprintf(stderr, "select() failed with %d\n", errno);
                    exit(1);
                }
            } else if (ret > 0) {
                if (FD_ISSET(STDIN_FILENO, &fdset)) {
                    ret = read(STDIN_FILENO, pct_line, sizeof(pct_line) - 1);
                    if (ret < 0) {
                        fprintf(stderr, "read() failed with %d\n", errno);
                        exit(1);
                    }
                    // read() does not NUL-terminate, and bytes from a prior
                    // (longer) input would otherwise leak into strtoul()
                    pct_line[ret] = '\0';
                    unsigned int pct = strtoul(pct_line, NULL, 10);
                    uint64_t sum = 0;
                    size_t sz = history.size();
                    while (!history.empty()) {
                        sum += history.front();
                        history.pop_front();
                    }
                    if (sz > 0) {
                        // new target = pct% of the recent average throughput
                        uint64_t newbps = ((sum / sz) * (double)pct / 100.0);
                        mgen->set_transactions(newbps / transaction_size);
                        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "adjusted target bps to %u%% = %ldB ~= %ldM\n", pct, newbps, newbps / 1024 / 1024);
                        ofile << "p," << cur_time << "," << pct << std::endl;
                        ofile.flush();
                        rate_ctrl = 1;
                    }
                }
            }
        }
    }
    mgen->stop();
    delete mgen;
    ofile.close();
    return 0;
}

237
util/mornafah.c Normal file
View File

@ -0,0 +1,237 @@
#include <stdio.h>
#include <stdlib.h>
#include "nms.h"
#include <getopt.h>
#include <unistd.h>
#include <topo.h>
#include <immintrin.h>
#include <x86intrin.h>
#include <stdatomic.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <errno.h>
#include <stdint.h>
#include <sys/cpuset.h>
#include <sys/sysctl.h>
#include <pthread.h>
#include <pthread_np.h>
#define BUFFER_SIZE (128 * 1024 * 1024)
#define BUFFER_CNT (BUFFER_SIZE / sizeof(int))
// Handshake flag: local thread sets 1 to request line preparation,
// remote thread clears it back to 0 when done.
static _Atomic int flush = 0;
// Byte offset into remote_buffer of the line being measured.
static _Atomic uint64_t offset = 0;
// Buffer allocated on the remote thread's NUMA domain (nms_malloc in main).
static int * remote_buffer = NULL;
// One per-iteration latency sample, in TSC cycles.
static uint64_t * latencies;
static int times = 100;        // number of samples to take (-t)
static int local_core = 0;     // core for the timing thread (-l)
static int remote_core = 1;    // core for the preparing thread (-r)
static int cache_mode = 0;     // 1: remote reads line into its cache; 0: remote flushes it
static int verbose = 0;        // -v (repeatable)
static int random_access = 0;  // -R: randomize offset each iteration
// TSC frequency from machdep.tsc_freq sysctl; used by cyc2ns().
static uint64_t tsc_freq = 0;
// Convert a TSC cycle count to nanoseconds using the queried TSC frequency.
// NOTE(review): if the sysctl lookup in main() failed, tsc_freq is 0 and the
// result is undefined (division by zero in floating point).
static inline uint64_t cyc2ns(uint64_t cyc)
{
    const double cycles_per_ns = (double)tsc_freq / 1000000000.0;
    return (double)cyc / cycles_per_ns;
}
// Read the TSC with ordering guarantees: rdtscp does not execute until prior
// instructions have completed, and the trailing lfence keeps subsequent
// instructions from being hoisted above the timestamp read.
static inline uint64_t read_time(void)
{
	uint64_t l;
	unsigned int a; // IA32_TSC_AUX output of rdtscp; value discarded
	l = __rdtscp(&a);
	_mm_lfence();
	return l;
}
// Timing thread, pinned to local_core. Each iteration: signal the remote
// thread to prepare the target line (read-into-cache or flush, per
// cache_mode), flush the line locally, then time a single dependent load.
// The latency in cycles is recorded into latencies[].
static void * local_thread(void * arg)
{
	// temp is returned below; initialize so the return is defined even if
	// times <= 0 on entry (the original returned it uninitialized — UB)
	int temp = 0;
	int *addr;
	uint64_t start, end;
	(void)arg;
	printf("Local thread running...\n");
	while (times > 0) {
		if (random_access) {
			// change offset
			offset = (rand() % BUFFER_CNT) * sizeof(int);
		}
		// request preparation and spin until the remote thread acks
		flush = 1;
		while (flush != 0) {
		}
		addr = (int *)((char *)remote_buffer + offset);
		if (verbose > 1) {
			printf("Local thread(%d): flushing %p.\n", local_core, addr);
		}
		// evict the line locally so the timed load must miss
		_mm_clflush(addr);
		_mm_mfence();
		atomic_signal_fence(memory_order_seq_cst);
		start = read_time();
		temp = *addr; // the timed load
		end = read_time();
		atomic_signal_fence(memory_order_seq_cst);
		if (verbose > 1) {
			printf("Local thread(%d): read %p.\n", local_core, addr);
		}
		latencies[times - 1] = end - start;
		times--;
	}
	// return the loaded value so the compiler cannot elide the timed load
	return (void *)(uintptr_t)temp;
}
// Preparation thread, pinned to remote_core. Waits for the local thread's
// signal, then either pulls the target line into this core's cache
// (cache_mode != 0) or flushes it from all caches (cache_mode == 0), and
// acknowledges by clearing the flag. Loops forever; killed at process exit.
static void * remote_thread(void * arg)
{
	// initialized: only assigned on the cache_mode path (original left it
	// indeterminate otherwise)
	int temp = 0;
	int * addr;
	(void)arg;
	printf("Remote thread running...\n");
	while (1) {
		// wait for a preparation request
		while (flush == 0) {
		}
		addr = (int *)((char *)remote_buffer + offset);
		if (cache_mode) {
			temp = *addr; // bring the line into this core's cache
			_mm_mfence();
		} else {
			_mm_clflush(addr); // push the line back to memory
			_mm_mfence();
		}
		if (verbose > 1) {
			printf("Remote thread(%d): %p %s.\n", remote_core, addr, cache_mode ? "read into cache" : "flushed");
		}
		flush = 0; // ack
	}
	// unreachable; returning temp keeps the cache_mode load from being dead
	return (void *)(uintptr_t)temp;
}
int main(int argc, char * argv[])
{
{
int c;
// parse arguments
while ((c = getopt(argc, argv, "l:r:t:vR")) != -1) {
switch (c) {
case 'l':
local_core = atoi(optarg);
break;
case 'r':
remote_core = atoi(optarg);
break;
case 't':
times = atoi(optarg);
break;
case 'R':
random_access = 1;
break;
case 'v':
verbose++;
break;
default:
exit(1);
}
}
}
srand(time(NULL));
// init topo
if (topo_init(1)) {
fprintf(stderr, "libtopo init failed!\n");
exit(1);
}
// init
if (nms_init(1)) {
fprintf(stderr, "libnms init failed!\n");
exit(1);
}
size_t sz = sizeof(tsc_freq);
int rc;
if ((rc = sysctlbyname("machdep.tsc_freq", &tsc_freq, &sz, NULL, 0)) < 0) {
fprintf(stderr,"failed to query tsc frequency via sysctl (%d)\n", errno);
} else {
fprintf(stdout,"system tsc frequency = %lu\n", tsc_freq);
}
latencies = malloc(sizeof(uint64_t) * times);
const int remote_numa = topo_core_to_numa(remote_core);
const int local_numa = topo_core_to_numa(local_core);
const int total = times;
remote_buffer = nms_malloc(remote_numa, BUFFER_SIZE);
// fill with random values
for (int i = 0; i < BUFFER_SIZE; i++) {
remote_buffer[i] = rand();
}
pthread_attr_t lattr, rattr;
pthread_t lthread, rthread;
cpuset_t lcpuset, rcpuset;
CPU_ZERO(&lcpuset);
CPU_ZERO(&rcpuset);
CPU_SET(local_core, &lcpuset);
CPU_SET(remote_core, &rcpuset);
pthread_attr_init(&rattr);
pthread_attr_setaffinity_np(&rattr, sizeof(cpuset_t), &rcpuset);
pthread_attr_init(&lattr);
pthread_attr_setaffinity_np(&lattr, sizeof(cpuset_t), &lcpuset);
printf("local thread: %d numa: %d, remote: %d numa: %d\n", local_core, local_numa, remote_core, remote_numa);
pthread_create(&lthread, &lattr, local_thread, NULL);
pthread_create(&rthread, &rattr, remote_thread, NULL);
pthread_join(lthread, NULL);
uint64_t min = UINT64_MAX;
uint64_t max = 0;
uint64_t sum = 0;
for (int i = total - 1; i >= 0; i--) {
if (verbose) {
printf("%lu,\n", latencies[i]);
}
if (min > latencies[i]) {
min = latencies[i];
}
if (max < latencies[i]) {
max = latencies[i];
}
sum += latencies[i];
}
double var = 0.0;
double avg = (double)sum / (double)total;
for (int i = total - 1; i >= 0; i--) {
var += pow(latencies[i] - avg, 2);
}
var = sqrt(var / avg);
printf("Avg: %lu cycles (%lu ns)\n"
"Std: %lu cycles (%lu ns)\n"
"Min: %lu cycles (%lu ns)\n"
"Max: %lu cycles (%lu ns)\n",
(uint64_t)avg, cyc2ns((uint64_t)avg),
(uint64_t)var, cyc2ns((uint64_t)var),
min, cyc2ns(min),
max, cyc2ns(max));
free(latencies);
return 0;
}