sandybridge doesn't support clflushopt

fix dpdk
akh morn
2023-12-06 04:22:46 +08:00 · 2023-12-06 03:38:32 +08:00 · 2023-12-06 03:23:00 +08:00 · 2023-05-01 15:28:51 -04:00 · 2023-05-01 21:18:34 +02:00 · 2023-03-29 22:00:59 +02:00
55 changed files with 9183 additions and 1226 deletions
--- a/.arcconfig
+++ b/.arcconfig
@ -0,0 +1,3 @@
+{
+  "phabricator.uri" : "https://review.rcs.uwaterloo.ca/"
+}
--- a/.clang-format
+++ b/.clang-format
@ -0,0 +1,198 @@
+# $FreeBSD$
+# Basic .clang-format
+---
+BasedOnStyle: WebKit
+AlignAfterOpenBracket: DontAlign
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Left
+AlignOperands: false
+AlignTrailingComments: true
+AllowAllArgumentsOnNextLine: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: InlineOnly
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: TopLevelDefinitions
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: MultiLine
+BinPackArguments: true
+BinPackParameters: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: WebKit
+BreakBeforeTernaryOperators: false
+# TODO: BreakStringLiterals can cause very strange formatting so turn it off?
+BreakStringLiterals: false
+# Prefer:
+# some_var = function(arg1,
+#    arg2)
+# over:
+# some_var =
+#     function(arg1, arg2)
+PenaltyBreakAssignment: 100
+# Prefer:
+# some_long_function(arg1, arg2,
+#     arg3)
+# over:
+# some_long_function(
+#     arg1, arg2, arg3)
+PenaltyBreakBeforeFirstCallParameter: 100
+CompactNamespaces: true
+DerivePointerAlignment: false
+DisableFormat: false
+ForEachMacros:
+  - ARB_ARRFOREACH
+  - ARB_ARRFOREACH_REVWCOND
+  - ARB_ARRFOREACH_REVERSE
+  - ARB_FOREACH
+  - ARB_FOREACH_FROM
+  - ARB_FOREACH_SAFE
+  - ARB_FOREACH_REVERSE
+  - ARB_FOREACH_REVERSE_FROM
+  - ARB_FOREACH_REVERSE_SAFE
+  - BIT_FOREACH_ISCLR
+  - BIT_FOREACH_ISSET
+  - CPU_FOREACH
+  - CPU_FOREACH_ISCLR
+  - CPU_FOREACH_ISSET
+  - FOREACH_THREAD_IN_PROC
+  - FOREACH_PROC_IN_SYSTEM
+  - FOREACH_PRISON_CHILD
+  - FOREACH_PRISON_DESCENDANT
+  - FOREACH_PRISON_DESCENDANT_LOCKED
+  - FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL
+  - MNT_VNODE_FOREACH_ALL
+  - MNT_VNODE_FOREACH_ACTIVE
+  - RB_FOREACH
+  - RB_FOREACH_FROM
+  - RB_FOREACH_SAFE
+  - RB_FOREACH_REVERSE
+  - RB_FOREACH_REVERSE_FROM
+  - RB_FOREACH_REVERSE_SAFE
+  - SLIST_FOREACH
+  - SLIST_FOREACH_FROM
+  - SLIST_FOREACH_FROM_SAFE
+  - SLIST_FOREACH_SAFE
+  - SLIST_FOREACH_PREVPTR
+  - SPLAY_FOREACH
+  - LIST_FOREACH
+  - LIST_FOREACH_FROM
+  - LIST_FOREACH_FROM_SAFE
+  - LIST_FOREACH_SAFE
+  - STAILQ_FOREACH
+  - STAILQ_FOREACH_FROM
+  - STAILQ_FOREACH_FROM_SAFE
+  - STAILQ_FOREACH_SAFE
+  - TAILQ_FOREACH
+  - TAILQ_FOREACH_FROM
+  - TAILQ_FOREACH_FROM_SAFE
+  - TAILQ_FOREACH_REVERSE
+  - TAILQ_FOREACH_REVERSE_FROM
+  - TAILQ_FOREACH_REVERSE_FROM_SAFE
+  - TAILQ_FOREACH_REVERSE_SAFE
+  - TAILQ_FOREACH_SAFE
+  - VM_MAP_ENTRY_FOREACH
+  - VM_PAGE_DUMP_FOREACH
+IndentCaseLabels: false
+IndentPPDirectives: None
+Language: Cpp
+NamespaceIndentation: None
+PointerAlignment: Right
+ContinuationIndentWidth: 4
+IndentWidth: 8
+TabWidth: 8
+ColumnLimit: 80
+UseTab: Always
+SpaceAfterCStyleCast: false
+IncludeBlocks: Regroup
+IncludeCategories:
+  - Regex: '^\"opt_.*\.h\"'
+    Priority: 1
+    SortPriority: 10
+  - Regex: '^<sys/cdefs\.h>'
+    Priority: 2
+    SortPriority: 20
+  - Regex: '^<sys/types\.h>'
+    Priority: 2
+    SortPriority: 21
+  - Regex: '^<sys/param\.h>'
+    Priority: 2
+    SortPriority: 22
+  - Regex: '^<sys/systm\.h>'
+    Priority: 2
+    SortPriority: 23
+  - Regex: '^<sys.*/'
+    Priority: 2
+    SortPriority: 24
+  - Regex: '^<vm/vm\.h>'
+    Priority: 3
+    SortPriority: 30
+  - Regex: '^<vm/'
+    Priority: 3
+    SortPriority: 31
+  - Regex: '^<machine/'
+    Priority: 4
+    SortPriority: 40
+  - Regex: '^<(x86|amd64|i386|xen)/'
+    Priority: 5
+    SortPriority: 50
+  - Regex: '^<dev/'
+    Priority: 6
+    SortPriority: 60
+  - Regex: '^<net.*/'
+    Priority: 7
+    SortPriority: 70
+  - Regex: '^<protocols/'
+    Priority: 7
+    SortPriority: 71
+  - Regex: '^<(fs|nfs(|client|server)|ufs)/'
+    Priority: 8
+    SortPriority: 80
+  - Regex: '^<[^/].*\.h'
+    Priority: 9
+    SortPriority: 90
+  - Regex: '^\".*\.h\"'
+    Priority: 10
+    SortPriority: 100
+# LLVM's header include ordering style is almost the exact opposite of ours.
+# Unfortunately, they have hard-coded their preferences into clang-format.
+# Clobbering this regular expression to avoid matching prevents non-system
+# headers from being forcibly moved to the top of the include list.
+# http://llvm.org/docs/CodingStandards.html#include-style
+IncludeIsMainRegex: 'BLAH_DONT_MATCH_ANYTHING'
+SortIncludes: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+TypenameMacros:
+  - ARB_ELMTYPE
+  - ARB_HEAD
+  - ARB8_HEAD
+  - ARB16_HEAD
+  - ARB32_HEAD
+  - ARB_ENTRY
+  - ARB8_ENTRY
+  - ARB16_ENTRY
+  - ARB32_ENTRY
+  - LIST_CLASS_ENTRY
+  - LIST_CLASS_HEAD
+  - LIST_ENTRY
+  - LIST_HEAD
+  - QUEUE_TYPEOF
+  - RB_ENTRY
+  - RB_HEAD
+  - SLIST_CLASS_HEAD
+  - SLIST_CLASS_ENTRY
+  - SLIST_HEAD
+  - SLIST_ENTRY
+  - SMR_POINTER
+  - SPLAY_ENTRY
+  - SPLAY_HEAD
+  - STAILQ_CLASS_ENTRY
+  - STAILQ_CLASS_HEAD
+  - STAILQ_ENTRY
+  - STAILQ_HEAD
+  - TAILQ_CLASS_ENTRY
+  - TAILQ_CLASS_HEAD
+  - TAILQ_ENTRY
+  - TAILQ_HEAD
--- a/.clang-tidy
+++ b/.clang-tidy
@ -0,0 +1 @@
+Checks: "-*,clang-diagnostic-*,clang-analyzer-*,modernize*,performance*,-modernize-use-trailing-return-type,-modernize-avoid-c-arrays"
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,274 @@
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+################ C STUFF ##########################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+
+# Prerequisites
+*.d
+
+# Object files
+*.o
+*.ko
+*.obj
+*.elf
+
+# Linker output
+*.ilk
+*.map
+*.exp
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Libraries
+*.lib
+*.a
+*.la
+*.lo
+
+# Shared objects (inc. Windows DLLs)
+*.dll
+*.so
+*.so.*
+*.dylib
+
+# Executables
+*.exe
+*.out
+*.app
+*.i*86
+*.x86_64
+*.hex
+
+# Debug files
+*.dSYM/
+*.su
+*.idb
+*.pdb
+
+# Kernel Module Compile Results
+*.mod*
+*.cmd
+.tmp_versions/
+modules.order
+Module.symvers
+Mkfile.old
+dkms.conf
+
+
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+################ PYTHON STUFF ##########################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+################ C++ STUFF ##########################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+########################################################
+# Prerequisites
+*.d
+
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+*.smod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+*.clangd
+compile_commands.json
--- a/.gitmodules
+++ b/.gitmodules
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,33 +1,86 @@
 cmake_minimum_required(VERSION 3.0)

-find_program(CC_GCC gcc)
-find_program(CXX_GCC g++)
-
-set(CMAKE_C_COMPILER ${CC_GCC})
-set(CMAKE_CXX_COMPILER ${CXX_GCC})
-
 project(khat)

 list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}")
-find_package(dpdk REQUIRED)
+find_package(PkgConfig REQUIRED)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY lib)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY bin)
+
+# DPDK and SPDK are probed but left optional: a missing package leaves the
+# corresponding <PREFIX>_* variables empty, and only the targets that use
+# them fail to build.
+pkg_check_modules(DPDK libdpdk)
+pkg_check_modules(SPDK spdk_event_bdev spdk_env_dpdk)
+pkg_check_modules(SPDK_SYS spdk_syslibs)
+pkg_check_modules(UUID uuid)
+# bsdtopo is linked by gen, nms, memloadgen and mornafah, all of which are in
+# the default build, so report a missing package at configure time instead of
+# as compile/link errors later.
+pkg_check_modules(TOPO REQUIRED bsdtopo)

 set(CC_FLAGS -O2 -g -Wall -Wextra -Werror -std=c++11 
        -Wno-deprecated-declarations 
-        -Wno-packed-not-aligned
        -Wno-address-of-packed-member
-        -msse4)
+        -Wno-zero-length-array
+        -Wno-gnu-zero-variadic-macro-arguments
+        -march=native)
+
+set(C_FLAGS -O2 -g -Wall -Wextra -Werror -std=c2x
+        -Wno-deprecated-declarations 
+        -Wno-address-of-packed-member
+        -Wno-zero-length-array
+        -Wno-gnu-zero-variadic-macro-arguments
+        -march=native)
+

 include_directories(${CMAKE_SOURCE_DIR}/inc)
-include_directories(${dpdk_INCLUDE_DIRS})
+include_directories()

-add_executable(khat khat/khat.cc)
-add_executable(cat cat/cat.cc)
+set(LIBNTR_C_FLAGS -O3 -g -Wall -Wextra -Werror -std=c2x)
+set(LIBGEN_CC_FLAGS -O3 -g -Wall -Wextra -Werror -std=c++17)

-set(LINK_LIBS ${dpdk_LIBRARIES} pthread)
+add_library(ntr SHARED libntr/ntr.c)
+target_compile_options(ntr PRIVATE ${LIBNTR_C_FLAGS})

-target_link_libraries(khat ${LINK_LIBS})
-target_compile_options(khat PRIVATE ${CC_FLAGS})
+add_library(gen SHARED libgen/generator.cc libgen/loadgen.cc)
+target_link_libraries(gen PRIVATE pthread ntr ${TOPO_LINK_LIBRARIES} nms)
+target_compile_options(gen PRIVATE ${LIBGEN_CC_FLAGS} ${TOPO_CFLAGS})

-target_link_libraries(cat ${LINK_LIBS})
-target_compile_options(cat PRIVATE ${CC_FLAGS})
+add_library(netsup SHARED net/libnetsup/dpdk.cc net/libnetsup/portconf.cc)
+target_link_libraries(netsup PRIVATE ntr ${DPDK_LINK_LIBRARIES})
+target_compile_options(netsup PRIVATE ${LIBGEN_CC_FLAGS} ${DPDK_CFLAGS})

+add_library(nms SHARED libnms/alloc.c)
+target_link_libraries(nms PRIVATE ${TOPO_LINK_LIBRARIES})
+target_compile_options(nms PRIVATE ${TOPO_CFLAGS})
+
+add_executable(khat EXCLUDE_FROM_ALL net/khat.cc)
+target_link_libraries(khat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES})
+target_compile_options(khat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS})
+
+add_executable(cat EXCLUDE_FROM_ALL net/cat.cc)
+target_link_libraries(cat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES})
+target_compile_options(cat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS})
+
+add_executable(rat EXCLUDE_FROM_ALL net/rat.cc)
+target_link_libraries(rat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES})
+target_compile_options(rat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS})
+
+add_executable(birb EXCLUDE_FROM_ALL storage/birb.cc storage/io_gen.cc storage/drivers/bdev.cc storage/drivers/bdev_thread.cc storage/drivers/nvme.cc storage/drivers/nvme_thread.cc)
+target_include_directories(birb PRIVATE ${SPDK_INCLUDE_DIRS} ${DPDK_INCLUDE_DIRS} ${UUID_INCLUDE_DIRS})
+target_compile_options(birb PRIVATE ${CC_FLAGS} ${SPDK_CFLAGS} ${UUID_CFLAGS})
+target_link_directories(birb PRIVATE ${SPDK_LIBRARY_DIRS} ${SPDK_SYS_STATIC_LIBRARY_DIRS} ${UUID_LIBRARY_DIRS})
+target_link_libraries(birb PRIVATE pthread ntr gen -Wl,--whole-archive ${SPDK_LIBRARIES} -Wl,--no-whole-archive ${SPDK_SYS_STATIC_LIBRARIES})
+
+add_executable(birb_posix EXCLUDE_FROM_ALL storage/birb_posix.cc storage/io_gen.cc)
+target_compile_options(birb_posix PRIVATE ${CC_FLAGS})
+target_link_libraries(birb_posix PRIVATE pthread ntr gen)
+
+add_executable(memloadgen util/memloadgen.cc)
+target_link_libraries(memloadgen PRIVATE pthread gen ntr nms ${TOPO_LINK_LIBRARIES})
+target_compile_options(memloadgen PRIVATE ${CC_FLAGS} ${TOPO_CFLAGS})
+
+add_executable(mornafah util/mornafah.c)
+target_link_libraries(mornafah PRIVATE pthread gen ntr nms ${TOPO_LINK_LIBRARIES})
+target_compile_options(mornafah PRIVATE ${C_FLAGS} ${TOPO_CFLAGS})
+
+add_executable(nms_test tests/nms_test.c)
+set_target_properties(nms_test PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tests)
+target_link_libraries(nms_test PRIVATE nms)
+target_compile_options(nms_test PRIVATE ${C_FLAGS})
--- a/Finddpdk.cmake
+++ b/Finddpdk.cmake
@ -1,142 +0,0 @@
-# Try to find dpdk
-#
-# Once done, this will define
-#
-# dpdk::dpdk
-# dpdk_FOUND
-# dpdk_INCLUDE_DIR
-# dpdk_LIBRARIES
-
-find_package(PkgConfig QUIET)
-if(PKG_CONFIG_FOUND)
-  pkg_check_modules(dpdk QUIET libdpdk)
-endif()
-
-if(dpdk_INCLUDE_DIRS)
-  # good
-elseif(TARGET dpdk::dpdk)
-  get_target_property(dpdk_INCLUDE_DIRS
-     dpdk::dpdk INTERFACE_INCLUDE_DIRECTORIES)
-else()
-  find_path(dpdk_config_INCLUDE_DIR rte_config.h
-    HINTS
-      ENV DPDK_DIR
-    PATH_SUFFIXES
-      dpdk
-      include)
-  find_path(dpdk_common_INCLUDE_DIR rte_common.h
-    HINTS
-      ENC DPDK_DIR
-    PATH_SUFFIXES
-      dpdk
-      include)
-  set(dpdk_INCLUDE_DIRS "${dpdk_config_INCLUDE_DIR}")
-  if(NOT dpdk_config_INCLUDE_DIR EQUAL dpdk_common_INCLUDE_DIR)
-    list(APPEND dpdk_INCLUDE_DIRS "${dpdk_common_INCLUDE_DIR}")
-  endif()
-endif()
-
-set(components
-  bus_pci
-  bus_vdev
-  cfgfile
-  cmdline
-  eal
-  ethdev
-  hash
-  kvargs
-  mbuf
-  mempool
-  mempool_ring
-  mempool_stack
-  net
-  pci
-  pmd_af_packet
-  pmd_bnxt
-  pmd_bond
-  pmd_cxgbe
-  pmd_e1000
-  pmd_ena
-  pmd_enic
-  pmd_i40e
-  pmd_ixgbe
-  pmd_mlx5
-  pmd_nfp
-  pmd_qede
-  pmd_ring
-  pmd_sfc_efx
-  pmd_vmxnet3_uio
-  ring
-  timer)
-
-# for collecting dpdk library targets, it will be used when defining dpdk::dpdk
-set(_dpdk_libs)
-# for list of dpdk library archive paths
-set(dpdk_LIBRARIES)
-
-foreach(c ${components})
-  set(dpdk_lib dpdk::${c})
-  if(TARGET ${dpdk_lib})
-    get_target_property(DPDK_rte_${c}_LIBRARY
-      ${dpdk_lib} IMPORTED_LOCATION)
-  else()
-    find_library(DPDK_rte_${c}_LIBRARY rte_${c}
-      HINTS
-        ENV DPDK_DIR
-        ${dpdk_LIBRARY_DIRS}
-        PATH_SUFFIXES lib)
-  endif()
-  if(DPDK_rte_${c}_LIBRARY)
-    if (NOT TARGET ${dpdk_lib})
-      add_library(${dpdk_lib} UNKNOWN IMPORTED)
-      set_target_properties(${dpdk_lib} PROPERTIES
-        INTERFACE_INCLUDE_DIRECTORIES "${dpdk_INCLUDE_DIRS}"
-        IMPORTED_LOCATION "${DPDK_rte_${c}_LIBRARY}")
-      if(c STREQUAL pmd_mlx5)
-        find_package(verbs QUIET)
-        if(verbs_FOUND)
-          target_link_libraries(${dpdk_lib} INTERFACE IBVerbs::verbs)
-        endif()
-      endif()
-    endif()
-    list(APPEND _dpdk_libs ${dpdk_lib})
-    list(APPEND dpdk_LIBRARIES ${DPDK_rte_${c}_LIBRARY})
-  endif()
-endforeach()
-
-mark_as_advanced(dpdk_INCLUDE_DIRS ${dpdk_LIBRARIES})
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(dpdk DEFAULT_MSG
-  dpdk_INCLUDE_DIRS
-  dpdk_LIBRARIES)
-
-if(dpdk_FOUND)
-  if(NOT TARGET dpdk::cflags)
-     if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64|AMD64")
-      set(rte_cflags "-march=core2")
-    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm|ARM")
-      set(rte_cflags "-march=armv7-a")
-    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
-      set(rte_cflags "-march=armv8-a+crc")
-    endif()
-    add_library(dpdk::cflags INTERFACE IMPORTED)
-    if (rte_cflags)
-      set_target_properties(dpdk::cflags PROPERTIES
-        INTERFACE_COMPILE_OPTIONS "${rte_cflags}")
-    endif()
-  endif()
-
-  if(NOT TARGET dpdk::dpdk)
-    add_library(dpdk::dpdk INTERFACE IMPORTED)
-    find_package(Threads QUIET)
-    list(APPEND _dpdk_libs
-      Threads::Threads
-      dpdk::cflags)
-    set_target_properties(dpdk::dpdk PROPERTIES
-      INTERFACE_LINK_LIBRARIES "${_dpdk_libs}"
-      INTERFACE_INCLUDE_DIRECTORIES "${dpdk_INCLUDE_DIRS}")
-  endif()
-endif()
-
-unset(_dpdk_libs)
--- a/cat/cat.cc
+++ b/cat/cat.cc
@ -1,444 +0,0 @@
-#include <cstdio>
-#include <ctime>
-#include <netinet/in.h>
-#include <rte_config.h>
-#include <rte_common.h>
-#include <rte_eal.h>
-#include <rte_ethdev.h>
-#include <rte_cycles.h>
-#include <rte_lcore.h>
-#include <rte_mbuf.h>
-#include <rte_ether.h>
-#include <rte_launch.h>
-#include <rte_log.h>
-#include <atomic>
-#include <vector>
-#include <fstream>
-#include <unistd.h>
-
-#include "ntrlog.h"
-#include "pkt.h"
-#include "rte_byteorder.h"
-#include "rte_ip.h"
-
-// init NTRLOG
-NTR_DECL_IMPL;
-
-constexpr unsigned int MBUF_MAX_COUNT = 8191;
-constexpr unsigned int MBUF_CACHE_SIZE = 250;
-constexpr unsigned int RX_RING_SIZE = 1024;
-constexpr unsigned int TX_RING_SIZE = 1024;
-constexpr unsigned int RX_RING_NUM = 1;
-constexpr unsigned int TX_RING_NUM = 1;
-constexpr unsigned int BURST_SIZE = 32;
-
-static const struct rte_eth_conf port_conf_default{};
-
-struct datapt{
-    uint64_t server_proc = 0;
-    uint64_t rtt = 0;
-};
-
-struct options_t {
-    unsigned int run_time = 5;
-    unsigned int warmup_time = 0;
-    char output[256] = "output.txt";
-    struct rte_ether_addr server_mac;
-    // states
-    std::atomic<bool> s_stop {false};
-    std::atomic<bool> s_record {false};
-    std::vector<struct datapt *> s_stats;
-    struct rte_mempool * s_mbuf_pool;
-    uint16_t s_portid;
-    struct rte_ether_addr s_host_mac;
-};
-
-struct options_t options;
-
-static uint16_t
-rx_calc_latency(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
-        struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused, void *_ __rte_unused)
-{
-    // XXX: need to get the timestamp in every loop?
-    uint64_t now = rte_rdtsc();
-    struct packet_data * pkt_data;
-
-    for (int i = 0; i < nb_pkts; i++) {
-        pkt_data = check_valid_packet(pkts[i]);
-
-        if (pkt_data == NULL) {
-            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "rx_calc_latency: ignoring invalid packet 0x%p.\n", (void*)pkts[i]);
-            continue;  
-        }
-        
-        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "rx_calc_latency: tagged packet %p with %llu.\n", (void*)pkts[i], now); 
-        pkt_data->clt_ts_rx = rte_cpu_to_be_64(now);
-    }
-
-    return nb_pkts;
-}
-
-static uint16_t
-tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
-		struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
-{
-    // XXX: need to get the timestamp in every loop?
-    uint64_t now = rte_rdtsc();
-    struct packet_data * pkt_data;
-
-    for (int i = 0; i < nb_pkts; i++) {
-        pkt_data = check_valid_packet(pkts[i]);
-
-        if (pkt_data == NULL) {
-            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "tx_add_timestamp: ignoring invalid packet 0x%p.\n", (void*)pkts[i]);
-            continue;  
-        }
-
-        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "tx_add_timestamp: tagged packet %p with %llu.\n", (void*)pkts[i], now); 
-        pkt_data->clt_ts_tx = rte_cpu_to_be_64(now);
-    }
-
-    return nb_pkts;
-}
-
-#define STATE_SEND (0)
-#define STATE_RECV (1)
-
-static int
-locore_main(void * _unused __rte_unused)
-{
-    struct rte_mbuf *tx_buf;
-    struct rte_mbuf *rx_bufs[BURST_SIZE];
-    struct packet_data *pkt_data;
-    uint32_t core_id = rte_lcore_id();
-    uint32_t epoch = 0;
-    int state = STATE_SEND;
-
-    // XXX: check link status instead
-
-    sleep(1);
-    if (rte_eth_dev_socket_id(options.s_portid) > 0 && rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) {
-        ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "locore_main: WARNING, port %d is on remote NUMA node to "
-                "polling thread.\n\tPerformance will "
-                "not be optimal.\n", options.s_portid);
-    }
-
-	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d running...\n", core_id);
-
-    tx_buf = rte_pktmbuf_alloc(options.s_mbuf_pool);
-
-    if (tx_buf == NULL) {
-        rte_exit(EXIT_FAILURE, "cannot allocate tx_buf\n");
-    }
-
-    pkt_data = construct_udp_pkt_hdr(tx_buf, 
-                                    &options.s_host_mac, &options.server_mac, 
-                                    RTE_IPV4(192, 168, 100, 150), RTE_IPV4(192, 168, 100, 151), 
-                                    1337, 1337);
-    if (pkt_data == NULL) {
-        rte_exit(EXIT_FAILURE, "cannot allocate space for packet_data in mbuf\n");
-    }
-    pkt_data->magic = rte_cpu_to_be_32(ETHER_FRAME_MAGIC);
-
-    while(!options.s_stop.load()) {
-        // always pop incoming packets
-        const uint16_t nb_rx = rte_eth_rx_burst(options.s_portid, 0, rx_bufs, BURST_SIZE);
-
-        if (nb_rx != 0) {
-            // only process packets when we are ready to receive
-            for (int i = 0; i < nb_rx; i++) {
-                struct packet_data * each = check_valid_packet(rx_bufs[i]);
-                
-                if (each == NULL) {
-                    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: ignoring invalid packet %p.\n", (void*)rx_bufs[i]);
-                    dump_pkt(rx_bufs[i]);
-                    rte_pktmbuf_free(rx_bufs[i]);
-                    continue;
-                }
-
-                if (rte_be_to_cpu_32(each->epoch) == epoch && state == STATE_RECV) {
-                   ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: received packet %p for epoch %d\n", (void*)rx_bufs[i], epoch);
-
-                    if (options.s_record.load()) {
-                        // keep statistics
-                        struct datapt * dpt = new datapt;
-                        dpt->rtt = rte_be_to_cpu_64(each->clt_ts_rx) - rte_be_to_cpu_64(each->clt_ts_tx);
-                        dpt->server_proc = rte_be_to_cpu_64(each->srv_ts_tx) - rte_be_to_cpu_64(each->srv_ts_rx);
-                        options.s_stats.push_back(dpt);
-                    }
-
-                    // bump the epoch and stop processing other packets
-                    state = STATE_SEND;
-                    epoch++;
-                } else {
-                    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: ignoring packet 0x%p with invalid epoch %d.\n", (void*)rx_bufs[i], epoch);
-                }
-
-                rte_pktmbuf_free(rx_bufs[i]);
-            }
-        }
-
-        if (state == STATE_SEND) {
-            // set new epoch
-            pkt_data->epoch = rte_cpu_to_be_32(epoch);
-            ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: sending packet %p with epoch %d\n", (void*)tx_buf, epoch);
-
-            const uint16_t nb_tx = rte_eth_tx_burst(options.s_portid, 0, &tx_buf, 1);
-
-            if (nb_tx < 1) {
-                rte_exit(EXIT_FAILURE, "failed to send packet 0x%p, epoch %d\n", (void*)tx_buf, epoch);
-            }
-            state = STATE_RECV;
-        }
-	}
-    
-    rte_pktmbuf_free(tx_buf);
-
-    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d successfully stopped.\n", core_id);
-
-    return 0;
-}
-
-static int 
-port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
-{
-    struct rte_eth_dev_info dev_info;
-    struct rte_eth_conf port_conf = port_conf_default;
-    struct rte_eth_txconf txconf;
-    struct rte_eth_rxconf rxconf;
-
-    uint16_t nb_rxd = RX_RING_SIZE;
-	uint16_t nb_txd = TX_RING_SIZE; 
-    port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN;
-
-    if(!rte_eth_dev_is_valid_port(portid)) {
-        return -1;
-    }
-
-    int ret = rte_eth_dev_info_get(portid, &dev_info);
-    if (ret != 0) {
-        return ret;
-    }
-
-    port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN;
-    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM;
-    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM;
-    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
-    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
-    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;
-    
-    /* Configure the Ethernet device. */
-    ret = rte_eth_dev_configure(portid, RX_RING_NUM, TX_RING_NUM, &port_conf);
-	if (ret != 0)
-		return ret;
-
-	ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
-	if (ret != 0)
-		return ret;
-
-	/* Allocate and set up 1 RX queue per Ethernet port. */
-    rxconf = dev_info.default_rxconf;
-    rxconf.offloads = port_conf.rxmode.offloads;
-	for (uint32_t i = 0; i < RX_RING_NUM; i++) {
-		ret = rte_eth_rx_queue_setup(portid, i, nb_rxd, rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool);
-		if (ret < 0)
-			return ret;
-	}
-
-    txconf = dev_info.default_txconf;
-	txconf.offloads = port_conf.txmode.offloads;
-	/* Allocate and set up 1 TX queue per Ethernet port. */
-	for (uint32_t i = 0; i < TX_RING_NUM; i++) {
-		ret = rte_eth_tx_queue_setup(portid, i, nb_txd, rte_eth_dev_socket_id(portid), &txconf);
-		if (ret < 0)
-			return ret;
-	}
-
-    ret = rte_eth_dev_start(portid);
-    if (ret < 0)
-        return ret;
-
-	/* Display the port MAC address. */
-    struct rte_ether_addr addr;
-    ret = rte_eth_macaddr_get(portid, &addr);
-    if (ret != 0)
-        return ret;
-
-    /* Enable RX in promiscuous mode for the Ethernet device. */
-    ret = rte_eth_promiscuous_enable(portid);
-	if (ret != 0)
-		return ret;
-
-    rte_eth_add_tx_callback(portid, 0, tx_add_timestamp, NULL);
-    rte_eth_add_rx_callback(portid, 0, rx_calc_latency, NULL);
-
-	return 0;
-}
-
-static void dump_options()
-{
-    fprintf(stdout, "Configuration:\n" \
-            "    run time = %d\n" \
-            "    warmup time = %d\n" \
-            "    output file = %s\n" \
-            "    server MAC = %x:%x:%x:%x:%x:%x\n",
-            options.run_time,
-            options.warmup_time,
-            options.output,
-            options.server_mac.addr_bytes[0],
-            options.server_mac.addr_bytes[1],
-            options.server_mac.addr_bytes[2],
-            options.server_mac.addr_bytes[3],
-            options.server_mac.addr_bytes[4],
-            options.server_mac.addr_bytes[5]);
-}
-
-static void usage()
-{
-    fprintf(stdout, 
-            "Usage:\n " \
-            "    -v(vv): verbose mode\n" \
-            "    -h: display the information\n" \
-            "    -o: output filename\n" \
-            "    -t: run time\n" \
-            "    -T: warmup time\n" \
-            "    -s: server's mac\n\n" );
-}
-
-int main(int argc, char* argv[])
-{
-    unsigned int nb_ports;
-    struct rte_mempool *mbuf_pool, *mbuf_pool_pkt;
-    std::ofstream log_file;
-
-    // init dpdk
-    int ret = rte_eal_init(argc, argv);
-    if (ret < 0) {
-        rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n");
-    }
-
-    argc -= ret;
-    argv += ret;
-
-    // set warning level
-    ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
-    {
-        int c;
-        // parse arguments
-        while((c = getopt(argc, argv, "hvo:t:T:s:")) != -1) {
-            switch (c) {
-                case 'v':
-                    ntr_set_level(NTR_DEP_USER1, ntr_get_level(NTR_DEP_USER1) + 1);
-                    break;
-                case 's':
-                    if (rte_ether_unformat_addr(optarg, &options.server_mac) == -1) {
-                        rte_exit(EXIT_FAILURE, "cannot parse %s as mac address.\n", optarg);
-                    }
-                    break;
-                case 't':
-                    options.run_time = atoi(optarg);
-                    break;
-                case 'T':
-                    options.warmup_time = atoi(optarg);
-                    break;
-                case 'h':
-                    usage();
-                    rte_exit(EXIT_SUCCESS, NULL);
-                    break;
-                case 'o':
-                    strncpy(options.output, optarg, sizeof(options.output) - 1);
-                    break;
-                default:
-                    usage();
-                    rte_exit(EXIT_FAILURE, "unknown argument: %c\n", c);
-                    break;
-            }
-        }
-    }
-
-    // open log file for writing
-    log_file.open(options.output, std::ofstream::out);
-    if (!log_file) {
-        rte_exit(EXIT_FAILURE, "failed to open log file %s\n", options.output);
-    }
-
-    nb_ports = rte_eth_dev_count_avail();
-    if (nb_ports == 0) {
-        rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
-    }
-
-    // create a mbuf memory pool on the socket
-    mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
-    if (mbuf_pool == nullptr) {
-        rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
-    }
-
-    mbuf_pool_pkt = rte_pktmbuf_pool_create("MBUF_POOL_PKT", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
-    if (mbuf_pool_pkt == nullptr) {
-        rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
-    }
-    options.s_mbuf_pool = mbuf_pool_pkt;
-
-    uint16_t portid = rte_eth_find_next(0);
-    if (portid == RTE_MAX_ETHPORTS) {
-        rte_exit(EXIT_FAILURE, "cannot find an available port\n");
-    }
-    options.s_portid = portid;
-
-    if (port_init(portid, mbuf_pool) != 0) {
-        rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid);
-    }
-
-    if (rte_eth_macaddr_get(portid, &options.s_host_mac) != 0) {
-        rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n", portid);
-    }
-
-    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid,
-                                                                                                options.s_host_mac.addr_bytes[0],
-                                                                                                options.s_host_mac.addr_bytes[1],
-                                                                                                options.s_host_mac.addr_bytes[2],
-                                                                                                options.s_host_mac.addr_bytes[3],
-                                                                                                options.s_host_mac.addr_bytes[4],
-                                                                                                options.s_host_mac.addr_bytes[5]);
-
-    dump_options();
-
-    uint16_t core_id = rte_get_next_lcore(0, true, false);
-    if (rte_eal_remote_launch(locore_main, NULL, core_id) != 0) {
-        rte_exit(EXIT_FAILURE, "failed to launch function on locore\n");
-    }
-
-    // poor man's timer
-    // XXX: use kqueue instead
-    struct timespec ts;
-    ts.tv_sec = 1;
-    ts.tv_nsec = 0;
-    uint32_t second = 0;
-    while(true) {
-        if (second >= options.warmup_time) {
-            options.s_record.store(true);
-        }
-        if (second >= options.run_time + options.warmup_time) {
-            options.s_stop.store(true);
-            break;
-        }
-        clock_nanosleep(CLOCK_REALTIME, 0, &ts, NULL);
-        second++;
-    }
-
-    if (rte_eal_wait_lcore(core_id) < 0)
-        rte_exit(EXIT_FAILURE, "failed to wait for job completion\n");
-
-    // dump stats
-    for (auto it = std::begin(options.s_stats); it != std::end(options.s_stats); ++it) {
-        log_file << (*it)->rtt << "," << (*it)->server_proc << std::endl;
-        delete *it;
-    }
-    log_file.close();
-
-    // clean up
-    rte_eth_dev_stop(portid);
-    rte_eth_dev_close(portid);
-
-    return 0;
-}
--- a/compile_flags.txt
+++ b/compile_flags.txt
@ -1,9 +0,0 @@
-xc++
-O2
-std=c++11
-Wall
-Werror
-Wpedantic
-I/usr/include/dpdk
-Iinc
-Wno-deprecated-declarations
--- a/inc/defs.hh
+++ b/inc/defs.hh
@ -0,0 +1,61 @@
+#pragma once
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <cstdio>
+#include <sys/types.h>
+#include <sys/cpuset.h>
+
+#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \
+	TypeName(const TypeName &) = delete; \
+	void operator=(const TypeName &) = delete
+
+#define UNUSED __attribute__((unused))
+
+constexpr static unsigned long S2NS = 1000000000UL;
+constexpr static unsigned long S2US = 1000000UL;
+constexpr static unsigned long MS2NS = 1000000UL;
+
+constexpr static int NEXT_CPU_NULL = -1;
+
+
+#if defined(__x86_64__)
+/*
+ * Pops the lowest set bit from *mask and returns its zero-based index.
+ * Returns -1 (the value of NEXT_CPU_NULL) and leaves *mask untouched when
+ * no bit is set.
+ */
+static inline int
+cmask_get_next_cpu(uint64_t *mask)
+{
+	const int bit = ffsll(*mask);
+
+	// ffsll() reports 0 for an empty mask; shifting by (0 - 1) would be
+	// undefined behaviour, so bail out before touching the mask.
+	if (bit == 0)
+		return -1; /* same value as NEXT_CPU_NULL */
+
+	*mask &= ~(1ULL << (bit - 1));
+	return bit - 1;
+}
+
+/* Returns the number of set bits (CPUs) in the 64-bit mask. */
+static inline int
+cmask_get_num_cpus(const uint64_t mask)
+{
+	// __builtin_popcount() takes an unsigned int and would silently drop
+	// the upper 32 bits of the mask; use the 64-bit variant.
+	return __builtin_popcountll(mask);
+}
+#endif
+
+// Current CLOCK_MONOTONIC reading expressed in nanoseconds.
+static inline uint64_t
+get_uptime()
+{
+	struct timespec now;
+
+	clock_gettime(CLOCK_MONOTONIC, &now);
+
+	// whole seconds scaled to ns, plus the sub-second remainder
+	return (now.tv_sec * S2NS + now.tv_nsec);
+}
+
+/*
+ * Parses a comma-separated list of CPU ids (e.g. "0,2,5") into *cpuset.
+ * cpulist is tokenised in place and is therefore modified. The set is
+ * always cleared first. Tokens that do not start with a number, or that
+ * name a CPU outside [0, CPU_SETSIZE), are skipped.
+ */
+static inline void
+cpulist_to_cpuset(char * cpulist, cpuset_t * cpuset)
+{
+	char * save = nullptr;
+
+	CPU_ZERO(cpuset);
+	if (cpulist == nullptr)
+		return;
+
+	// strtok_r keeps the tokeniser state local, so a caller that is in the
+	// middle of its own strtok() walk is not disturbed.
+	for (char * cpu = strtok_r(cpulist, ",", &save); cpu != nullptr;
+	     cpu = strtok_r(nullptr, ",", &save)) {
+		char * end = nullptr;
+		const long id = strtol(cpu, &end, 10);
+
+		// CPU_SET() with an out-of-range index writes outside the set,
+		// and a token without digits must not silently become CPU 0.
+		if (end == cpu || id < 0 || id >= CPU_SETSIZE)
+			continue;
+
+		CPU_SET((int)id, cpuset);
+	}
+}
+
+#define ATTR_UNUSED __attribute__((unused))
+
--- a/inc/gen.hh
+++ b/inc/gen.hh
@ -0,0 +1,346 @@
+// modified from mutilate
+// -*- c++ -*-
+
+// 1. implement "fixed" generator
+// 2. implement discrete generator
+// 3. implement combine generator?
+
+#pragma once
+
+#include <assert.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <sys/_pthreadtypes.h>
+#include <sys/param.h>
+
+#include "defs.hh"
+
+#define D(fmt, ...)
+#define DIE(fmt, ...) (void)0;
+
+#define FNV_64_PRIME (0x100000001b3ULL)
+#define FNV1_64_INIT (0xcbf29ce484222325ULL)
+// 64-bit FNV hash of an arbitrary byte buffer: starting from the offset
+// basis, every byte is XOR-ed in and the result multiplied by the prime.
+static inline uint64_t
+fnv_64_buf(const void *buf, size_t len)
+{
+	const unsigned char *bytes = (const unsigned char *)buf;
+	uint64_t hash = FNV1_64_INIT;
+
+	for (size_t i = 0; i < len; i++) {
+		hash = (hash ^ (uint64_t)bytes[i]) * FNV_64_PRIME;
+	}
+
+	return hash;
+}
+
+// Hashes the in-memory bytes of a single 64-bit integer.
+static inline uint64_t
+fnv_64(uint64_t in)
+{
+	const void *raw = &in;
+
+	return fnv_64_buf(raw, sizeof(uint64_t));
+}
+
+// Generator syntax:
+//
+// \d+ == fixed
+// n[ormal]:mean,sd
+// e[xponential]:lambda
+// p[areto]:scale,shape
+// g[ev]:loc,scale,shape
+// fb_value, fb_key, fb_rate
+
+// Common interface implemented by every distribution in this header.
+class Generator {
+    public:
+	Generator() = default;
+	//  Generator(const Generator &g) = delete;
+	//  virtual Generator& operator=(const Generator &g) = delete;
+	virtual ~Generator() = default;
+
+	// Draws one sample. Most implementations in this header replace a
+	// negative U (the default) with a fresh drand48() value.
+	virtual double generate(double U = -1.0) = 0;
+
+	// Optional rate update; the base version does nothing because DIE()
+	// expands to a no-op in this header.
+	virtual void
+	set_lambda(double)
+	{
+		DIE("set_lambda() not implemented");
+	}
+
+    protected:
+	std::string type;
+};
+
+// Degenerate distribution: every sample equals the stored constant.
+class Fixed : public Generator {
+    public:
+	Fixed(double _value = 1.0)
+	    : value(_value)
+	{
+		D("Fixed(%f)", value);
+	}
+
+	// The uniform input is ignored.
+	virtual double generate(double) { return value; }
+
+	// Stores 1/lambda; a non-positive rate maps to 0.
+	virtual void set_lambda(double lambda)
+	{
+		value = (lambda > 0.0) ? 1.0 / lambda : 0.0;
+	}
+
+    private:
+	double value;
+};
+
+// Uniform distribution on [0, scale).
+class Uniform : public Generator {
+    public:
+	Uniform(double _scale)
+	    : scale(_scale)
+	{
+		D("Uniform(%f)", scale);
+	}
+
+	// A negative U requests a fresh drand48() draw.
+	virtual double generate(double U = -1.0)
+	{
+		const double u = (U < 0.0) ? drand48() : U;
+		return scale * u;
+	}
+
+	// Sets the scale to 2/lambda; a non-positive rate maps to 0.
+	virtual void set_lambda(double lambda)
+	{
+		scale = (lambda > 0.0) ? 2.0 / lambda : 0.0;
+	}
+
+    private:
+	double scale;
+};
+
+// Box-Muller style generator parameterised by mean and standard deviation.
+class Normal : public Generator {
+    public:
+	Normal(double _mean = 1.0, double _sd = 1.0)
+	    : mean(_mean)
+	    , sd(_sd)
+	{
+		D("Normal(mean=%f, sd=%f)", mean, sd);
+	}
+
+	virtual double generate(double U = -1.0)
+	{
+		if (U < 0.0)
+			U = drand48();
+
+		// NOTE(review): the radius and the angle are both derived from
+		// the same uniform U (a second drand48() draw was disabled), so
+		// the two Box-Muller inputs are not independent - confirm this
+		// is intended.
+		const double radius = sqrt(-2 * log(U));
+		const double angle = 2 * M_PI * U;
+
+		return mean + sd * (radius * cos(angle));
+	}
+
+	// Moves the mean to 1/lambda; a non-positive rate maps to 0.
+	virtual void set_lambda(double lambda)
+	{
+		mean = (lambda > 0.0) ? 1.0 / lambda : 0.0;
+	}
+
+    private:
+	double mean, sd;
+};
+
+// Exponential distribution with rate lambda, sampled by inverting the CDF.
+class Exponential : public Generator {
+    public:
+	Exponential(double _lambda = 1.0)
+	    : lambda(_lambda)
+	{
+		D("Exponential(lambda=%f)", lambda);
+	}
+
+	// Returns 0 for a non-positive rate, otherwise -log(U) / lambda.
+	// A negative U requests a fresh drand48() draw.
+	virtual double generate(double U = -1.0)
+	{
+		if (lambda <= 0.0)
+			return 0.0;
+		// drand48() is uniform on [0, 1): redraw on an exact 0 (drawn
+		// or supplied by the caller), otherwise -log(0) would yield an
+		// infinite sample.
+		while (U <= 0.0)
+			U = drand48();
+		return -log(U) / lambda;
+	}
+
+	virtual void set_lambda(double lambda) { this->lambda = lambda; }
+
+    private:
+	double lambda;
+};
+
+// Generalized Pareto distribution (location, scale, non-zero shape).
+class GPareto : public Generator {
+    public:
+	GPareto(double _loc = 0.0, double _scale = 1.0, double _shape = 1.0)
+	    : loc(_loc)
+	    , scale(_scale)
+	    , shape(_shape)
+	{
+		assert(shape != 0.0);
+		D("GPareto(loc=%f, scale=%f, shape=%f)", loc, scale, shape);
+	}
+
+	// A negative U requests a fresh drand48() draw.
+	virtual double generate(double U = -1.0)
+	{
+		if (U < 0.0)
+			U = drand48();
+
+		const double tail = pow(U, -shape) - 1;
+		return loc + scale * tail / shape;
+	}
+
+	// Recomputes the scale from the rate; a non-positive rate zeroes it.
+	virtual void set_lambda(double lambda)
+	{
+		if (lambda <= 0.0) {
+			scale = 0.0;
+			return;
+		}
+		scale = (1 - shape) / lambda - (1 - shape) * loc;
+	}
+
+    private:
+	double loc /* mu */;
+	double scale /* sigma */, shape /* k */;
+};
+
+// Generalized extreme value distribution, driven by a unit-rate
+// Exponential sample.
+class GEV : public Generator {
+    public:
+	GEV(double _loc = 0.0, double _scale = 1.0, double _shape = 1.0)
+	    : e(1.0)
+	    , loc(_loc)
+	    , scale(_scale)
+	    , shape(_shape)
+	{
+		assert(shape != 0.0);
+		D("GEV(loc=%f, scale=%f, shape=%f)", loc, scale, shape);
+	}
+
+	// U is forwarded unchanged to the inner Exponential generator.
+	virtual double generate(double U = -1.0)
+	{
+		const double x = e.generate(U);
+		const double tail = pow(x, -shape) - 1;
+		return loc + scale * tail / shape;
+	}
+
+    private:
+	Exponential e;
+	double loc /* mu */, scale /* sigma */, shape /* k */;
+};
+
+// Discrete distribution: a list of (probability, value) pairs with a
+// fallback generator for whatever probability mass the pairs do not cover.
+// NOTE(review): the fallback is an owning raw pointer released in the
+// destructor while the implicit copy operations stay enabled; copying an
+// instance would release it twice - confirm instances are never copied.
+class Discrete : public Generator {
+    public:
+	Discrete(Generator *_def = NULL)
+	    : def(_def)
+	{
+		// without an explicit fallback, uncovered mass yields 0.0
+		if (def == NULL)
+			def = new Fixed(0.0);
+	}
+	~Discrete() { delete def; }
+
+	virtual double generate(double U = -1.0)
+	{
+		// the fallback always receives the caller's original U
+		const double caller_u = U;
+
+		if (!pv.empty() && U < 0.0)
+			U = drand48();
+
+		double cumulative = 0;
+		for (const auto &entry : pv) {
+			cumulative += entry.first;
+			if (U < cumulative)
+				return entry.second;
+		}
+
+		return def->generate(caller_u);
+	}
+
+	// Appends value v with probability p.
+	void add(double p, double v)
+	{
+		pv.emplace_back(p, v);
+	}
+
+    private:
+	Generator *def;
+	std::vector<std::pair<double, double>> pv;
+};
+
+// Produces zero-padded decimal keys whose length is drawn from a Generator.
+class KeyGenerator {
+    public:
+	KeyGenerator(Generator *_g, double _max = 10000)
+	    : g(_g)
+	    , max(_max)
+	{
+	}
+
+	// Formats ind as a decimal string padded with leading zeros. The
+	// width is the larger of the rounded generator sample and the number
+	// of decimal digits of max; the sample is seeded by hashing ind.
+	std::string generate(uint64_t ind)
+	{
+		const uint64_t hash = fnv_64(ind);
+		const double U = (double)hash / (double)ULLONG_MAX;
+		const double sampled = round(g->generate(U));
+		const double digits = floor(log10(max)) + 1;
+		int keylen = MAX(sampled, digits);
+
+		char key[256];
+		snprintf(key, sizeof(key), "%0*" PRIu64, keylen, ind);
+
+		//    D("%d = %s", ind, key);
+		return std::string(key);
+	}
+
+    private:
+	Generator *g;
+	double max;
+};
+
+Generator *createGenerator(std::string str);
+Generator *createFacebookKey();
+Generator *createFacebookValue();
+Generator *createFacebookIA();
+
+// memload generator
+// Memory-load generator that keeps one thread_info record (including a
+// pthread_t) per worker. Only the interface is declared here; the member
+// functions are defined outside this header.
+class memload_generator {
+	public:
+	// Caller-supplied tunables; every field has a defined default.
+	struct memload_generator_options {
+		size_t transaction_size {4096};
+		size_t buffer_size {64*1024*1024};
+		char ia_dist[64]{"fixed"};
+		int verbose {0};
+		// Previously the only field without an initialiser, which left
+		// a default-constructed options object with an indeterminate
+		// rate.
+		uint64_t trans_per_second {0};
+		bool shared_buffer {true};
+	};
+
+    private:
+	DISALLOW_EVIL_CONSTRUCTORS(memload_generator);
+	// Per-worker bookkeeping.
+	// NOTE(review): field meanings are inferred from their names - confirm
+	// against the implementation.
+	struct thread_info {
+		pthread_t pthr;
+		void *from_buffer;
+		void *to_buffer;
+		std::atomic<bool> reset_ts;
+		int tid;
+		int pull;
+		int coreid;
+		int target_dom;
+		struct memload_generator_options * opts;
+		Generator * ia_gen;
+
+		// stat keeping
+		std::atomic<uint32_t> num_trans;
+		std::atomic<int> * state;
+		std::atomic<int> init_status;
+	};
+
+	std::vector<struct thread_info *> thr_infos;
+	std::atomic<int> state;
+	static constexpr int STATE_RUN = 0;
+	static constexpr int STATE_RDY = 1;
+	static constexpr int STATE_END = 2;
+	static constexpr int STATE_INIT = 3;
+
+	static void *worker_thrd(void *_tinfo);
+	struct memload_generator_options opts;
+
+    public:
+	// *success is an out-parameter, presumably reporting whether the
+	// workers could be set up - confirm against the implementation.
+	memload_generator(cpuset_t * threads, cpuset_t * modes, cpuset_t * target_domain, struct memload_generator_options * opt, bool *success);
+	uint64_t get_transactions();
+	bool start();
+	bool stop();
+	bool set_transactions(uint64_t tps);
+	~memload_generator();
+};
--- a/inc/net/netsup.hh
+++ b/inc/net/netsup.hh
@ -0,0 +1,133 @@
+#pragma once
+#include <cstdint>
+
+#include "rte_ethdev.h"
+#include "rte_ether.h"
+
+#define MAX_NUMA_NODES (64)
+
+// Per-port settings handed to dpdk_init() / dpdk_cleanup().
+// NOTE(review): the field notes are inferred from names and DPDK naming
+// conventions; this header does not show how they are consumed - confirm
+// against the implementation.
+struct device_conf {
+	int portid; // presumably the DPDK ethdev port id
+	uint16_t tx_ring_sz; // presumably the TX descriptor ring size
+	uint16_t rx_ring_sz; // presumably the RX descriptor ring size
+	cpuset_t core_affinity; // presumably the cores that get queues
+	int mtu;
+	uint64_t rx_offloads; // presumably RX offload flag bits
+	uint64_t tx_offloads; // presumably TX offload flag bits
+	uint64_t rss_hf; // presumably the RSS hash-function mask
+	
+	// optional TX callback and its opaque user argument
+	rte_tx_callback_fn tx_fn;
+	void * tx_user;
+
+	// optional RX callback and its opaque user argument
+	rte_rx_callback_fn rx_fn;
+	void * rx_user;
+
+	bool timesync; // presumably enables NIC time synchronisation
+};
+
+// Memory-pool sizing handed to dpdk_init().
+// NOTE(review): the field notes are inferred from names that mirror the
+// rte_pktmbuf_pool_create() arguments - confirm against the implementation.
+struct mem_conf {
+	int num_elements; // presumably mbufs per pool
+	int cache_size; // presumably the per-core cache size
+	int data_room_size; // presumably the data buffer size per mbuf
+	int priv_size; // presumably the private area size per mbuf
+	unsigned int max_pools; // presumably an upper bound on pools created
+};
+
+constexpr static uint16_t MIN_RANDOM_PORT = 1000;
+constexpr static uint16_t DEFAULT_RAT_PORT = 1234;
+constexpr static unsigned int INIT_DELAY = 3;
+constexpr static unsigned int MAX_NODES = 64;
+
+void
+dpdk_init(struct device_conf *dconf, struct mem_conf *mconf);
+
+void
+dpdk_cleanup(struct device_conf *dconf);
+
+struct rte_mempool *
+mempool_get(int nodeid);
+
+// Output record filled in by portconf_get() for one port.
+// NOTE(review): presumably a per-driver capability description; the field
+// notes are inferred from names - confirm against the implementation.
+struct port_conf {
+	const char * driver_name; // presumably the driver this entry is for
+	uint64_t rxoffload; // presumably RX offload flag bits
+	uint64_t txoffload; // presumably TX offload flag bits
+	uint64_t rss_hf; // presumably the RSS hash-function mask
+	bool timesync; // presumably whether time synchronisation is usable
+};
+
+int
+portconf_get(int portid, struct port_conf * out);
+
+
+// constexpr static int LATENCY_MEASURE_TIMES = 10000;
+
+// static inline void
+// sync_port_clock(uint16_t portid)
+//{
+//    int64_t lat = 0;
+//    int64_t get_time_lat;
+//    int64_t write_time_lat;
+//    struct timespec dum;
+//    struct timespec start;
+//    struct timespec end;
+//
+//    // measure clock_gettime latency
+//    for(int i = 0; i < LATENCY_MEASURE_TIMES; i++) {
+//        // end - start ~= 2x clock_gettime's latency
+//        clock_gettime(CLOCK_REALTIME, &start);
+//        clock_gettime(CLOCK_REALTIME, &dum);
+//        clock_gettime(CLOCK_REALTIME, &end);
+//
+//        if (end.tv_sec != start.tv_sec) {
+//            rte_exit(EXIT_FAILURE, "clock_gettime too slow\n");
+//        }
+//
+//        // shouldn't overflow
+//        lat += (end.tv_nsec - start.tv_nsec) / 2;
+//    }
+//    get_time_lat = lat / LATENCY_MEASURE_TIMES;
+//
+//    // measure rte_eth_timesync_write_time latency
+//    lat = 0;
+//    for(int i = 0; i < LATENCY_MEASURE_TIMES; i++) {
+//        // end - start ~= rte_eth_timesync latency + clock_gettime's latency
+//        clock_gettime(CLOCK_REALTIME, &dum);
+//        clock_gettime(CLOCK_REALTIME, &start);
+//        if (rte_eth_timesync_write_time(portid, &dum) != 0) {
+//            rte_exit(EXIT_FAILURE, "failed to write time\n");
+//        }
+//        clock_gettime(CLOCK_REALTIME, &end);
+//
+//        if (end.tv_sec != start.tv_sec) {
+//            rte_exit(EXIT_FAILURE, "clock_gettime too slow!\n");
+//        }
+//
+//        // shouldn't overflow
+//        int64_t elat = (end.tv_nsec - start.tv_nsec) - get_time_lat;
+//        if (elat < 0) {
+//            rte_exit(EXIT_FAILURE, "something is wrong with lat \n");
+//        }
+//        lat += elat;
+//    }
+//    write_time_lat = lat / LATENCY_MEASURE_TIMES;
+//
+//    int64_t delta = (get_time_lat + write_time_lat) / 2;
+//    int64_t s2ns = (int64_t)S2NS;
+//    // sync the clock
+//    while (true) {
+//        clock_gettime(CLOCK_REALTIME, &dum);
+//        dum.tv_nsec += delta;
+//        if (dum.tv_nsec > s2ns) {
+//            // try again if overflow
+//            continue;
+//        }
+//        if (rte_eth_timesync_write_time(portid, &dum) != 0) {
+//            rte_exit(EXIT_FAILURE, "failed to write time\n");
+//        }
+//        break;
+//    }
+//    rte_eth_timesync_enable(portid);
+//
+//    printf("Sync-ed time: get lat %ld write lat %ld\n", get_time_lat,
+//    write_time_lat);
+//}
--- a/inc/net/pkt.hh
+++ b/inc/net/pkt.hh
@ -0,0 +1,490 @@
+#pragma once
+
+#include <sys/endian.h>
+#include <rte_byteorder.h>
+#include <rte_ether.h>
+#include <rte_flow.h>
+#include <rte_ip.h>
+#include <rte_mbuf.h>
+#include <rte_mbuf_core.h>
+#include <rte_net.h>
+#include <rte_udp.h>
+#include <unistd.h>
+
+#include "defs.hh"
+
+#include <random>
+
+#define IP_DEFTTL 64 /* from RFC 1340. */
+#define IP_VERSION 0x40
+#define IP_HDRLEN 0x05 /* default IP header length == five 32-bits words. */
+#define IP_VHL_DEF (IP_VERSION | IP_HDRLEN)
+#define IP_ADDR_FMT_SIZE 15
+
+constexpr static uint32_t MAX_JUMBO_MTU = 9000;
+constexpr static uint32_t MAX_STANDARD_MTU = 1500;
+
+// Converts an MTU into a frame size by adding the Ethernet header and the
+// CRC length.
+static inline int
+mtu_to_pkt_size(int mtu)
+{
+	const int l2_overhead = RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN;
+
+	return mtu + l2_overhead;
+}
+
+// Transmits all sz mbufs on the given port/queue, calling
+// rte_eth_tx_burst() repeatedly until every one has been accepted.
+static inline void
+tx_burst_all(int portid, int txqid, struct rte_mbuf ** tx_bufs, int sz)
+{
+	int sent = 0;
+
+	while (sent < sz) {
+		// resume right after the mbufs accepted so far
+		sent += rte_eth_tx_burst(
+		    portid, txqid, &tx_bufs[sent], sz - sent);
+	}
+}
+
+constexpr static uint32_t ETHER_FRAME_MAGIC = 0xDCDCE5E5;
+const static struct rte_ether_addr POU_MAC {
+	0x01, 0x00, 0x5e, 0x00, 0x01, 0x81
+};
+const static uint32_t POU_IP = RTE_IPV4(224, 0, 1, 129);
+const static uint16_t POU_PORT = 320;
+/* Khat Protocol:
+ *   khat only processes two kinds of packets - LOAD and PROBE
+ *   rat:
+ *        rat -> LOAD -> khat
+ *        khat -> LOAD_RESP -> rat
+ *   cat:
+ *        cat -> PROBE -> khat (cat tx timestamps)
+ *        khat -> PROBE_RESP -> cat (cat rx timestamps and khat tx/rx
+ *                                   timestamps)
+ *        khat -> STAT -> cat (khat sends its tx/rx timestamps)
+ */
+
+/* Rat Protocol:
+ *   cat & rat:
+ *      1. both launch with full parameters
+ *         rat with slave flag
+ *         cat with master flag
+ *      2. rats create threads and wait for cat's signal
+ *      3. cat creates threads
+ *      4. cat -> rats SYNC
+ *      5. rats -> cat SYNC_ACK and start running
+ *      6. cat start running after received all SYNC_ACKs
+ *      7. cat stops running, cat -> rats FIN
+ *      8. rats stops running, rats -> cat FIN_ACK with QPS
+ *      9. cat exits after receiving all FIN_ACKs and flushing stats
+ */
+
+// Packed 36-byte stand-in for a PTP header. Only the message type and the
+// version are named; the remaining 34 bytes are opaque filler.
+struct ptp_hdr {
+	uint8_t ptp_msg_type;
+	uint8_t ptp_ver;
+	uint8_t unused[34];
+} __attribute__((packed));
+
+// Packed layout that starts every packet of this protocol:
+// Ethernet / IPv4 / UDP / PTP stand-in / type / magic, followed directly
+// by the type-specific payload.
+struct pkt_hdr {
+	struct rte_ether_hdr eth_hdr;
+	struct rte_ipv4_hdr ipv4_hdr;
+	struct rte_udp_hdr udp_hdr;
+	struct ptp_hdr ptp_hdr;
+	uint16_t type; // one of the PKT_TYPE_* values, big-endian
+	uint32_t magic; // ETHER_FRAME_MAGIC, big-endian
+	char payload[0]; // zero-length marker for the payload that follows
+} __attribute__((packed));
+
+// One network endpoint: an IPv4 address and its MAC address.
+struct net_spec {
+	uint32_t ip; // host byte order (converted at the packet boundary)
+	rte_ether_addr mac_addr;
+};
+
+// Extracts the source and/or destination endpoint of a packet.
+// Each of src, src_port, dst and dst_port may be nullptr, in which case
+// that piece is skipped. Addresses and ports are converted to host byte
+// order.
+static inline void
+pkt_hdr_to_netspec(struct pkt_hdr *pkt, struct net_spec *src,
+    uint16_t *src_port, struct net_spec *dst, uint16_t *dst_port)
+{
+	if (src != nullptr) {
+		rte_ether_addr_copy(&pkt->eth_hdr.src_addr, &src->mac_addr);
+		src->ip = rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr);
+	}
+
+	if (src_port != nullptr) {
+		*src_port = rte_be_to_cpu_16(pkt->udp_hdr.src_port);
+	}
+
+	if (dst != nullptr) {
+		rte_ether_addr_copy(&pkt->eth_hdr.dst_addr, &dst->mac_addr);
+		dst->ip = rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr);
+	}
+
+	if (dst_port != nullptr) {
+		*dst_port = rte_be_to_cpu_16(pkt->udp_hdr.dst_port);
+	}
+}
+
+// A UDP flow: source and destination endpoints plus their ports.
+// The endpoints are referenced by pointer, not copied.
+struct conn_spec {
+	struct net_spec *src;
+	uint16_t src_port; // host byte order
+	struct net_spec *dst;
+	uint16_t dst_port; // host byte order
+};
+
+// Parses "a.b.c.d@xx:xx:xx:xx:xx:xx" into *out.
+// str is tokenised in place and is therefore modified.
+// returns 0 on success, -1 on any malformed or missing component
+static inline int
+str_to_netspec(char *str, struct net_spec *out)
+{
+	const char *tok = "@";
+	char *token;
+	char *ptr = nullptr;
+	unsigned int a, b, c, d;
+
+	// strtok_r() with a null string and an unset save pointer is undefined
+	if (str == nullptr || out == nullptr) {
+		return -1;
+	}
+
+	token = strtok_r(str, tok, &ptr);
+
+	// %u matches the unsigned destinations (%d expects int *)
+	if (token == nullptr ||
+	    sscanf(token, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) {
+		return -1;
+	}
+
+	// every octet must fit in 8 bits; an out-of-range value would
+	// otherwise produce a different address than the one written
+	if (a > 0xff || b > 0xff || c > 0xff || d > 0xff) {
+		return -1;
+	}
+
+	out->ip = RTE_IPV4(a, b, c, d);
+
+	// mac next
+	token = strtok_r(nullptr, tok, &ptr);
+	if (token == nullptr ||
+	    rte_ether_unformat_addr(token, &out->mac_addr) != 0) {
+		return -1;
+	}
+
+	return 0;
+}
+
+// Packet type ids and their payload layouts. The ids double as indices
+// into expected_payload_size[] below, so the two must stay in step.
+constexpr static uint16_t PKT_TYPE_LOAD = 0;
+constexpr static uint32_t LOAD_TYPE_CPU = 0; // arg0 = cpu time in us. arg1 = unused
+constexpr static uint32_t LOAD_TYPE_MEM = 1; // arg0 = which thread to access. arg1 = how many cachelines to access
+constexpr static uint32_t LOAD_TYPE_MAX = LOAD_TYPE_MEM + 1;
+// Payload of a LOAD packet.
+struct pkt_payload_load {
+	uint32_t epoch;
+	uint32_t type; // type of load
+	uint32_t arg0;
+	uint32_t arg1;
+};
+
+constexpr static uint16_t PKT_TYPE_PROBE = 1;
+constexpr static uint16_t PKT_TYPE_LOAD_RESP = 2;
+constexpr static uint16_t PKT_TYPE_PROBE_RESP = 3;
+// Payload shared by PROBE, LOAD_RESP and PROBE_RESP packets.
+struct pkt_payload_epoch {
+	uint32_t epoch;
+};
+
+constexpr static uint16_t PKT_TYPE_STAT = 4;
+// Payload of a STAT packet.
+// NOTE(review): not declared packed, so the compiler presumably inserts
+// padding between epoch and hw_rx - confirm both ends agree on the layout.
+struct pkt_payload_stat {
+	uint32_t epoch;
+	uint64_t hw_rx;
+	uint64_t hw_tx;
+	uint64_t sw_rx;
+	uint64_t sw_tx;
+};
+
+constexpr static uint16_t PKT_TYPE_SYNC = 5;
+constexpr static uint16_t PKT_TYPE_SYNC_ACK = 6;
+constexpr static uint16_t PKT_TYPE_FIN = 7;
+constexpr static uint16_t PKT_TYPE_FIN_ACK = 8;
+// Payload of a FIN_ACK packet.
+struct pkt_payload_qps {
+	uint32_t qps;
+	uint32_t recved_pkts;
+	uint32_t lost_pkts;
+};
+
+constexpr static uint16_t NUM_PKT_TYPES = PKT_TYPE_FIN_ACK + 1;
+// for fast packet verification
+// Indexed by PKT_TYPE_*; entries must follow the numeric order of the ids.
+static const uint32_t expected_payload_size[NUM_PKT_TYPES] {
+	sizeof(struct pkt_payload_load),  // LOAD
+	sizeof(struct pkt_payload_epoch), // PROBE
+	sizeof(struct pkt_payload_epoch), // LOAD_RESP
+	sizeof(struct pkt_payload_epoch), // PROBE_RESP
+	sizeof(struct pkt_payload_stat),  // STAT
+	0,				  // SYNC
+	0,				  // SYNC_ACK
+	0,				  // FIN
+	sizeof(struct pkt_payload_qps)	  // FIN_ACK
+};
+
+// Hands out UDP ports from [min_port, MAX_PORT]: starts at a pseudo-random
+// offset (seeded from get_uptime()) and advances by one on every call,
+// wrapping back to min_port after MAX_PORT.
+class rdport_generator {
+    private:
+	DISALLOW_EVIL_CONSTRUCTORS(rdport_generator);
+	constexpr static uint32_t MAX_PORT = 65535;
+	uint32_t min_port;
+	uint32_t cur;
+	std::random_device rd; // not used by any method of this class
+	std::default_random_engine gen;
+	std::uniform_int_distribution<uint32_t> dist;
+
+    public:
+	// mport is the lowest port that may be returned; it is expected to
+	// be <= MAX_PORT.
+	rdport_generator(uint32_t mport)
+	    : min_port(mport)
+	    , cur(0)
+	    , dist(0, MAX_PORT - min_port)
+	{
+		gen.seed(get_uptime());
+		cur = dist(gen);
+	}
+
+	// Returns the next port in the cycle.
+	uint16_t next()
+	{
+		// The range is inclusive on both ends, hence the +1. Without it
+		// MAX_PORT was never produced and min_port == MAX_PORT divided
+		// by zero.
+		const uint32_t span = MAX_PORT - min_port + 1;
+		uint16_t ret = (cur % span) + min_port;
+		cur++;
+		return ret;
+	}
+};
+
+#define NTR_PKT(dep, level, pkt, prefix_fmt, ...)                                                                             \
+	ntr(dep, level,                                                                                                       \
+	    prefix_fmt                                                                                                        \
+	    "src: %d.%d.%d.%d:%d@%02x:%02x:%02x:%02x:%02x:%02x dst: %d.%d.%d.%d:%d@%02x:%02x:%02x:%02x:%02x:%02x type: %d\n", \
+	    ##__VA_ARGS__,                                                                                                    \
+	    (rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 24) & 0xff,                                                          \
+	    (rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 16) & 0xff,                                                          \
+	    (rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 8) & 0xff,                                                           \
+	    (rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 0) & 0xff,                                                           \
+	    rte_be_to_cpu_16(pkt->udp_hdr.src_port),                                                                          \
+	    pkt->eth_hdr.src_addr.addr_bytes[0],                                                                                \
+	    pkt->eth_hdr.src_addr.addr_bytes[1],                                                                                \
+	    pkt->eth_hdr.src_addr.addr_bytes[2],                                                                                \
+	    pkt->eth_hdr.src_addr.addr_bytes[3],                                                                                \
+	    pkt->eth_hdr.src_addr.addr_bytes[4],                                                                                \
+	    pkt->eth_hdr.src_addr.addr_bytes[5],                                                                                \
+	    (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 24) & 0xff,                                                          \
+	    (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 16) & 0xff,                                                          \
+	    (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 8) & 0xff,                                                           \
+	    (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 0) & 0xff,                                                           \
+	    rte_be_to_cpu_16(pkt->udp_hdr.dst_port),                                                                          \
+	    pkt->eth_hdr.dst_addr.addr_bytes[0],                                                                                \
+	    pkt->eth_hdr.dst_addr.addr_bytes[1],                                                                                \
+	    pkt->eth_hdr.dst_addr.addr_bytes[2],                                                                                \
+	    pkt->eth_hdr.dst_addr.addr_bytes[3],                                                                                \
+	    pkt->eth_hdr.dst_addr.addr_bytes[4],                                                                                \
+	    pkt->eth_hdr.dst_addr.addr_bytes[5], rte_be_to_cpu_16(pkt->type))
+
+// Prints a MAC address to stdout as six colon-separated hex bytes, without
+// a trailing newline.
+static inline void
+print_mac(struct rte_ether_addr *mac)
+{
+	// %02x keeps leading zeros (01:00:5e:... rather than 1:0:5e:...),
+	// matching the MAC formatting used by NTR_PKT.
+	printf("%02x:%02x:%02x:%02x:%02x:%02x", mac->addr_bytes[0],
+	    mac->addr_bytes[1], mac->addr_bytes[2], mac->addr_bytes[3],
+	    mac->addr_bytes[4], mac->addr_bytes[5]);
+}
+
+// Prints a host-byte-order IPv4 address to stdout in dotted-decimal form,
+// without a trailing newline.
+static inline void
+print_ipv4(uint32_t ip)
+{
+	const uint32_t o1 = (ip >> 24) & 0xff;
+	const uint32_t o2 = (ip >> 16) & 0xff;
+	const uint32_t o3 = (ip >> 8) & 0xff;
+	const uint32_t o4 = (ip >> 0) & 0xff;
+
+	printf("%d.%d.%d.%d", o1, o2, o3, o4);
+}
+
+// Debug helper: prints the Ethernet header of pkt to stdout and, for IPv4
+// frames that are long enough, the IPv4 addresses and protocol as well.
+// Packets shorter than an Ethernet header are ignored.
+static inline void
+dump_pkt(struct rte_mbuf *pkt)
+{
+	const auto len = rte_pktmbuf_data_len(pkt);
+	if (len < sizeof(struct rte_ether_hdr)) {
+		return;
+	}
+
+	struct rte_ether_hdr scratch;
+	auto eth = (struct rte_ether_hdr *)rte_pktmbuf_read(
+	    pkt, 0, sizeof(struct rte_ether_hdr), &scratch);
+	if (eth == nullptr) {
+		return;
+	}
+
+	const uint16_t ether_type = rte_be_to_cpu_16(eth->ether_type);
+
+	// ethernet frame
+	printf("Packet %p: Length 0x%x\n", (void *)pkt, len);
+	printf("    Ethernet header:\n");
+	printf("        Src:");
+	print_mac(&eth->src_addr);
+	printf("\n");
+	printf("        Dst:");
+	print_mac(&eth->dst_addr);
+	printf("\n");
+	printf("        Type: 0x%x\n", ether_type);
+
+	// only IPv4 frames carrying a full IPv4 header get the L3 dump
+	const bool is_ipv4 = (ether_type == RTE_ETHER_TYPE_IPV4);
+	if (!is_ipv4 ||
+	    len < sizeof(struct rte_ether_hdr) + sizeof(struct rte_ipv4_hdr)) {
+		return;
+	}
+
+	// the IPv4 header sits directly behind the Ethernet header
+	auto ip = (struct rte_ipv4_hdr *)(eth + 1);
+	printf("    IPv4 header:\n");
+	printf("        Src:");
+	print_ipv4(rte_be_to_cpu_32(ip->src_addr));
+	printf("\n");
+	printf("        Dst:");
+	print_ipv4(rte_be_to_cpu_32(ip->dst_addr));
+	printf("\n");
+	printf("        Protocol: 0x%x\n", ip->next_proto_id);
+}
+
+// True for the two packet types that are sent as timestamped packets
+// (PROBE and PROBE_RESP).
+static inline bool
+is_l2ts_pkt(uint16_t type)
+{
+	switch (type) {
+	case PKT_TYPE_PROBE:
+	case PKT_TYPE_PROBE_RESP:
+		return true;
+	default:
+		return false;
+	}
+}
+
+// fills the packet with the information except for the payload itself
+//
+// Resets buf, appends max(sizeof(pkt_hdr) + payload size for `type`,
+// pkt_pad_sz) bytes and writes the Ethernet/IPv4/UDP/PTP headers plus the
+// type and magic fields. PROBE/PROBE_RESP packets are addressed to the
+// POU_* multicast endpoint and flagged for IEEE 1588 TX timestamping; all
+// other types are addressed to conn->dst.
+//
+// Returns the header inside buf, or nullptr when `type` is not a known
+// packet type or the mbuf has no room for the packet.
+static inline struct pkt_hdr *
+construct_pkt_hdr(
+    struct rte_mbuf *buf, uint16_t type, const struct conn_spec *conn, int pkt_pad_sz)
+{
+	// type indexes expected_payload_size[]; reject anything outside it
+	if (type >= NUM_PKT_TYPES)
+		return nullptr;
+
+	rte_pktmbuf_reset(buf);
+
+	int total_sz = sizeof(struct pkt_hdr) +
+	    expected_payload_size[type];
+
+	if (pkt_pad_sz > total_sz) {
+		total_sz = pkt_pad_sz;
+	}
+
+	auto pkt_data = (struct pkt_hdr *)rte_pktmbuf_append(buf, total_sz);
+	if (pkt_data == nullptr)
+		return nullptr;
+
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv4_hdr *ipv4_hdr;
+	struct rte_udp_hdr *udp_hdr;
+	bool is_ts_pkt = is_l2ts_pkt(type);
+
+	// IPv4 total length covers the IPv4 header and everything behind it;
+	// the UDP length covers the UDP header and everything behind it.
+	const uint16_t ip_len = total_sz - sizeof(struct rte_ether_hdr);
+	const uint16_t udp_len = ip_len - sizeof(struct rte_ipv4_hdr);
+
+	// single segment
+	buf->nb_segs = 1;
+
+	// construct l2 header
+	eth_hdr = &pkt_data->eth_hdr;
+	rte_ether_addr_copy(&conn->src->mac_addr, &eth_hdr->src_addr);
+	if (is_ts_pkt) {
+		rte_ether_addr_copy(&POU_MAC, &eth_hdr->dst_addr);
+	} else {
+		rte_ether_addr_copy(&conn->dst->mac_addr, &eth_hdr->dst_addr);
+	}
+	eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
+	buf->l2_len = sizeof(struct rte_ether_hdr);
+
+	// construct l3 header
+	ipv4_hdr = &pkt_data->ipv4_hdr;
+	memset(ipv4_hdr, 0, sizeof(struct rte_ipv4_hdr));
+	ipv4_hdr->version_ihl = IP_VHL_DEF;
+	ipv4_hdr->type_of_service = 0;
+	ipv4_hdr->fragment_offset = 0;
+	ipv4_hdr->time_to_live = IP_DEFTTL;
+	ipv4_hdr->next_proto_id = IPPROTO_UDP;
+	ipv4_hdr->packet_id = 0;
+	ipv4_hdr->src_addr = rte_cpu_to_be_32(conn->src->ip);
+	if (is_ts_pkt) {
+		ipv4_hdr->dst_addr = rte_cpu_to_be_32(POU_IP);
+	} else {
+		ipv4_hdr->dst_addr = rte_cpu_to_be_32(conn->dst->ip);
+	}
+	ipv4_hdr->total_length = rte_cpu_to_be_16(ip_len);
+	ipv4_hdr->hdr_checksum = 0;
+	buf->l3_len = sizeof(struct rte_ipv4_hdr);
+
+	// construct l4 header
+	udp_hdr = &pkt_data->udp_hdr;
+	udp_hdr->src_port = rte_cpu_to_be_16(conn->src_port);
+	if (is_ts_pkt) {
+		udp_hdr->dst_port = rte_cpu_to_be_16(POU_PORT);
+	} else {
+		udp_hdr->dst_port = rte_cpu_to_be_16(conn->dst_port);
+	}
+	// NOTE(review): checksum offload is requested below while the field
+	// is left at 0; some NICs expect the pseudo-header checksum here -
+	// confirm against the target hardware.
+	udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
+	// like every other multi-byte header field, the length is big-endian
+	udp_hdr->dgram_len = rte_cpu_to_be_16(udp_len);
+	buf->l4_len = sizeof(struct rte_udp_hdr);
+	buf->ol_flags |= RTE_MBUF_F_TX_IPV4;
+	buf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
+	buf->ol_flags |= RTE_MBUF_F_TX_UDP_CKSUM;
+
+	if (is_ts_pkt) {
+		// set misc flags
+		buf->ol_flags |= RTE_MBUF_F_TX_IEEE1588_TMST;
+		pkt_data->ptp_hdr.ptp_ver = 0x2;      // VER 2
+		pkt_data->ptp_hdr.ptp_msg_type = 0x0; // SYNC
+	} else {
+		pkt_data->ptp_hdr.ptp_ver = 0xff; // invalid ver
+	}
+
+	pkt_data->type = rte_cpu_to_be_16(type);
+	pkt_data->magic = rte_cpu_to_be_32(ETHER_FRAME_MAGIC);
+
+	return pkt_data;
+}
+
+// Allocates an mbuf from pool and fills in its headers via
+// construct_pkt_hdr().
+// returns 0 on success (both out-parameters set), -1 on failure (the mbuf,
+// if one was allocated, is freed again and the out-parameters are left
+// untouched)
+static inline int
+alloc_pkt_hdr(struct rte_mempool *pool, uint16_t type,
+    const struct conn_spec *conn, int pkt_pad_sz, struct rte_mbuf **mbuf_out,
+    struct pkt_hdr **hdr_out)
+{
+	struct rte_mbuf *mbuf = rte_pktmbuf_alloc(pool);
+	if (mbuf == nullptr) {
+		return -1;
+	}
+
+	struct pkt_hdr *hdr = construct_pkt_hdr(mbuf, type, conn, pkt_pad_sz);
+	if (hdr != nullptr) {
+		*mbuf_out = mbuf;
+		*hdr_out = hdr;
+		return 0;
+	}
+
+	// header construction failed: hand the mbuf back to the pool
+	rte_pktmbuf_free(mbuf);
+	return -1;
+}
+
+// Validates a received mbuf against this protocol and returns its header,
+// or nullptr when the packet is too short, carries the wrong magic, has an
+// unknown type, is shorter than its type requires, or (only when host_mac
+// is given) is addressed to an unexpected destination MAC.
+static inline struct pkt_hdr *
+check_valid_packet(struct rte_mbuf *pkt, const struct rte_ether_addr *host_mac)
+{
+	const uint32_t len = rte_pktmbuf_data_len(pkt);
+
+	if (len < sizeof(struct pkt_hdr))
+		return nullptr;
+
+	struct pkt_hdr *hdr = rte_pktmbuf_mtod(pkt, struct pkt_hdr *);
+
+	// magic first: cheapest way to drop foreign traffic
+	if (rte_be_to_cpu_32(hdr->magic) != ETHER_FRAME_MAGIC)
+		return nullptr;
+
+	// the type must be known before it may index the size table
+	const uint16_t type = rte_be_to_cpu_16(hdr->type);
+	if (type >= NUM_PKT_TYPES)
+		return nullptr;
+	if (len < sizeof(struct pkt_hdr) + expected_payload_size[type])
+		return nullptr;
+
+	// the destination MAC filter is optional
+	if (host_mac == nullptr)
+		return hdr;
+
+	// timestamped packets are sent to POU_MAC, the rest to the host MAC
+	const struct rte_ether_addr *want =
+	    is_l2ts_pkt(type) ? &POU_MAC : host_mac;
+	if (!rte_is_same_ether_addr(want, &hdr->eth_hdr.dst_addr))
+		return nullptr;
+
+	return hdr;
+}
--- a/inc/nms.h
+++ b/inc/nms.h
@ -0,0 +1,26 @@
+#pragma once
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * NOTE(review): the contracts noted below are inferred from names and
+ * signatures only; the implementation is not part of this header - confirm
+ * before relying on them.
+ */
+
+/* Presumably initialises the allocator; verbose likely controls logging. */
+int
+nms_init(int verbose);
+
+/* Presumably allocates sz bytes from the memory of node nodeid. */
+void *
+nms_malloc(int nodeid, size_t sz);
+
+/* Presumably the counterpart of nms_free_static(). */
+void *
+nms_alloc_static(int nodeid, size_t sz);
+
+/* Takes the buffer together with its size. */
+void
+nms_free_static(void * buf, size_t sz);
+
+/* Presumably releases memory obtained from nms_malloc() on the same node. */
+void
+nms_free(int nodeid, void * addr);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
--- a/inc/ntr.h
+++ b/inc/ntr.h
@ -0,0 +1,38 @@
+#pragma once
+
+#include <stdarg.h>
+#include <stdio.h>
+
+#define NTR_LEVEL_NONE (0)
+#define NTR_LEVEL_ERROR (1)
+#define NTR_LEVEL_WARNING (2)
+#define NTR_LEVEL_INFO (3)
+#define NTR_LEVEL_DEBUG (4)
+#define NTR_LEVEL_DEFAULT (NTR_LEVEL_WARNING)
+
+#define NTR_DEP_NTR (0)
+#define NTR_DEP_USER1 (1)
+#define NTR_DEP_USER2 (2)
+#define NTR_DEP_USER3 (3)
+#define NTR_DEP_USER4 (4)
+#define NTR_DEP_USER5 (5)
+#define NTR_DEP_MAX (NTR_DEP_USER5 + 1)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * (void) rather than (): this header is also meant for C, where an empty
+ * parameter list declares a function without a prototype.
+ */
+void ntr_init(void);
+
+/*
+ * printf-style logger for dependency dep at the given level; the compiler
+ * checks the variadic arguments against fmt.
+ */
+__attribute__((format(printf, 3, 4))) void ntr(
+    int dep, int level, const char *fmt, ...);
+
+/* Sets the level of dependency dep. */
+void ntr_set_level(int dep, int level);
+
+/* Sets the output stream. */
+void ntr_set_output(FILE *f);
+
+/* Returns the level of dependency dep. */
+int ntr_get_level(int dep);
+
+#ifdef __cplusplus
+}
+#endif
--- a/inc/ntrlog.h
+++ b/inc/ntrlog.h
@ -1,61 +0,0 @@
-#pragma once
-
-#include <stdio.h>
-
-#define NTR_LEVEL_NONE (0)
-#define NTR_LEVEL_ERROR (1)
-#define NTR_LEVEL_WARNING (2)
-#define NTR_LEVEL_INFO (3)
-#define NTR_LEVEL_DEBUG (4)
-#define NTR_LEVEL_DEFAULT (NTR_LEVEL_WARNING)
-
-#define NTR_DEP_NTR (0)
-#define NTR_DEP_USER1 (1)
-#define NTR_DEP_USER2 (2)
-#define NTR_DEP_USER3 (3)
-#define NTR_DEP_USER4 (4)
-#define NTR_DEP_USER5 (5)
-#define NTR_DEP_MAX (NTR_DEP_USER5 + 1)
-
-#define NTR_DECL_IMPL \
-int ntr_log_levels[NTR_DEP_MAX] = {NTR_LEVEL_DEFAULT}; \
-FILE * ntr_out = stdout
-
-extern int ntr_log_levels[];
-extern FILE * ntr_out;
-
-static inline
-void ntr(int dep, int level, const char * fmt, ...)
-{
-    va_list vl;
-    va_start(vl, fmt);
-    if (dep < NTR_DEP_MAX && level <= ntr_log_levels[dep]) {
-        vfprintf(ntr_out, fmt, vl);
-    }
-    va_end(vl);
-}
-
-static inline
-void ntr_set_level(int dep, int level)
-{
-    if (dep < NTR_DEP_MAX) {
-        ntr_log_levels[dep] = level;
-    }
-}
-
-static inline
-void ntr_set_output(FILE * f)
-{
-    if (f != NULL) {
-        ntr_out = f;
-    }
-}
-
-static inline
-int ntr_get_level(int dep)
-{
-    if (dep < NTR_DEP_MAX) {
-        return ntr_log_levels[dep];
-    }
-    return 0;
-}
--- a/inc/pkt.h
+++ b/inc/pkt.h
@ -1,175 +0,0 @@
-#pragma once
-
-#include <rte_mbuf_core.h>
-#include <rte_mbuf.h>
-#include <rte_udp.h>
-#include <rte_byteorder.h>
-#include <rte_ip.h>
-#include <stdint.h>
-#include <rte_flow.h>
-#include <rte_ether.h>
-#include <unistd.h>
-#include <rte_net.h>
-#include <rte_vxlan.h>
-
-#define IP_DEFTTL 64 /* from RFC 1340. */
-#define IP_VERSION 0x40
-#define IP_HDRLEN 0x05 /* default IP header length == five 32-bits words. */
-#define IP_VHL_DEF (IP_VERSION | IP_HDRLEN)
-#define IP_ADDR_FMT_SIZE 15
-
-constexpr static uint32_t ETHER_FRAME_MAGIC = 0xDCDCE5E5;
-
-struct packet_hdr {
-    struct rte_ether_hdr eth_hdr;
-    struct rte_ipv4_hdr ipv4_hdr;
-    struct rte_udp_hdr udp_hdr;
-} __attribute__((packed));
-
-struct packet_data
-{
-    struct packet_hdr pkt_hdr;
-    uint32_t magic;
-    uint32_t epoch;
-    uint64_t clt_ts_tx;
-    uint64_t clt_ts_rx;
-    uint64_t srv_ts_tx;
-    uint64_t srv_ts_rx;
-};
-
-static inline void
-print_mac(struct rte_ether_addr * mac)
-{
-	printf("%x:%x:%x:%x:%x:%x", mac->addr_bytes[0],
-								mac->addr_bytes[1],
-								mac->addr_bytes[2],
-								mac->addr_bytes[3],
-								mac->addr_bytes[4],
-								mac->addr_bytes[5]);
-}
-
-static inline void
-print_ipv4(uint32_t ip)
-{
-	printf("%d-%d-%d-%d", (ip >> 24) & 0xff,
-						  (ip >> 16) & 0xff,
-						  (ip >> 8) & 0xff,
-						  (ip >> 0) & 0xff);
-}
-
-static inline void
-dump_pkt(struct rte_mbuf *pkt)
-{
-	if(rte_pktmbuf_data_len(pkt) < sizeof(struct rte_ether_hdr)) {
-		return;
-	}
-
-	struct rte_ether_hdr _eth_hdr;
-	struct rte_ether_hdr * eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_read(pkt, 0, sizeof(struct rte_ether_hdr), &_eth_hdr);
-	if (eth_hdr == NULL) {
-		return;
-	}
-
-	// ethernet frame
-	printf("Packet %p: Length 0x%x\n", (void*)pkt, rte_pktmbuf_data_len(pkt));
-	printf("    Ethernet header:\n");
-	printf("        Src:");
-	print_mac(&eth_hdr->s_addr);
-	printf("\n");
-	printf("        Dst:");
-	print_mac(&eth_hdr->d_addr);
-	printf("\n");
-	printf("        Type: 0x%x\n", rte_be_to_cpu_16(eth_hdr->ether_type));
-
-	uint16_t ether_type = rte_be_to_cpu_16(eth_hdr->ether_type);
-	if (ether_type != RTE_ETHER_TYPE_IPV4) {
-		return;
-	}
-
-	if(rte_pktmbuf_data_len(pkt) < sizeof(struct rte_ether_hdr) + sizeof(struct rte_ipv4_hdr)) {
-		return;
-	}
-
-	// dump ip header
-	struct rte_ipv4_hdr * ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);	
-	printf("    IPv4 header:\n");
-	printf("        Src:");
-	print_ipv4(rte_be_to_cpu_32(ipv4_hdr->src_addr));
-	printf("\n");
-	printf("        Dst:");
-	print_ipv4(rte_be_to_cpu_32(ipv4_hdr->dst_addr));
-	printf("\n");
-	printf("        Protocol: 0x%x\n", ipv4_hdr->next_proto_id);
-
-}
-
-static inline
-struct packet_data * construct_udp_pkt_hdr(struct rte_mbuf * buf,
-                    struct rte_ether_addr * src_mac, struct rte_ether_addr * dst_mac,
-                    uint32_t src_ip, uint32_t dst_ip, uint16_t src_port, uint16_t dst_port)
-{
-    rte_pktmbuf_reset(buf);
-
-    struct packet_data * pkt_data = (struct packet_data *)rte_pktmbuf_append(buf, sizeof(struct packet_data));
-    struct rte_ether_hdr * eth_hdr;
-    struct rte_ipv4_hdr * ipv4_hdr;
-    struct rte_udp_hdr * udp_hdr;
-
-    if (pkt_data == NULL)   
-        return NULL;
-
-    // single segment
-    buf->nb_segs = 1;
-    
-    // construct l2 header
-    eth_hdr = &pkt_data->pkt_hdr.eth_hdr;
-    rte_ether_addr_copy(src_mac, &eth_hdr->s_addr);
-    rte_ether_addr_copy(dst_mac, &eth_hdr->d_addr);
-    eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
-    buf->l2_len = sizeof(struct rte_ether_hdr);
-
-    // construct l3 header
-    ipv4_hdr = &pkt_data->pkt_hdr.ipv4_hdr;
-    memset(ipv4_hdr, 0, sizeof(struct rte_ipv4_hdr));
-    ipv4_hdr->version_ihl = IP_VHL_DEF;
-    ipv4_hdr->type_of_service = 0;
-    ipv4_hdr->fragment_offset = 0;
-    ipv4_hdr->time_to_live = IP_DEFTTL;
-    ipv4_hdr->next_proto_id = IPPROTO_UDP;
-    ipv4_hdr->packet_id = 0;
-    ipv4_hdr->src_addr = rte_cpu_to_be_32(src_ip);
-    ipv4_hdr->dst_addr = rte_cpu_to_be_32(dst_ip);
-    ipv4_hdr->total_length = rte_cpu_to_be_16(sizeof(struct packet_data) - sizeof(struct rte_ether_hdr));
-    ipv4_hdr->hdr_checksum = 0;
-    buf->l3_len = sizeof(struct rte_ipv4_hdr);
-
-    // construct l4 header
-    udp_hdr = &pkt_data->pkt_hdr.udp_hdr;
-    udp_hdr->src_port = rte_cpu_to_be_16(src_port);
-    udp_hdr->dst_port = rte_cpu_to_be_16(dst_port);
-    udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
-    udp_hdr->dgram_len = rte_cpu_to_be_16(sizeof(struct packet_data) -
-                                          sizeof(struct rte_ether_hdr) -
-                                          sizeof(struct rte_udp_hdr));
-    buf->l4_len = sizeof(struct rte_udp_hdr);
-
-    return pkt_data;
-}
-
-static inline
-struct packet_data * check_valid_packet(struct rte_mbuf * pkt)
-{
-    struct packet_data * pkt_data = NULL;
-
-    if (rte_pktmbuf_data_len(pkt) < sizeof(struct packet_data)) {
-        return NULL;
-    }
-
-    pkt_data = rte_pktmbuf_mtod(pkt, struct packet_data *);
-
-    if (rte_be_to_cpu_32(pkt_data->magic) == ETHER_FRAME_MAGIC) {
-        return pkt_data;
-    }
-
-    return NULL;
-}
--- a/inc/storage/drivers/bdev.hh
+++ b/inc/storage/drivers/bdev.hh
@ -0,0 +1,56 @@
+#pragma once
+
+#include "storage/drivers/driver.hh"
+#include "spdk/bdev.h"
+#include "spdk/bdev_zone.h"
+#include "spdk/thread.h"
+
+class birb_bdev_driver : public birb_driver // birb_driver implementation declared against the SPDK bdev headers
+{
+public:
+    birb_bdev_driver(const char * dev_name); // NOTE(review): dev_name is presumably the bdev to open — confirm in the .cc
+    ~birb_bdev_driver() override;
+    size_t get_capacity() override;
+    birb_driver_status get_status() override;
+    struct spdk_bdev * get_bdev(); // NOTE(review): presumably returns the `bdev` member — confirm
+    struct spdk_bdev_desc * get_bdev_desc(); // NOTE(review): presumably returns the `bdev_desc` member — confirm
+    birb_driver_type get_type() override;
+    size_t get_align() override;
+
+private:
+    DISALLOW_EVIL_CONSTRUCTORS(birb_bdev_driver); // macro is not defined in this header
+    struct spdk_bdev_desc * bdev_desc;
+    struct spdk_bdev * bdev;
+    size_t block_sz;
+    size_t block_num;
+    birb_driver_status status;
+
+    static void print_all_bdev();
+    static void bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev * bdev, // NOTE(review): looks like an SPDK bdev event callback — confirm
+	                void * event_ctx);
+};
+
+
+class birb_bdev_thread_context : public birb_driver_thread_context // read/write/poll context built from a birb_bdev_driver
+{
+public:
+    birb_bdev_thread_context(birb_bdev_driver * driver);
+    ~birb_bdev_thread_context() override;
+    int read(size_t offset, size_t size, char * buffer, callback callback, void * context) override; // `callback` type comes from the base class
+    int write(size_t offset, size_t size, char * buffer, callback callback, void * context) override;
+    void poll() override;
+    birb_driver::birb_driver_status get_status() override;
+
+private:
+    struct cb_context { // pairs a callback with its void* argument
+        callback cb;
+        void * ctx;
+    }; // NOTE(review): presumably what io_callback receives as cb_arg — confirm in the .cc
+
+    DISALLOW_EVIL_CONSTRUCTORS(birb_bdev_thread_context);
+    spdk_io_channel * io_channel;
+    birb_driver::birb_driver_status status;
+    birb_bdev_driver * driver;
+
+    static void io_callback(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
+};
--- a/inc/storage/drivers/driver.hh
+++ b/inc/storage/drivers/driver.hh
@ -0,0 +1,47 @@
+#pragma once
+
+#include "defs.hh"
+
+#include "spdk/thread.h"
+#include <cstdlib>
+
+class birb_driver // abstract base: all four getters are pure virtual
+{
+private:
+    DISALLOW_EVIL_CONSTRUCTORS(birb_driver); // macro is not defined in this header
+
+public:
+    enum birb_driver_status{ // return type of get_status()
+        BIRB_SUCCESS,
+        BIRB_FAIL
+    };
+    enum birb_driver_type{ // return type of get_type()
+        BIRB_DRV_NVME,
+        BIRB_DRV_BDEV
+    };
+    virtual size_t get_capacity() = 0;
+    virtual birb_driver_status get_status() = 0;
+    virtual size_t get_align() = 0;
+    virtual birb_driver_type get_type() = 0;
+    virtual ~birb_driver() = default; // virtual: derived drivers may be destroyed through a base pointer
+protected:
+    birb_driver() = default; // protected: only subclasses can construct
+};
+
+
+class birb_driver_thread_context // abstract I/O interface: read/write/poll/get_status are pure virtual
+{
+private:
+    DISALLOW_EVIL_CONSTRUCTORS(birb_driver_thread_context);
+    
+public:
+    using callback = void (*)(bool, void *); // NOTE(review): presumably (success, context) — confirm with implementers
+    virtual int read(size_t offset, size_t size, char * buffer, callback callback, void * context) = 0; // the parameter named `callback` hides the alias inside this declarator
+    virtual int write(size_t offset, size_t size, char * buffer, callback callback, void * context) = 0;
+    virtual void poll() = 0;
+    virtual birb_driver::birb_driver_status get_status() = 0;
+    virtual ~birb_driver_thread_context() = default; // virtual: safe deletion through a base pointer
+protected:
+    birb_driver_thread_context() = default; // protected: only subclasses can construct
+};
+
--- a/inc/storage/drivers/nvme.hh
+++ b/inc/storage/drivers/nvme.hh
@ -0,0 +1,65 @@
+#pragma once
+
+#include "storage/drivers/driver.hh"
+#include "spdk/nvme.h"
+#include "spdk/thread.h"
+
+class birb_nvme_driver : public birb_driver // birb_driver implementation declared against the SPDK NVMe headers
+{
+public:
+    birb_nvme_driver(const char * dev_name); // NOTE(review): dev_name presumably selects the controller to attach — confirm in the .cc
+    ~birb_nvme_driver() override;
+    size_t get_capacity() override;
+    birb_driver_status get_status() override;
+    birb_driver_type get_type() override;
+    size_t get_align() override;
+
+    spdk_nvme_ctrlr * get_ctrlr(); // NOTE(review): presumably returns the `ctrlr` member — confirm
+    spdk_nvme_ns * get_ns(); // NOTE(review): presumably returns the `ns` member — confirm
+    spdk_nvme_io_qpair_opts * get_io_qpair_opts(); // NOTE(review): presumably points at the `opts` member — confirm
+
+private:
+    struct attach_context { // NOTE(review): presumably the cb_ctx given to probe_cb/attach_cb — confirm
+        spdk_nvme_ctrlr ** ctrlr; // pointer-to-pointer: lets the holder write a controller handle back
+        spdk_nvme_ns ** ns;
+        const char * dev_name;
+        int valid;
+    };
+
+    DISALLOW_EVIL_CONSTRUCTORS(birb_nvme_driver);
+    birb_driver_status status;
+    spdk_nvme_ctrlr * ctrlr;
+    spdk_nvme_ns * ns;
+    spdk_nvme_io_qpair_opts opts;
+
+    static bool probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, struct spdk_nvme_ctrlr_opts *opts);
+    static void attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
+	                        struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts);
+};
+
+
+class birb_nvme_thread_context : public birb_driver_thread_context // read/write/poll context built from a birb_nvme_driver
+{
+public:
+    birb_nvme_thread_context(birb_nvme_driver * driver);
+    ~birb_nvme_thread_context() override;
+    int read(size_t offset, size_t size, char * buffer, callback callback, void * context) override; // `callback` type comes from the base class
+    int write(size_t offset, size_t size, char * buffer, callback callback, void * context) override;
+    void poll() override;
+    birb_driver::birb_driver_status get_status() override;
+
+private:
+    struct cb_context { // pairs a callback with its void* argument
+        callback cb;
+        void * ctx;
+    }; // NOTE(review): presumably what io_callback receives as `arg` — confirm in the .cc
+
+    DISALLOW_EVIL_CONSTRUCTORS(birb_nvme_thread_context);
+    birb_driver::birb_driver_status status;
+    birb_nvme_driver * driver;
+    struct spdk_nvme_qpair * qpair;
+
+    static void io_callback(void *arg, const struct spdk_nvme_cpl *completion);
+    static uint32_t size_to_lba(size_t size, int lba_size); // NOTE(review): units of size/lba_size are presumably bytes — confirm
+    static uint64_t addr_to_lba(size_t addr, int lba_size);
+};
--- a/inc/storage/drivers/posix.hh
+++ b/inc/storage/drivers/posix.hh
@ -0,0 +1,47 @@
+#pragma once
+
+#include "defs.hh"
+
+#include "spdk/thread.h"
+#include <cstdlib>
+
+class birb_driver // NOTE(review): identical to the class of the same name in storage/drivers/driver.hh;
+{                 // `#pragma once` is per file, so a TU including both headers sees two definitions.
+private:
+    DISALLOW_EVIL_CONSTRUCTORS(birb_driver); // macro is not defined in this header
+
+public:
+    enum birb_driver_status{ // return type of get_status()
+        BIRB_SUCCESS,
+        BIRB_FAIL
+    };
+    enum birb_driver_type{ // return type of get_type()
+        BIRB_DRV_NVME,
+        BIRB_DRV_BDEV
+    };
+    virtual size_t get_capacity() = 0;
+    virtual birb_driver_status get_status() = 0;
+    virtual size_t get_align() = 0;
+    virtual birb_driver_type get_type() = 0;
+    virtual ~birb_driver() = default; // virtual: derived drivers may be destroyed through a base pointer
+protected:
+    birb_driver() = default; // protected: only subclasses can construct
+};
+
+
+class birb_driver_thread_context // NOTE(review): identical to the class of the same name in storage/drivers/driver.hh — confirm this copy is intended
+{
+private:
+    DISALLOW_EVIL_CONSTRUCTORS(birb_driver_thread_context);
+    
+public:
+    using callback = void (*)(bool, void *); // NOTE(review): presumably (success, context) — confirm with implementers
+    virtual int read(size_t offset, size_t size, char * buffer, callback callback, void * context) = 0; // the parameter named `callback` hides the alias inside this declarator
+    virtual int write(size_t offset, size_t size, char * buffer, callback callback, void * context) = 0;
+    virtual void poll() = 0;
+    virtual birb_driver::birb_driver_status get_status() = 0;
+    virtual ~birb_driver_thread_context() = default; // virtual: safe deletion through a base pointer
+protected:
+    birb_driver_thread_context() = default; // protected: only subclasses can construct
+};
+
--- a/inc/storage/io_gen.hh
+++ b/inc/storage/io_gen.hh
@ -0,0 +1,53 @@
+#pragma once
+#include <sys/endian.h>
+#include <sys/types.h>
+#include "defs.hh"
+#include "gen.hh"
+#include <random>
+
+enum io_generator_opcode { // type of io_generator_ctx::op
+    IOGEN_READ,
+    IOGEN_WRITE
+};
+
+enum io_generator_address_mode { // passed to the io_generator constructor as addr_mode
+    IOGEN_ADDR_MONOTONIC_INCREASING,
+    IOGEN_ADDR_UNIFORM_RANDOM
+};
+
+struct io_generator_ctx { // argument type of io_generator::issue()
+    unsigned long size; // NOTE(review): presumably the request size in bytes — confirm
+    uint64_t offset;
+    io_generator_opcode op;
+};
+
+//
+// cur_offset is aligned to req_size
+//
+class io_generator {
+public:
+    int issue(struct io_generator_ctx * ctx, char * buf); // NOTE(review): presumably fills *ctx with the next request — confirm in the .cc
+    io_generator(unsigned long req_size,
+                    unsigned long capacity,
+                    unsigned int read_pct,
+                    io_generator_address_mode addr_mode);
+    io_generator() = delete; // no default construction
+    
+private:
+    unsigned long cur_offset;
+
+    const unsigned long capacity; // members initialise in declaration order: capacity before req_size,
+    const unsigned long req_size; // which is the reverse of the constructor's argument order
+    const unsigned int read_pct;
+    const io_generator_address_mode addr_mode;
+
+    std::random_device rd; // NOTE(review): rd/rng/dist presumably pick read vs write against read_pct — confirm
+    std::mt19937 rng;
+    std::uniform_int_distribution<int> dist;
+
+    std::random_device addr_rd; // NOTE(review): addr_* presumably produce offsets for IOGEN_ADDR_UNIFORM_RANDOM — confirm
+    std::mt19937 addr_rng;
+    std::uniform_int_distribution<uint64_t> addr_dist;
+    
+    DISALLOW_EVIL_CONSTRUCTORS(io_generator);
+};
--- a/khat/khat.cc
+++ b/khat/khat.cc
@ -1,378 +0,0 @@
-#include <cstdio>
-#include <cstdlib>
-#include <rte_common.h>
-#include <rte_eal.h>
-#include <rte_ethdev.h>
-#include <rte_cycles.h>
-#include <rte_lcore.h>
-#include <rte_mbuf.h>
-#include <rte_byteorder.h>
-#include <rte_config.h>
-#include <rte_ether.h>
-#include <rte_launch.h>
-#include <atomic>
-#include <unistd.h>
-
-#include "pkt.h"
-#include "ntrlog.h"
-#include "rte_arp.h"
-#include "rte_mbuf_core.h"
-
-NTR_DECL_IMPL;
-
-constexpr unsigned int MBUF_MAX_COUNT = 8191;
-constexpr unsigned int MBUF_CACHE_SIZE = 250;
-constexpr unsigned int RX_RING_SIZE = 1024;
-constexpr unsigned int TX_RING_SIZE = 1024;
-constexpr unsigned int RX_RING_NUM = 1;
-constexpr unsigned int TX_RING_NUM = 1;
-constexpr unsigned int BURST_SIZE = 32;
-
-static const struct rte_eth_conf port_conf_default{};
-
-struct options_t {
-    //states
-    uint16_t s_portid;
-    struct rte_ether_addr s_host_mac;
-    struct rte_mempool * s_pkt_mempool;
-};
-
-struct options_t options;
-
-static uint16_t
-rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
-        struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused, void *_ __rte_unused)
-{
-    uint64_t now = rte_rdtsc();
-    struct packet_data * pkt_data;
-    for (int i = 0; i < nb_pkts; i++) {
-        pkt_data = check_valid_packet(pkts[i]);
-
-        if (pkt_data == NULL) {
-            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "rx_add_timestamp: ignoring invalid packet %p.\n", (void*)pkts[i]);
-            continue;  
-        }
-
-        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "rx_add_timestamp: tagged packet %p with %llu.\n", (void*)pkts[i], now);        
-        pkt_data->srv_ts_rx = rte_cpu_to_be_64(now);
-    }
-
-    return nb_pkts;
-}
-
-static uint16_t
-tx_calc_latency(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
-		struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
-{
-    uint64_t now = rte_rdtsc();
-    struct packet_data * pkt_data;
-
-    for (int i = 0; i < nb_pkts; i++) {
-
-        pkt_data = check_valid_packet(pkts[i]);
-
-        if (pkt_data == NULL) {
-            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "tx_calc_latency: ignoring invalid packet %p.\n", (void*)pkts[i]);
-            continue;
-        }
-
-        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "tx_calc_latency: tagged packet %p with %llu.\n", (void*)pkts[i], now);    
-        pkt_data->srv_ts_tx = rte_cpu_to_be_64(now);
-    }
-
-    return nb_pkts;
-}
-
-static int
-locore_main(void * _unused __rte_unused)
-{
-    struct rte_mbuf *bufs[BURST_SIZE];
-    struct rte_mbuf *tx_bufs[BURST_SIZE];
-    struct packet_data *pkt_data;
-    uint32_t core_id = rte_lcore_id();
-
-    if (rte_eth_dev_socket_id(options.s_portid) > 0 && rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) {
-        ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,  "locore_main: WARNING, port %d is on remote NUMA node to "
-                "polling thread.\n\tPerformance will "
-                "not be optimal.\n", options.s_portid);
-    }
-
-	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d running.\n", core_id);
-
-	while(true) {
-        uint16_t nb_tx = 0;
-        const uint16_t nb_rx = rte_eth_rx_burst(options.s_portid, 0, bufs, BURST_SIZE);
-
-        if (nb_rx == 0) {
-            continue;
-        }
-        
-        for(int i = 0; i < nb_rx; i++) {
-
-            pkt_data = check_valid_packet(bufs[i]);
-
-            if (pkt_data == NULL) {
-                ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: core %d skipping invalid packet %p.\n", core_id, (void*)bufs[i]);
-                dump_pkt(bufs[i]);
-                rte_pktmbuf_free(bufs[i]);
-                continue;
-            }
-            
-            uint32_t dst_ip = rte_be_to_cpu_32(pkt_data->pkt_hdr.ipv4_hdr.dst_addr);
-            uint32_t src_ip = rte_be_to_cpu_32(pkt_data->pkt_hdr.ipv4_hdr.src_addr);
-            uint16_t src_port = rte_be_to_cpu_16(pkt_data->pkt_hdr.udp_hdr.src_port);
-            uint16_t dst_port = rte_be_to_cpu_16(pkt_data->pkt_hdr.udp_hdr.dst_port);
-            ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d packet %p from %d.%d.%d.%d(%x:%x:%x:%x:%x:%x) to %d.%d.%d.%d(%x:%x:%x:%x:%x:%x), sport %d, dport %d, epoch %d\n", 
-                                                                                            core_id,
-                                                                                            (void*)bufs[i],
-                                                                                            (src_ip >> 24) & 0xff,
-                                                                                            (src_ip >> 16) & 0xff,
-                                                                                            (src_ip >> 8) & 0xff,
-                                                                                            (src_ip >> 0) & 0xff,
-                                                                                            pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[0],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[1],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[2],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[3],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[4],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[5],
-                                                                                            (dst_ip >> 24) & 0xff,
-                                                                                            (dst_ip >> 16) & 0xff,
-                                                                                            (dst_ip >> 8) & 0xff,
-                                                                                            (dst_ip >> 0) & 0xff,
-                                                                                            pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[0],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[1],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[2],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[3],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[4],
-                                                                                            pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[5], 
-                                                                                            src_port,
-                                                                                            dst_port,
-                                                                                            rte_be_to_cpu_32(pkt_data->epoch));
-            // swap s_addr and d_addr
-            struct rte_mbuf * pkt_buf = rte_pktmbuf_alloc(options.s_pkt_mempool);
-            if (pkt_buf == NULL) {
-                rte_exit(EXIT_FAILURE, "locore_main: failed to allocate memory for pkt_buf");
-            }
-
-            struct packet_data * tx_data = construct_udp_pkt_hdr(pkt_buf, 
-                                                &options.s_host_mac, 
-                                                &pkt_data->pkt_hdr.eth_hdr.s_addr, 
-                                                dst_ip, 
-                                                src_ip, 
-                                                dst_port, 
-                                                src_port);
-            if (tx_data == NULL) {
-                rte_exit(EXIT_FAILURE, "failed to construct tx packet %p", (void*)pkt_buf);
-            }
-            // copy, endianess doesn't matter
-            tx_data->epoch = pkt_data->epoch;
-            tx_data->magic = pkt_data->magic;
-            tx_data->clt_ts_rx = pkt_data->clt_ts_rx;
-            tx_data->clt_ts_tx = pkt_data->clt_ts_tx;
-            tx_data->srv_ts_rx = pkt_data->srv_ts_rx;
-            tx_data->srv_ts_tx = pkt_data->srv_ts_tx;
-            // queue for burst send
-            tx_bufs[nb_tx++] = pkt_buf;
-            // free rx packet
-            rte_pktmbuf_free(bufs[i]);
-        }
-
-        const uint16_t nb_tx_succ = rte_eth_tx_burst(options.s_portid, 0, tx_bufs, nb_tx);
-        // cleanup unsent packets
-        // don't need to free others because it's offloaded
-        if (nb_tx_succ < nb_tx) {
-            rte_exit(EXIT_FAILURE, "locore_main: failed to send some packets.\n");
-        }
-	}
-
-    return 0;
-}
-
-static int 
-port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
-{
-    struct rte_eth_dev_info dev_info;
-    struct rte_eth_conf port_conf = port_conf_default;
-    struct rte_eth_txconf txconf;
-    struct rte_eth_rxconf rxconf;
-
-    uint16_t nb_rxd = RX_RING_SIZE;
-	uint16_t nb_txd = TX_RING_SIZE; 
-
-    if(!rte_eth_dev_is_valid_port(portid)) {
-        return -1;
-    }
-
-    int ret = rte_eth_dev_info_get(portid, &dev_info);
-    if (ret != 0) {
-        return ret;
-    }
-
-    port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN;
-    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM;
-    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM;
-    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
-    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
-    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;
-    
-    /* Configure the Ethernet device. */
-    ret = rte_eth_dev_configure(portid, RX_RING_NUM, TX_RING_NUM, &port_conf);
-	if (ret != 0)
-		return ret;
-
-	ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
-	if (ret != 0)
-		return ret;
-
-	/* Allocate and set up 1 RX queue per Ethernet port. */
-    rxconf = dev_info.default_rxconf;
-	for (uint32_t i = 0; i < RX_RING_NUM; i++) {
-		ret = rte_eth_rx_queue_setup(portid, i, nb_rxd, rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool);
-		if (ret < 0)
-			return ret;
-	}
-
-    txconf = dev_info.default_txconf;
-	txconf.offloads = port_conf.txmode.offloads;
-	/* Allocate and set up 1 TX queue per Ethernet port. */
-	for (uint32_t i = 0; i < TX_RING_NUM; i++) {
-		ret = rte_eth_tx_queue_setup(portid, i, nb_txd, rte_eth_dev_socket_id(portid), &txconf);
-		if (ret < 0)
-			return ret;
-	}
-
-    ret = rte_eth_dev_start(portid);
-    if (ret < 0)
-        return ret;
-
-	/* Display the port MAC address. */
-    struct rte_ether_addr addr;
-    ret = rte_eth_macaddr_get(portid, &addr);
-    if (ret != 0)
-        return ret;
-    
-    /* Enable RX in promiscuous mode for the Ethernet device. */
-    ret = rte_eth_promiscuous_enable(portid);
-	if (ret != 0)
-		return ret;
-
-    if (rte_eth_add_tx_callback(portid, 0, tx_calc_latency, NULL) == NULL || rte_eth_add_rx_callback(portid, 0, rx_add_timestamp, NULL) == NULL) {
-        return -1;
-    }
-
-	return 0;
-}
-
-static void usage()
-{
-    fprintf(stdout, 
-            "Usage:\n" \
-            "    -v(vv): verbose mode\n" \
-            "    -h: display the information\n");
-}
-
-int main(int argc, char* argv[])
-{
-    unsigned int nb_ports;
-    struct rte_mempool *mbuf_pool, *mbuf_pool_pkt;
-
-    // init dpdk
-    int ret = rte_eal_init(argc, argv);
-    if (ret < 0) {
-        rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n");
-    }
-
-    argc -= ret;
-    argv += ret;
-
-    // set warning level
-    ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
-    {
-        int c;
-        // parse arguments
-        while((c = getopt(argc, argv, "hv")) != -1) {
-            switch (c) {
-                case 'v':
-                    ntr_set_level(NTR_DEP_USER1, ntr_get_level(NTR_DEP_USER1) + 1);
-                    break;
-                case 'h':
-                    usage();
-                    rte_exit(EXIT_SUCCESS, NULL);
-                    break;
-                default:
-                    usage();
-                    rte_exit(EXIT_SUCCESS, "unknown argument: %c", c);
-                    break;
-            }
-        }
-    }
-
-    // XXX: singal handler to exit
-
-    nb_ports = rte_eth_dev_count_avail();
-    if (nb_ports == 0) {
-        rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
-    }
-
-    // create a mbuf memory pool on the socket
-    mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
-    if (mbuf_pool == nullptr) {
-        rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
-    }
-
-    // create a pkt mbuf memory pool on the socket
-    mbuf_pool_pkt = rte_pktmbuf_pool_create("MBUF_POOL_PKT", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
-    if (mbuf_pool_pkt == nullptr) {
-        rte_exit(EXIT_FAILURE, "cannot create mbuf_pkt pool\n");
-    }
-    options.s_pkt_mempool = mbuf_pool_pkt;
-
-
-    uint16_t portid = rte_eth_find_next(0);
-    if (portid == RTE_MAX_ETHPORTS) {
-        rte_exit(EXIT_FAILURE, "cannot find an available port\n");
-    }
-    options.s_portid = portid;
-
-    if (port_init(portid, mbuf_pool) != 0) {
-        rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid);
-    }
-
-    if (rte_eth_macaddr_get(portid, &options.s_host_mac) != 0) {
-        rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n", portid);
-    }
-
-    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid,
-                                                                                                options.s_host_mac.addr_bytes[0],
-                                                                                                options.s_host_mac.addr_bytes[1],
-                                                                                                options.s_host_mac.addr_bytes[2],
-                                                                                                options.s_host_mac.addr_bytes[3],
-                                                                                                options.s_host_mac.addr_bytes[4],
-                                                                                                options.s_host_mac.addr_bytes[5]);
-
-
-    uint16_t lcore_id = rte_get_next_lcore(0, true, false);
-
-    if (lcore_id == RTE_MAX_LCORE) {
-        rte_exit(EXIT_FAILURE, "cannot detect lcores.\n");
-    }
-    
-    if (rte_eal_remote_launch(locore_main, NULL, lcore_id) != 0) {
-        rte_exit(EXIT_FAILURE, "failed to launch function on locore %d\n", lcore_id);
-    }
-
-    // while(true) {
-    //     struct rte_eth_stats stats;
-    //     rte_eth_stats_get(portid, &stats);
-    //     printf("recv: %d missed: %d err: %d\n",(uint32_t)stats.ipackets, (uint32_t)stats.imissed,(uint32_t)stats.ierrors);
-    //     usleep(1000000);
-    // }
-
-    if (rte_eal_wait_lcore(lcore_id) != 0) {
-        rte_exit(EXIT_FAILURE, "failed to wait for locore %d\n", lcore_id);
-    }
-
-    // shouldn't get here
-
-    return 0;
-}
--- a/libgen/generator.cc
+++ b/libgen/generator.cc
@ -0,0 +1,95 @@
+// modified from mutilate
+
+#include "gen.hh"
+
+Generator * // returns a new GEV; selected by createGenerator("fb_key")
+createFacebookKey()
+{
+	return new GEV(30.7984, 8.20449, 0.078688);
+}
+
+Generator * // returns a new Discrete wrapping a GPareto; selected by createGenerator("fb_value")
+createFacebookValue()
+{
+	Generator *g = new GPareto(15.0, 214.476, 0.348238);
+
+	Discrete *d = new Discrete(g); // NOTE(review): g is presumably the fallback distribution — confirm in gen.hh
+	d->add(0.00536, 0.0); // NOTE(review): arguments look like (probability, value) — confirm in gen.hh
+	d->add(0.00047, 1.0);
+	d->add(0.17820, 2.0);
+	d->add(0.09239, 3.0);
+	d->add(0.00018, 4.0);
+	d->add(0.02740, 5.0);
+	d->add(0.00065, 6.0);
+	d->add(0.00606, 7.0);
+	d->add(0.00023, 8.0);
+	d->add(0.00837, 9.0);
+	d->add(0.00837, 10.0);
+	d->add(0.08989, 11.0);
+	d->add(0.00092, 12.0);
+	d->add(0.00326, 13.0);
+	d->add(0.01980, 14.0);
+
+	return d;
+}
+
+Generator * // returns a new GPareto; selected by createGenerator("fb_ia")
+createFacebookIA()
+{
+	return new GPareto(0, 16.0292, 0.154971);
+}
+
+// Build a heap-allocated Generator from a spec: "fb_key" / "fb_value" / "fb_ia";
+// a string atoi() reads as non-zero, or exactly "0" (-> Fixed); otherwise
+// "<type>[:a1[,a2[,a3]]]", <type> being matched case-insensitively anywhere in
+// the spec and missing arguments defaulting to 0.0. Unknown specs go to DIE().
+Generator *
+createGenerator(std::string str)
+{
+	if (!strcmp(str.c_str(), "fb_key"))
+		return createFacebookKey();
+	else if (!strcmp(str.c_str(), "fb_value"))
+		return createFacebookValue();
+	else if (!strcmp(str.c_str(), "fb_ia"))
+		return createFacebookIA();
+
+	char *s_copy = new char[str.length() + 1]; // strtok_r() writes into its input
+	strcpy(s_copy, str.c_str());
+	char *saveptr = NULL;
+	if (atoi(s_copy) != 0 || !strcmp(s_copy, "0")) {
+		double v = atof(s_copy);
+		delete[] s_copy;
+		return new Fixed(v);
+	}
+
+	char *t_ptr = strtok_r(s_copy, ":", &saveptr);
+	char *a_ptr = strtok_r(NULL, ":", &saveptr);
+	if (t_ptr == NULL) // || a_ptr == NULL)
+		DIE("strtok(.., \":\") failed to parse %s", str.c_str());
+
+	// A spec without ":<args>" leaves a_ptr NULL; strtok_r(NULL, ..) with a NULL
+	// save pointer is not defined by POSIX, so tokenize only when there is input.
+	saveptr = NULL;
+	char *s1 = a_ptr ? strtok_r(a_ptr, ",", &saveptr) : NULL;
+	char *s2 = s1 ? strtok_r(NULL, ",", &saveptr) : NULL;
+	char *s3 = s2 ? strtok_r(NULL, ",", &saveptr) : NULL;
+	double a1 = s1 ? atof(s1) : 0.0;
+	double a2 = s2 ? atof(s2) : 0.0;
+	double a3 = s3 ? atof(s3) : 0.0;
+	delete[] s_copy;
+
+	if (strcasestr(str.c_str(), "fixed"))
+		return new Fixed(a1);
+	else if (strcasestr(str.c_str(), "normal"))
+		return new Normal(a1, a2);
+	else if (strcasestr(str.c_str(), "exponential"))
+		return new Exponential(a1);
+	else if (strcasestr(str.c_str(), "pareto"))
+		return new GPareto(a1, a2, a3);
+	else if (strcasestr(str.c_str(), "gev"))
+		return new GEV(a1, a2, a3);
+	else if (strcasestr(str.c_str(), "uniform"))
+		return new Uniform(a1);
+	DIE("Unable to create Generator '%s'", str.c_str());
+	return NULL;
+}
--- a/libgen/loadgen.cc
+++ b/libgen/loadgen.cc
@ -0,0 +1,276 @@
+#include <sys/types.h>
+#include <sys/cpuset.h>
+#include <sys/domainset.h>
+#include <sys/endian.h>
+#include <sys/thr.h>
+
+#include <pthread.h>
+#include <pthread_np.h>
+#include <topo.h>
+#include <unistd.h>
+
+#include "nms.h"
+#include "gen.hh"
+
+#include <atomic>
+
+/*
+ * Per-thread body of the memory load generator.
+ *
+ * _tinfo points to this thread's struct thread_info.  The thread obtains a
+ * source and a destination buffer (the ones handed in through thread_info
+ * when opts->shared_buffer is set, otherwise private ones allocated here),
+ * publishes the outcome through init_status (1 on success, -1 on allocation
+ * failure) and then polls the shared state: while it is STATE_RUN it copies
+ * transaction_size bytes whenever the next scheduled timestamp has passed,
+ * and it leaves the loop on STATE_END.
+ *
+ * Always returns nullptr.
+ */
+void *
+memload_generator::worker_thrd(void *_tinfo)
+{
+	auto *tinfo = (struct thread_info *)_tinfo;
+	void *from_buffer, *to_buffer, *tmp;
+
+	if (tinfo->opts->shared_buffer) {
+		from_buffer = tinfo->from_buffer;
+		to_buffer = tinfo->to_buffer;
+	} else {
+		if (tinfo->opts->verbose) {
+			fprintf(stdout,
+			    "memload_generator <thread %d>: allocating fbuf %lu bytes on domain %d...\n",
+			    tinfo->tid, tinfo->opts->buffer_size,
+			    topo_core_to_numa(tinfo->coreid));
+		}
+		from_buffer = nms_alloc_static(topo_core_to_numa(
+							  tinfo->coreid),
+		    tinfo->opts->buffer_size);
+		if (tinfo->opts->verbose) {
+			fprintf(stdout,
+			    "memload_generator <thread %d>: allocating tbuf %lu bytes on domain %d...\n",
+			    tinfo->tid, tinfo->opts->buffer_size, tinfo->target_dom);
+		}
+		to_buffer = nms_alloc_static(tinfo->target_dom,
+		    tinfo->opts->buffer_size);
+	}
+
+	if (from_buffer == nullptr || to_buffer == nullptr) {
+		if (tinfo->opts->verbose) {
+			fprintf(stderr,
+			    "memload_generator <thread %d>: failed to allocate memory\n",
+			    tinfo->tid);
+		}
+		// Private buffers belong to this thread: release whichever one
+		// was obtained so a partial failure does not leak it.  Shared
+		// buffers are not owned here and are left alone.
+		if (!tinfo->opts->shared_buffer) {
+			if (from_buffer != nullptr) {
+				nms_free_static(from_buffer,
+				    tinfo->opts->buffer_size);
+			}
+			if (to_buffer != nullptr) {
+				nms_free_static(to_buffer,
+				    tinfo->opts->buffer_size);
+			}
+		}
+		tinfo->init_status.store(-1);
+		return nullptr;
+	}
+
+	// In pull mode the copy direction is reversed.
+	if (tinfo->pull) {
+		tmp = from_buffer;
+		from_buffer = to_buffer;
+		to_buffer = tmp;
+	}
+
+	// wait for other threads to init
+	if (tinfo->opts->verbose) {
+		fprintf(stdout, "memload_generator <thread %d, pull %d>: running...\n", tinfo->tid, tinfo->pull);
+	}
+	tinfo->init_status.store(1);
+
+	uint64_t next_ts = topo_uptime_ns();
+	size_t cur_offset = 0;
+	uint64_t cur_ts = 0;
+	while (true) {
+		switch (tinfo->state->load()) {
+		case STATE_RUN:
+			cur_ts = topo_uptime_ns();
+			if (cur_ts >= next_ts) {
+				// Wrap to the start when the next transaction
+				// would run past the end of the buffer.
+				if (cur_offset + tinfo->opts->transaction_size >
+				    tinfo->opts->buffer_size) {
+					cur_offset = 0;
+				}
+				memcpy((char *)to_buffer + cur_offset,
+				    (char *)from_buffer + cur_offset,
+				    tinfo->opts->transaction_size);
+				tinfo->num_trans.fetch_add(1);
+
+				// set_transactions() asks for the schedule to
+				// restart from the current time.
+				if (tinfo->reset_ts.load(
+					std::memory_order_relaxed)) {
+					tinfo->reset_ts.store(false,
+					    std::memory_order_relaxed);
+					next_ts = cur_ts;
+				}
+				next_ts += tinfo->ia_gen->generate() *
+				    (double)S2NS;
+				cur_offset += tinfo->opts->transaction_size;
+			}
+			break;
+		case STATE_END:
+			goto end;
+		case STATE_RDY:
+			// Keep the schedule anchored to "now" while idle.
+			next_ts = topo_uptime_ns();
+			break;
+		case STATE_INIT:
+		default:
+			break;
+		}
+	}
+end:
+	if (tinfo->opts->verbose) {
+		fprintf(stdout, "memload_generator <thread %d>: exiting...\n",
+		    tinfo->tid);
+	}
+
+	if (!tinfo->opts->shared_buffer) {
+		nms_free_static(from_buffer, tinfo->opts->buffer_size);
+		nms_free_static(to_buffer, tinfo->opts->buffer_size);
+	}
+	return nullptr;
+}
+
+/*
+ * Creates one worker thread per CPU set in `threads`, each pinned to its
+ * core, and waits until every worker has reported its initialization result.
+ *
+ *   threads        cores to run workers on; bits are cleared as they are
+ *                  consumed, so the caller's set is modified.
+ *   modes          a core set here makes its worker run in pull mode.
+ *   target_domain  the first set bit selects the target domain id.
+ *   opt            options, copied into this object.
+ *   success        set to true only when all workers initialized; the
+ *                  generator is then left in STATE_RDY.
+ */
+memload_generator::memload_generator(cpuset_t *threads, cpuset_t * modes, cpuset_t *target_domain,
+    struct memload_generator_options *opt, bool *success)
+{
+	*success = false;
+	state.store(STATE_INIT);
+	std::memcpy(&this->opts, opt, sizeof(memload_generator_options));
+
+	int nextcore = CPU_FFS(threads) - 1;
+	int target_domain_id = CPU_FFS(target_domain) - 1;
+	int num_cores = CPU_COUNT(threads);
+	if (target_domain_id < 0 || num_cores == 0) {
+		return;
+	}
+
+	double thread_tps = (double)opt->trans_per_second / (double)num_cores;
+	void *local_buffer = nullptr;
+	void *target_buffer = nullptr;
+	int tid = 0;
+
+	if (opts.shared_buffer) {
+		local_buffer = nms_alloc_static(topo_core_to_numa(nextcore),
+		    opt->buffer_size);
+		target_buffer = nms_alloc_static(target_domain_id,
+		    opt->buffer_size);
+		if (local_buffer == nullptr || target_buffer == nullptr) {
+			// Release whichever half succeeded; nothing else
+			// references these buffers yet.
+			if (local_buffer != nullptr) {
+				nms_free_static(local_buffer, opt->buffer_size);
+			}
+			if (target_buffer != nullptr) {
+				nms_free_static(target_buffer,
+				    opt->buffer_size);
+			}
+			*success = false;
+			goto end;
+		}
+	}
+
+	while (nextcore != -1) {
+		auto info = new struct thread_info;
+		cpuset_t cpuset;
+		pthread_attr_t attr;
+		int rc;
+
+		info->ia_gen = createGenerator(opts.ia_dist);
+		if (info->ia_gen == nullptr) {
+			// info was never published to thr_infos.
+			delete info;
+			goto end;
+		}
+		info->ia_gen->set_lambda(thread_tps);
+		info->init_status.store(0);
+		info->state = &this->state;
+		info->reset_ts.store(false, std::memory_order_relaxed);
+		info->num_trans.store(0);
+		info->opts = &this->opts;
+		info->tid = tid;
+		info->coreid = nextcore;
+		info->target_dom = target_domain_id;
+		info->from_buffer = local_buffer;
+		info->to_buffer = target_buffer;
+		info->pull = CPU_ISSET(nextcore, modes);
+
+		CPU_ZERO(&cpuset);
+		CPU_SET(nextcore, &cpuset);
+		pthread_attr_init(&attr);
+		pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &cpuset);
+		rc = pthread_create(&info->pthr, &attr, worker_thrd, info);
+		pthread_attr_destroy(&attr);
+		if (rc != 0) {
+			// Without this check the wait loop below would spin
+			// forever on a thread that never started.
+			fprintf(stderr,
+			    "memload_generator: pthread_create failed with %d\n",
+			    rc);
+			delete info;
+			state.store(STATE_END);
+			goto end;
+		}
+
+		if (opts.verbose) {
+			fprintf(stdout,
+			    "memload_generator: created thread %d on core %d target domain %d\n",
+			    tid, nextcore, target_domain_id);
+		}
+
+		thr_infos.push_back(info);
+
+		CPU_CLR(nextcore, threads);
+		nextcore = CPU_FFS(threads) - 1;
+		tid++;
+	}
+
+	// Spin until every worker reports 1 (ready); -1 means it failed.
+	for (auto tinfo : thr_infos) {
+		int status;
+		while ((status = tinfo->init_status.load()) != 1) {
+			if (status == -1) {
+				state.store(STATE_END);
+				*success = false;
+				goto end;
+			}
+		}
+	}
+
+	state.store(STATE_RDY);
+
+	*success = true;
+end:
+	if (opts.verbose) {
+		// Report the flag itself, not whether the pointer is non-null.
+		fprintf(stdout,
+		    "memload_generator: exiting constructor. Success: %d...\n",
+		    *success ? 1 : 0);
+	}
+}
+
+/*
+ * Moves the generator from STATE_RDY to STATE_RUN.
+ * Returns true when the transition was made, false when the generator was
+ * not in STATE_RDY.
+ */
+bool
+memload_generator::start()
+{
+	if (this->state.load() != STATE_RDY)
+		return false;
+
+	this->state.store(memload_generator::STATE_RUN);
+	return true;
+}
+
+/*
+ * Moves the generator from STATE_RUN back to STATE_RDY.
+ * Returns true when the transition was made, false when the generator was
+ * not in STATE_RUN.
+ */
+bool
+memload_generator::stop()
+{
+	if (this->state.load() != STATE_RUN)
+		return false;
+
+	this->state.store(memload_generator::STATE_RDY);
+	return true;
+}
+
+/*
+ * Spreads a new total rate `tps` evenly over all worker threads by updating
+ * each worker's inter-arrival generator, and flags each worker to restart its
+ * schedule.  Refused (returns false) while the generator is in STATE_END or
+ * STATE_INIT; returns true otherwise.
+ */
+bool
+memload_generator::set_transactions(uint64_t tps)
+{
+	if (this->state.load() == STATE_END ||
+	    this->state.load() == STATE_INIT) {
+		return false;
+	}
+
+	for (auto worker : thr_infos) {
+		const double per_thread = (double)tps /
+		    (double)thr_infos.size();
+
+		worker->ia_gen->set_lambda(per_thread);
+		worker->reset_ts.store(true, std::memory_order_relaxed);
+	}
+	return true;
+}
+
+/*
+ * Returns the sum of the per-thread transaction counters.
+ */
+uint64_t
+memload_generator::get_transactions()
+{
+	uint64_t sum = 0;
+
+	for (size_t idx = 0; idx < thr_infos.size(); idx++) {
+		sum += thr_infos[idx]->num_trans.load();
+	}
+	return sum;
+}
+
+/*
+ * Signals STATE_END, joins every worker thread and deletes its thread_info.
+ * In shared-buffer mode the two buffers recorded in the thread_info entries
+ * are released afterwards.
+ */
+memload_generator::~memload_generator()
+{
+	// Start from nullptr: with no worker threads the loop below never
+	// assigns these, and they must not reach nms_free_static() as garbage.
+	void *buf1 = nullptr;
+	void *buf2 = nullptr;
+
+	this->state.store(STATE_END);
+	for (auto i : thr_infos) {
+		// XXX: nms_free regions
+		pthread_join(i->pthr, NULL);
+		buf1 = i->from_buffer;
+		buf2 = i->to_buffer;
+		delete i;
+	}
+
+	if (opts.shared_buffer) {
+		if (buf1 != nullptr) {
+			nms_free_static(buf1, opts.buffer_size);
+		}
+		if (buf2 != nullptr) {
+			nms_free_static(buf2, opts.buffer_size);
+		}
+	}
+}
--- a/libnms/alloc.c
+++ b/libnms/alloc.c
@ -0,0 +1,205 @@
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/cpuset.h>
+#include <sys/domainset.h>
+#include <sys/thr.h>
+#include <sys/mman.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdatomic.h>
+#include <string.h>
+#include <assert.h>
+
+#include <nms.h>
+
+/* First dimension of nms_desc.regions and length of nms_desc.region_sz. */
+#define MAX_NUMA_DOMAINS (64)
+/* Second dimension of nms_desc.regions: region slots per domain. */
+#define MAX_REGIONS (64)
+/* Size in bytes of every region added by nms_desc_add_region(). */
+#define REGION_SIZE (1024 * 1024 * 1024)
+/* Granularity to which nms_region_malloc() rounds a region's fill level. */
+#define PAGE_SIZE (4096)
+
+/*
+ * One contiguous memory region that nms_region_malloc() carves allocations
+ * out of by advancing `occupied`.
+ */
+struct nms_region {
+	uintptr_t start_addr; /* address of the first byte of the region */
+	size_t size;          /* total size of the region in bytes */
+	size_t occupied;      /* bytes already handed out, counted from start_addr */
+};
+
+/*
+ * Allocator state: the regions created so far for each domain, plus the
+ * mutex that nms_desc_malloc() holds while using them.
+ */
+struct nms_desc {
+    // taken by nms_desc_malloc() around region lookup and creation
+    pthread_mutex_t alloc_lock;
+
+	// regions[d][0 .. region_sz[d]-1] are the regions in use for domain d
+	struct nms_region regions[MAX_NUMA_DOMAINS][MAX_REGIONS];
+	// number of regions in use for each domain
+	int region_sz[MAX_NUMA_DOMAINS];
+};
+
+/* Initialization flag driven by nms_init(): 0 = not started, 2 = in progress, 1 = done. */
+static _Atomic(int) initialized = 0;
+/* The single allocator descriptor used by nms_malloc() and nms_free(). */
+static struct nms_desc g_desc;
+
+/*
+ * Unmaps the sz bytes starting at buf.  The result of munmap() is not
+ * checked.
+ */
+void
+nms_free_static(void * buf, size_t sz)
+{
+	(void)munmap(buf, sz);
+}
+
+/*
+ * Maps sz bytes of anonymous memory while the calling thread's domain policy
+ * is temporarily restricted to node_id, touches every byte so the pages are
+ * faulted in under that policy, then restores the thread's original policy.
+ *
+ * Returns the mapping, or NULL when querying or changing the policy or the
+ * mmap() itself fails.  Release the result with nms_free_static(ptr, sz).
+ */
+void *
+nms_alloc_static(int node_id, size_t sz)
+{
+	long tid;
+	domainset_t orig_dom;
+	domainset_t tmp_domain;
+	int orig_policy;
+	int ret;
+	void * region;
+
+	thr_self(&tid);
+	DOMAINSET_ZERO(&orig_dom);
+
+	// save existing thread's allocation strategy
+	ret = cpuset_getdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, &orig_policy);
+	if (ret != 0) {
+		fprintf(stderr, "libnms: cpuset_getdomain failed with %d\n", errno);
+		return NULL;
+	}
+
+	DOMAINSET_ZERO(&tmp_domain);
+	DOMAINSET_SET(node_id, &tmp_domain);
+
+	ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(tmp_domain), &tmp_domain, DOMAINSET_POLICY_ROUNDROBIN);
+	if (ret != 0) {
+		fprintf(stderr, "libnms: cpuset_setdomain failed with %d\n", errno);
+		return NULL;
+	}
+
+	region = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_ANON | MAP_ALIGNED_SUPER | MAP_NOCORE | MAP_PRIVATE | MAP_PREFAULT_READ, -1, 0);
+	if (region == MAP_FAILED) {
+		fprintf(stderr, "libnms: mmap failed with %d\n", errno);
+		// Fall through so the original policy is still restored.
+		region = NULL;
+	} else {
+		// touch the pages to prefault the pages; volatile keeps both
+		// the read and the write from being optimized away
+		volatile uint8_t * bytes = region;
+		for (size_t i = 0; i < sz; i++) {
+			(void)bytes[i];
+			bytes[i] = (uint8_t)i;
+		}
+	}
+
+	// restore existing thread's allocation strategy, on success and on
+	// mmap failure alike
+	ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, orig_policy);
+	if (ret != 0) {
+		fprintf(stderr, "libnms: cpuset_setdomain failed with %d\n", errno);
+		if (region != NULL) {
+			// unmap exactly what was mapped
+			munmap(region, sz);
+		}
+		return NULL;
+	}
+
+	return region;
+}
+
+/*
+ * Zeroes the descriptor and initializes its allocation mutex with default
+ * attributes.  `verbose` is not used.  Always returns 0.
+ */
+static int
+nms_desc_init(struct nms_desc * desc, int verbose)
+{
+	(void)verbose;
+
+	memset(desc, 0, sizeof(*desc));
+	pthread_mutex_init(&desc->alloc_lock, NULL);
+
+	return 0;
+}
+
+/*
+ * Carves `size` bytes out of `region`.
+ *
+ * Returns the address of the carved block, or NULL when the region does not
+ * have `size` bytes left.  After a successful allocation the fill level is
+ * rounded up to a multiple of PAGE_SIZE.
+ */
+static void *
+nms_region_malloc(struct nms_region * region, size_t size)
+{
+	void * block;
+	size_t used;
+
+	if (region->size < region->occupied + size) {
+		return NULL;
+	}
+
+	block = (void *)(region->start_addr + region->occupied);
+	used = region->occupied + size;
+	region->occupied = (used + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+
+	return block;
+}
+
+/*
+ * Allocates a fresh REGION_SIZE region on `nodeid` and appends it to that
+ * domain's region table.
+ *
+ * `size` is accepted for interface compatibility but not used; every region
+ * is REGION_SIZE bytes.
+ *
+ * Returns 0 on success, EINVAL when nodeid is outside the table, and ENOMEM
+ * when the domain already holds MAX_REGIONS regions or the allocation fails.
+ */
+static int
+nms_desc_add_region(struct nms_desc * desc, int nodeid, size_t size)
+{
+	void * ret;
+	int idx;
+
+	(void)size;
+
+	// Both tables are fixed-size arrays: refuse anything that would
+	// index past their ends.
+	if (nodeid < 0 || nodeid >= MAX_NUMA_DOMAINS) {
+		fprintf(stderr, "libnms: invalid node %d\n", nodeid);
+		return EINVAL;
+	}
+	if (desc->region_sz[nodeid] >= MAX_REGIONS) {
+		fprintf(stderr, "libnms: no free region slots on node %d\n", nodeid);
+		return ENOMEM;
+	}
+
+	ret = nms_alloc_static(nodeid, REGION_SIZE);
+	if (ret == NULL) {
+		fprintf(stderr, "libnms: failed to allocate region on node %d\n", nodeid);
+		return ENOMEM;
+	}
+
+	desc->region_sz[nodeid]++;
+	idx = desc->region_sz[nodeid] - 1;
+	desc->regions[nodeid][idx].start_addr = (uintptr_t)ret;
+	desc->regions[nodeid][idx].occupied = 0;
+	desc->regions[nodeid][idx].size = REGION_SIZE;
+
+	return 0;
+}
+
+/*
+ * Allocates `size` bytes from the newest region of `nodeid`, adding a new
+ * region when there is none yet or the newest one is full.
+ *
+ * Returns NULL when nodeid is outside the region table, when size exceeds
+ * REGION_SIZE, or when a new region cannot be added.  The descriptor's
+ * alloc_lock is held for the duration of the lookup.
+ */
+static void *
+nms_desc_malloc(struct nms_desc * desc, unsigned int nodeid, size_t size)
+{
+	void * ret = NULL;
+	int idx;
+
+	// nodeid indexes fixed-size arrays; a negative int passed by a caller
+	// arrives here as a very large unsigned value and is rejected too.
+	if (nodeid >= MAX_NUMA_DOMAINS) {
+		return NULL;
+	}
+
+	if (size > REGION_SIZE) {
+		return NULL;
+	}
+
+	pthread_mutex_lock(&desc->alloc_lock);
+
+retry:
+	if (desc->region_sz[nodeid] > 0) {
+		idx = desc->region_sz[nodeid] - 1;
+		ret = nms_region_malloc(&desc->regions[nodeid][idx], size);
+	}
+
+	if (ret == NULL) {
+		// we need a new region
+		if (nms_desc_add_region(desc, nodeid, REGION_SIZE) != 0) {
+			pthread_mutex_unlock(&desc->alloc_lock);
+			return NULL;
+		}
+		fprintf(stdout, "libnms: malloc request of size %zu -> allocated new region on node %d\n", size, nodeid);
+		goto retry;
+	}
+
+	pthread_mutex_unlock(&desc->alloc_lock);
+	return ret;
+}
+
+/*
+ * Placeholder: releasing an individual allocation is not implemented, so
+ * this does nothing with any of its arguments.
+ */
+static void
+nms_desc_free(struct nms_desc * desc __attribute__((unused)),
+    unsigned int node __attribute__((unused)),
+    void * addr __attribute__((unused)))
+{
+	/* intentionally empty */
+}
+
+/*
+ * Initializes the global allocator descriptor once.
+ *
+ * The first caller switches `initialized` from 0 to 2, sets up g_desc and
+ * then stores 1.  Any other caller spins until the flag reads 1 and prints a
+ * notice.  Always returns 0.
+ */
+int
+nms_init(int verbose)
+{
+	int expected = 0;
+
+	if (!atomic_compare_exchange_strong(&initialized, &expected, 2)) {
+		// Someone else got there first; wait until they finish.
+		while (atomic_load(&initialized) != 1) {
+		}
+		fprintf(stdout,"libnms: already initialized.\n");
+		return 0;
+	}
+
+	nms_desc_init(&g_desc, verbose);
+	atomic_store(&initialized, 1);
+	return 0;
+}
+
+/*
+ * Allocates sz bytes for node `nodeid` from the global descriptor.
+ * Asserts that nms_init() has completed.  Returns whatever
+ * nms_desc_malloc() returns.
+ */
+void *
+nms_malloc(int nodeid, size_t sz)
+{
+	void * mem;
+
+	assert(atomic_load(&initialized) == 1);
+	mem = nms_desc_malloc(&g_desc, nodeid, sz);
+
+	return mem;
+}
+
+/*
+ * Forwards a release request for `addr` on node `nodeid` to
+ * nms_desc_free() on the global descriptor.  Asserts that nms_init() has
+ * completed.
+ */
+void
+nms_free(int nodeid, void * addr)
+{
+	assert(atomic_load(&initialized) == 1);
+
+	nms_desc_free(&g_desc, nodeid, addr);
+}
+
--- a/libntr/ntr.c
+++ b/libntr/ntr.c
@ -0,0 +1,46 @@
+#include "ntr.h"
+
+/*
+ * Current log level per dependency id, indexed by `dep`.
+ * NOTE(review): this initializer sets only element 0 to NTR_LEVEL_DEFAULT;
+ * the remaining elements are zero-initialized - confirm that is intended.
+ */
+static int ntr_log_levels[NTR_DEP_MAX] = { NTR_LEVEL_DEFAULT };
+/* Stream that ntr() writes to; assigned by ntr_init() and ntr_set_output(). */
+static FILE *ntr_out;
+
+/*
+ * Points the log output stream at stdout.
+ */
+void
+ntr_init()
+{
+	FILE *default_stream = stdout;
+
+	ntr_out = default_stream;
+}
+
+/*
+ * Writes a printf-style message for dependency `dep` when `level` does not
+ * exceed that dependency's configured level.
+ *
+ * Messages with a dep outside [0, NTR_DEP_MAX) are dropped.  If no output
+ * stream has been configured yet, stdout is used.
+ */
+void
+ntr(int dep, int level, const char *fmt, ...)
+{
+	va_list vl;
+
+	// A negative dep would index before the start of ntr_log_levels.
+	if (dep < 0 || dep >= NTR_DEP_MAX || level > ntr_log_levels[dep]) {
+		return;
+	}
+
+	va_start(vl, fmt);
+	// ntr_out stays NULL until ntr_init()/ntr_set_output() is called.
+	vfprintf(ntr_out != NULL ? ntr_out : stdout, fmt, vl);
+	va_end(vl);
+}
+
+/*
+ * Sets the log level of dependency `dep`.  Requests with a dep outside
+ * [0, NTR_DEP_MAX) are ignored.
+ */
+void
+ntr_set_level(int dep, int level)
+{
+	// Reject negative ids as well; they would write outside the array.
+	if (dep >= 0 && dep < NTR_DEP_MAX) {
+		ntr_log_levels[dep] = level;
+	}
+}
+
+/*
+ * Redirects log output to `f`.  A NULL stream is ignored and the current
+ * output is kept.
+ */
+void
+ntr_set_output(FILE *f)
+{
+	if (f == NULL) {
+		return;
+	}
+
+	ntr_out = f;
+}
+
+/*
+ * Returns the log level of dependency `dep`, or 0 when dep is outside
+ * [0, NTR_DEP_MAX).
+ */
+int
+ntr_get_level(int dep)
+{
+	// Reject negative ids as well; they would read outside the array.
+	if (dep >= 0 && dep < NTR_DEP_MAX) {
+		return ntr_log_levels[dep];
+	}
+	return 0;
+}
--- a/net/cat.cc
+++ b/net/cat.cc
@ -0,0 +1,989 @@
+#include <atomic>
+#include <cstdlib>
+#include <ctime>
+#include <fstream>
+#include <random>
+#include <vector>
+
+#include <topo.h>
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_config.h>
+#include <rte_cycles.h>
+#include <rte_eal.h>
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+#include <rte_launch.h>
+#include <rte_lcore.h>
+#include <rte_mbuf.h>
+#include <unistd.h>
+
+#include "ntr.h"
+#include "gen.hh"
+#include "net/netsup.hh"
+#include "net/pkt.hh"
+#include "nms.h"
+
+// Size of the rx mbuf array pkt_loop() passes to rte_eth_rx_burst().
+constexpr static unsigned int BURST_SIZE = 32;
+// Capacity of the mbuf arrays used when talking to slaves.
+constexpr static unsigned int MAX_SLAVES = 32;
+// Upper bound, in milliseconds, that wait_for_slaves() waits before exiting.
+constexpr static unsigned int SLAVES_MAX_WAIT_MS = 1000;
+
+// One measurement for a single probe, identified by its epoch.
+// clt_* fields are filled in locally; srv_* fields are copied from the STAT
+// packet received for the same epoch.  hw/sw distinguish hardware timestamps
+// from topo_uptime_ns() readings.
+struct datapt {
+	uint32_t epoch; // epoch number carried in the probe payload
+	uint32_t valid; // value of options.s_record when the probe was sent
+	uint64_t clt_hw_tx;
+	uint64_t clt_sw_tx;
+	uint64_t clt_hw_rx;
+	uint64_t clt_sw_rx;
+	uint64_t srv_hw_tx;
+	uint64_t srv_sw_tx;
+	uint64_t srv_hw_rx;
+	uint64_t srv_sw_rx;
+};
+
+// Values of options.s_state, the probe state machine driven by pkt_loop().
+constexpr static uint32_t STATE_WAIT = 0;     // waiting for sending
+constexpr static uint32_t STATE_SENT = 1;     // we sent a packet
+constexpr static uint32_t STATE_COMPLETE = 2; // we received everything
+constexpr static uint32_t STATE_PKTLOSS = 3;  // last packet sent was lost
+
+// Configuration and runtime state of this program, kept in the single
+// file-scope instance `options` below.  Members under "parameters" hold
+// configuration values; members prefixed with s_ under "states" are runtime
+// state.
+struct options_t {
+	// parameters
+	unsigned int run_time { 5 };
+	unsigned int warmup_time { 3 };
+	char output[256] = "output.txt";
+	char ia_gen_str[256] = "fixed";
+	unsigned int target_qps { 0 };
+	unsigned int master_mode { 0 };
+	struct net_spec server_spec { };
+	cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 2nd  core
+	std::vector<struct net_spec *> slaves;
+	uint32_t pkt_loss_failure_threshold { 0 };
+	uint32_t pkt_loss_time_ms { UINT32_MAX };
+	int portid { 0 };
+
+	// states
+	struct net_spec s_host_spec { };
+	struct conn_spec s_host_conn {
+		.src = &s_host_spec, .dst = &server_spec, .dst_port = POU_PORT
+	};
+	unsigned int s_rxqid { 0 };
+	unsigned int s_txqid { 0 };
+	unsigned int s_socketid { 0 };
+	// for qps calculation
+	std::atomic<uint32_t> s_recved_pkts { 0 };
+	std::atomic<uint32_t> s_pkt_loss { 0 };
+	std::atomic<uint64_t> s_start_time { 0 };
+	std::atomic<uint64_t> s_end_time { 0 };
+	std::atomic<uint32_t> s_slave_qps { 0 };
+	std::atomic<uint32_t> s_slave_recved { 0 };
+	std::atomic<uint32_t> s_slave_loss { 0 };
+	// one of the STATE_* constants
+	uint32_t s_state { STATE_WAIT };
+	bool s_hwtimestamp { true };
+
+	Generator *s_iagen { nullptr };
+	// completed data points, appended by pkt_loop()
+	std::vector<struct datapt *> s_data;
+	// data point of the probe currently in flight, or nullptr
+	struct datapt *s_last_datapt { nullptr };
+	// epoch to assign to the next probe
+	uint32_t s_epoch { 0 };
+	// set to make pkt_loop() return
+	std::atomic<bool> s_stop { false };
+	std::atomic<uint32_t> s_record { 0 };
+};
+
+static struct options_t options;
+
+/*
+ * RX callback: records client-side receive timestamps for the probe
+ * response that matches the outstanding data point.
+ *
+ * Does nothing unless a probe is in flight (STATE_SENT).  For every valid
+ * PKT_TYPE_PROBE_RESP packet whose epoch equals that of
+ * options.s_last_datapt, the software receive time is stored and, when
+ * hardware timestamping is enabled, the hardware receive time as well
+ * (exiting the program if it cannot be read).  Packets are never dropped:
+ * the function always returns nb_pkts.
+ */
+static uint16_t
+rx_add_timestamp(uint16_t port, uint16_t qidx __rte_unused,
+    struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused,
+    void *_ __rte_unused)
+{
+	uint64_t now = topo_uptime_ns();
+	struct pkt_hdr *pkt_data;
+	struct timespec ts { };
+	int ret;
+
+	if (options.s_state != STATE_SENT) {
+		return nb_pkts;
+	}
+
+	for (int i = 0; i < nb_pkts; i++) {
+		pkt_data = check_valid_packet(pkts[i],
+		    &options.s_host_spec.mac_addr);
+
+		if (pkt_data == nullptr) {
+			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+			    "rx_add_timestamp: ignoring invalid packet 0x%p.\n",
+			    (void *)pkts[i]);
+			continue;
+		}
+
+		if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE_RESP) {
+			uint32_t epoch = rte_be_to_cpu_32(
+			    ((struct pkt_payload_epoch *)pkt_data->payload)
+				->epoch);
+			if (options.s_last_datapt != nullptr &&
+			    options.s_last_datapt->epoch == epoch) {
+				if (options.s_hwtimestamp) {
+					if ((ret = rte_eth_timesync_read_rx_timestamp(
+						port, &ts, pkts[i]->timesync & 0x3)) ==
+						0) {
+						// has hw rx timestamp
+						options.s_last_datapt->clt_hw_rx =
+							ts.tv_sec * S2NS + ts.tv_nsec;
+						options.s_last_datapt->clt_sw_rx = now;
+						ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+							"rx_add_timestamp: tagged packet %p with sw: %lu hw: %lu.\n",
+							(void *)pkts[i], now,
+							options.s_last_datapt->clt_hw_rx);
+					} else {
+						rte_exit(EXIT_FAILURE,
+							"rx_add_timestamp: packet %p not tagged - hw ts not "
+							"available - %d.\n",
+							(void *)pkts[i], ret);
+					}
+				} else {
+					options.s_last_datapt->clt_sw_rx = now;
+					options.s_last_datapt->clt_hw_rx = 0;
+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+							"rx_add_timestamp: tagged packet %p with sw: %lu hw: (disabled).\n",
+							(void *)pkts[i], now);
+				}
+			} else if (options.s_last_datapt != nullptr) {
+				ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
+				    "rx_add_timestamp: packet %p epoch %d != last epoch %d.\n",
+				    (void *)pkts[i], epoch,
+				    options.s_last_datapt->epoch);
+			} else {
+				// No data point to compare against; do not
+				// dereference the null pointer just to log.
+				ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
+				    "rx_add_timestamp: packet %p epoch %d but no outstanding datapt.\n",
+				    (void *)pkts[i], epoch);
+			}
+		} else {
+			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+			    "rx_add_timestamp: packet %p not tagged - type %d.\n",
+			    (void *)pkts[i], rte_be_to_cpu_16(pkt_data->type));
+		}
+	}
+
+	return nb_pkts;
+}
+
+/*
+ * TX callback: records the software transmit time of an outgoing probe.
+ *
+ * For every valid PKT_TYPE_PROBE packet the epoch in its payload must match
+ * options.s_last_datapt; otherwise the program exits.  On a match the
+ * current topo_uptime_ns() value is stored as clt_sw_tx.  Always returns
+ * nb_pkts.
+ */
+static uint16_t
+tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
+    struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
+{
+	uint64_t now = topo_uptime_ns();
+	struct pkt_hdr *pkt_data;
+
+	for (int i = 0; i < nb_pkts; i++) {
+		pkt_data = check_valid_packet(pkts[i],
+		    &options.s_host_spec.mac_addr);
+
+		if (pkt_data == nullptr) {
+			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+			    "tx_add_timestamp: ignoring invalid packet 0x%p.\n",
+			    (void *)pkts[i]);
+			continue;
+		}
+
+		if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE) {
+			uint32_t epoch = rte_be_to_cpu_32(
+			    ((struct pkt_payload_epoch *)pkt_data->payload)
+				->epoch);
+
+			// Handle the two failure cases separately so the
+			// error message never dereferences a null datapt.
+			if (options.s_last_datapt == nullptr) {
+				rte_exit(EXIT_FAILURE,
+				    "tx_add_timestamp: packet epoch %d but no outstanding datapt\n",
+				    epoch);
+			}
+			if (epoch != options.s_last_datapt->epoch) {
+				rte_exit(EXIT_FAILURE,
+				    "tx_add_timestamp: packet epoch %d != last epoch %d\n",
+				    epoch, options.s_last_datapt->epoch);
+			}
+
+			options.s_last_datapt->clt_sw_tx = now;
+			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+			    "tx_add_timestamp: tagged packet %p with sw: %lu.\n",
+			    (void *)pkts[i], now);
+		} else {
+			// Log the type in host byte order, as the RX path does.
+			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+			    "tx_add_timestamp: packet %p not tagged - type %d.\n",
+			    (void *)pkts[i], rte_be_to_cpu_16(pkt_data->type));
+		}
+	}
+
+	return nb_pkts;
+}
+
+// Sends one control packet of the given type to every registered slave.
+// Returns nothing: any failure (too many slaves, packet allocation, or a
+// short transmit burst) terminates the program through rte_exit().
+static void
+send_all_slaves(uint16_t type)
+{
+	struct rte_mbuf *tx_bufs[MAX_SLAVES];
+
+	// tx_bufs lives on the stack with MAX_SLAVES slots; never write past it.
+	if (options.slaves.size() > MAX_SLAVES) {
+		rte_exit(EXIT_FAILURE, "too many slaves: %zu > %u\n",
+		    options.slaves.size(), MAX_SLAVES);
+	}
+	const uint16_t nb_slaves = (uint16_t)options.slaves.size();
+
+	struct conn_spec cspec;
+	cspec.src = &options.s_host_spec;
+	cspec.dst_port = DEFAULT_RAT_PORT;
+	cspec.src_port = DEFAULT_RAT_PORT;
+
+	// build one packet per slave
+	for (unsigned int i = 0; i < nb_slaves; i++) {
+		struct pkt_hdr *hdr;
+		cspec.dst = options.slaves.at(i);
+		if (alloc_pkt_hdr(mempool_get(options.s_socketid), type, &cspec, 0,
+			&tx_bufs[i], &hdr) != 0) {
+			rte_exit(EXIT_FAILURE, "failed to alloc packet\n");
+		}
+	}
+
+	if (rte_eth_tx_burst(options.portid, options.s_txqid, tx_bufs,
+		nb_slaves) != nb_slaves) {
+		rte_exit(EXIT_FAILURE, "failed to send some packets\n");
+	}
+}
+
+// Polls the RX queue until one packet of type `etype` has arrived from every
+// registered slave, or exits the program after SLAVES_MAX_WAIT_MS.
+//
+// `out` may be nullptr.  When it is not, it must have room for at least
+// MAX_SLAVES entries; each accepted packet is stored there (up to #slaves
+// entries) and ownership passes to the caller.  All other received packets
+// are freed here.
+static void
+wait_for_slaves(uint16_t etype, struct rte_mbuf **out)
+{
+	struct rte_mbuf *tx_bufs[MAX_SLAVES];
+	bool stop = false;
+	const uint64_t start = topo_uptime_ns();
+	// Source addresses already seen.  Stored by value: the packets they
+	// came from may be freed before the next duplicate check runs.
+	std::vector<struct rte_ether_addr> recved;
+	uint32_t tot = 0;
+
+	while (!stop) {
+		uint64_t now = topo_uptime_ns();
+		const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
+		    options.s_rxqid, tx_bufs, MAX_SLAVES);
+
+		if (nb_rx > 0) {
+			for (unsigned int i = 0; i < nb_rx; i++) {
+				struct pkt_hdr *each = check_valid_packet(
+				    tx_bufs[i], &options.s_host_spec.mac_addr);
+				uint16_t type;
+				if (each == nullptr) {
+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+					    "wait_for_slaves: ignoring invalid packet %p.\n",
+					    (void *)tx_bufs[i]);
+					goto end_loop;
+				}
+
+				type = rte_be_to_cpu_16(each->type);
+
+				if (type == etype) {
+					bool invalid = true;
+
+					// check if it is from one of our
+					// clients
+					for (auto eaddr : options.slaves) {
+						if (rte_is_same_ether_addr(
+							&eaddr->mac_addr,
+							&each->eth_hdr
+							     .src_addr)) {
+							invalid = false;
+							break;
+						}
+					}
+
+					if (invalid) {
+						// received invalid packet from
+						// unregistered slave
+						ntr(NTR_DEP_USER1,
+						    NTR_LEVEL_WARNING,
+						    "wait_for_slaves: invalid packet %p from unregistered slave\n.",
+						    tx_bufs[i]);
+						goto end_loop;
+					}
+
+					invalid = false;
+					// check if we have already received the
+					// same packet from the mac addr
+					for (const auto &seen : recved) {
+						if (rte_is_same_ether_addr(
+							&seen,
+							&each->eth_hdr
+							     .src_addr)) {
+							invalid = true;
+							break;
+						}
+					}
+
+					if (invalid) {
+						// received invalid packet from
+						// the same slave
+						ntr(NTR_DEP_USER1,
+						    NTR_LEVEL_WARNING,
+						    "wait_for_slaves: invalid packet %p - duplicated\n.",
+						    tx_bufs[i]);
+						goto end_loop;
+					}
+
+					// remember a copy of the sender address
+					struct rte_ether_addr src;
+					rte_ether_addr_copy(
+					    &each->eth_hdr.src_addr, &src);
+					recved.push_back(src);
+
+					if (recved.size() ==
+					    options.slaves.size()) {
+						stop = true;
+					}
+
+					if (out != nullptr) {
+						out[tot] = tx_bufs[i];
+						tot++;
+						// don't free this packet
+						continue;
+					}
+				} else {
+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+					    "wait_for_slaves: ignoring invalid packet %p type %d.\n",
+					    (void *)tx_bufs[i], type);
+				}
+			end_loop:
+				rte_pktmbuf_free(tx_bufs[i]);
+			}
+		}
+
+		if (now - start > SLAVES_MAX_WAIT_MS * MS2NS) {
+			rte_exit(EXIT_FAILURE,
+			    "cat: waiting for too long %d. I QUIT!!", etype);
+		}
+	}
+}
+
+/*
+ * Main probe loop; runs until options.s_stop is set.
+ *
+ * Each iteration drains the RX queue and then advances the probe state
+ * machine kept in options.s_state:
+ *   - STATE_SENT: wait for the hardware TX timestamp, the PROBE_RESP and the
+ *     STAT packet of the outstanding epoch; declare packet loss when
+ *     pkt_loss_time_ms has elapsed since the probe was sent.
+ *   - STATE_COMPLETE: append the finished data point to options.s_data.
+ *   - STATE_WAIT / after COMPLETE or PKTLOSS: once the next scheduled send
+ *     time has passed, allocate and transmit a new probe.
+ * The program exits on allocation or transmit failure and when more than
+ * pkt_loss_failure_threshold consecutive probes are lost.
+ */
+static void
+pkt_loop()
+{
+	struct rte_mbuf *tx_buf;
+	struct rte_mbuf *rx_bufs[BURST_SIZE];
+	struct pkt_hdr *pkt_data;
+	rdport_generator port_gen(MIN_RANDOM_PORT);
+
+	bool read_tx = true;
+	bool recv_stat = true;
+	bool recv_resp = true;
+
+	if (rte_eth_dev_socket_id(options.portid) > 0 &&
+	    rte_eth_dev_socket_id(options.portid) != (int)rte_socket_id()) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
+		    "locore_main: WARNING, port %d is on remote NUMA node to "
+		    "polling thread.\n\tPerformance will "
+		    "not be optimal.\n",
+		    options.portid);
+	}
+
+	uint64_t next_ts = topo_uptime_ns();
+	uint64_t last_send_ts = next_ts;
+	bool is_last_pkt_lost = false;
+	uint32_t num_cts_pkt_lost = 0;
+
+	while (!options.s_stop.load()) {
+		uint64_t now = topo_uptime_ns();
+		// always pop incoming packets
+		const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
+		    options.s_rxqid, rx_bufs, BURST_SIZE);
+
+		if (nb_rx > 0) {
+			for (int i = 0; i < nb_rx; i++) {
+				if (options.s_state != STATE_SENT) {
+					// only need to process packets after we
+					// sent one
+					rte_pktmbuf_free(rx_bufs[i]);
+					continue;
+				}
+
+				struct pkt_hdr *each = check_valid_packet(
+				    rx_bufs[i], &options.s_host_spec.mac_addr);
+
+				if (each == nullptr) {
+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+					    "locore_main: ignoring invalid packet %p.\n",
+					    (void *)rx_bufs[i]);
+					rte_pktmbuf_free(rx_bufs[i]);
+					continue;
+				}
+
+				uint16_t type = rte_be_to_cpu_16(each->type);
+				NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, each,
+				    "locore_main: received packet %p ", each);
+				struct pkt_payload_epoch *pld_epoch;
+				struct pkt_payload_stat *pld_stat;
+				uint32_t epoch;
+				// outstanding data point, may be nullptr
+				struct datapt *last = options.s_last_datapt;
+				switch (type) {
+				case PKT_TYPE_PROBE_RESP:
+					pld_epoch = (struct pkt_payload_epoch *)
+							each->payload;
+					epoch = rte_be_to_cpu_32(
+					    pld_epoch->epoch);
+
+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "lcore_main: PROBE_RESP received packet %p epoch %d\n", each, epoch);
+
+					// Check for a missing data point first
+					// so the log never dereferences null.
+					if (last == nullptr) {
+						ntr(NTR_DEP_USER1,
+						    NTR_LEVEL_WARNING,
+						    "locore_main: packet %p epoch %d has no outstanding datapt.\n",
+						    (void *)rx_bufs[i], epoch);
+						break;
+					}
+					if (epoch != last->epoch) {
+						ntr(NTR_DEP_USER1,
+						    NTR_LEVEL_WARNING,
+						    "locore_main: packet %p epoch %d doesn't match datapt %d.\n",
+						    (void *)rx_bufs[i], epoch,
+						    last->epoch);
+						break;
+					}
+
+					recv_resp = true;
+					break;
+				case PKT_TYPE_STAT:
+					pld_stat = (struct pkt_payload_stat *)
+						       each->payload;
+					epoch = rte_be_to_cpu_32(
+					    pld_stat->epoch);
+
+					if (last == nullptr) {
+						ntr(NTR_DEP_USER1,
+						    NTR_LEVEL_WARNING,
+						    "locore_main: packet %p epoch %d has no outstanding datapt.\n",
+						    (void *)rx_bufs[i], epoch);
+						break;
+					}
+					if (epoch != last->epoch) {
+						ntr(NTR_DEP_USER1,
+						    NTR_LEVEL_WARNING,
+						    "locore_main: packet %p epoch %d doesn't match datapt %d.\n",
+						    (void *)rx_bufs[i], epoch,
+						    last->epoch);
+						break;
+					}
+
+					// server-side timestamps arrive in
+					// network byte order
+					last->srv_hw_tx =
+					    rte_be_to_cpu_64(pld_stat->hw_tx);
+					last->srv_hw_rx =
+					    rte_be_to_cpu_64(pld_stat->hw_rx);
+					last->srv_sw_tx =
+					    rte_be_to_cpu_64(pld_stat->sw_tx);
+					last->srv_sw_rx =
+					    rte_be_to_cpu_64(pld_stat->sw_rx);
+
+					recv_stat = true;
+					is_last_pkt_lost = false;
+					break;
+				default:
+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+					    "locore_main: ignoring packet %p with unknown type %d.\n",
+					    (void *)rx_bufs[i], type);
+				}
+
+				rte_pktmbuf_free(rx_bufs[i]);
+			}
+		}
+
+		if (options.s_state == STATE_SENT) {
+			// check if hw tx ts is read
+			if (!read_tx) {
+				int ret;
+				struct timespec ts;
+				if (options.s_hwtimestamp) {
+					if ((ret = rte_eth_timesync_read_tx_timestamp(
+						options.portid, &ts)) == 0) {
+						ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+							"locore_main: read hw tx timestamp %lu.\n",
+							(ts.tv_nsec + ts.tv_sec * S2NS));
+						options.s_last_datapt->clt_hw_tx =
+							ts.tv_nsec + ts.tv_sec * S2NS;
+						read_tx = true;
+					}
+				} else {
+					options.s_last_datapt->clt_hw_tx = 0;
+					read_tx = true;
+				}
+			}
+
+			if (read_tx && recv_resp && recv_stat) {
+				options.s_state = STATE_COMPLETE;
+			} else {
+				// check packet loss
+				if (now - last_send_ts >
+				    options.pkt_loss_time_ms * MS2NS) {
+
+					if (is_last_pkt_lost) {
+						num_cts_pkt_lost++;
+					} else {
+						is_last_pkt_lost = true;
+						num_cts_pkt_lost = 1;
+					}
+
+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+					    "locore_main: packet loss: waiting too long for epoch %d. %d in a row.\n",
+					    options.s_last_datapt->epoch,
+					    num_cts_pkt_lost);
+
+					// the lost probe's data point is
+					// discarded
+					delete options.s_last_datapt;
+					options.s_last_datapt = nullptr;
+					options.s_state = STATE_PKTLOSS;
+					options.s_pkt_loss.fetch_add(1);
+
+					if (num_cts_pkt_lost >
+					    options
+						.pkt_loss_failure_threshold) {
+						rte_exit(EXIT_FAILURE,
+						    "too many continuous packet loss detected\n");
+					}
+				}
+			}
+		}
+
+		if (options.s_state == STATE_COMPLETE ||
+		    options.s_state == STATE_PKTLOSS ||
+		    options.s_state == STATE_WAIT) {
+			if (options.s_state == STATE_COMPLETE) {
+				// ownership of the data point moves to s_data
+				options.s_data.push_back(options.s_last_datapt);
+
+				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+				    "locore_main: datapt for epoch %d dump:\n"
+				    "                    Valid: %d\n"
+				    "                    client TX HW: %lu\n"
+				    "                    client TX SW: %lu\n"
+				    "                    client RX HW: %lu\n"
+				    "                    client RX SW: %lu\n"
+				    "                    server TX HW: %lu\n"
+				    "                    server TX SW: %lu\n"
+				    "                    server RX HW: %lu\n"
+				    "                    server RX SW: %lu\n\n",
+				    options.s_last_datapt->epoch,
+				    options.s_last_datapt->valid,
+				    options.s_last_datapt->clt_hw_tx,
+				    options.s_last_datapt->clt_sw_tx,
+				    options.s_last_datapt->clt_hw_rx,
+				    options.s_last_datapt->clt_sw_rx,
+				    options.s_last_datapt->srv_hw_tx,
+				    options.s_last_datapt->srv_sw_tx,
+				    options.s_last_datapt->srv_hw_rx,
+				    options.s_last_datapt->srv_sw_rx);
+				options.s_recved_pkts.fetch_add(1);
+				options.s_last_datapt = nullptr;
+			}
+
+			options.s_state = STATE_WAIT;
+
+			if (now >= next_ts) {
+				struct pkt_payload_epoch *pld_epoch;
+				uint32_t epoch;
+
+				// Use a 64-bit delta: a 32-bit int overflows
+				// for inter-arrival times above ~2.147 s.
+				next_ts += (int64_t)(options.s_iagen->generate() *
+				    S2NS);
+
+				options.s_host_conn.src_port = port_gen.next();
+				if (alloc_pkt_hdr(mempool_get(options.s_socketid),
+					PKT_TYPE_PROBE, &options.s_host_conn, 0,
+					&tx_buf, &pkt_data) != 0) {
+					rte_exit(EXIT_FAILURE,
+					    "failed to alloc probe packet.\n");
+				}
+
+				epoch = options.s_epoch;
+				options.s_epoch++;
+				pld_epoch = (struct pkt_payload_epoch *)
+						pkt_data->payload;
+				pld_epoch->epoch = rte_cpu_to_be_32(epoch);
+				options.s_last_datapt = new struct datapt;
+				options.s_last_datapt->epoch = epoch;
+				options.s_last_datapt->valid =
+				    options.s_record.load();
+
+				last_send_ts = now;
+
+				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+				    "locore_main: sending packet 0x%p with epoch %d\n",
+				    (void *)tx_buf, epoch);
+				const uint16_t nb_tx =
+				    rte_eth_tx_burst(options.portid,
+					options.s_txqid, &tx_buf, 1);
+
+				if (nb_tx != 1) {
+					rte_exit(EXIT_FAILURE,
+					    "failed to send packet 0x%p, epoch %d\n",
+					    (void *)tx_buf, epoch);
+				}
+
+				// NOTE(review): send_all_slaves() does not free
+				// mbufs after a successful rte_eth_tx_burst();
+				// confirm this free is balanced by a reference
+				// taken in alloc_pkt_hdr().
+				rte_pktmbuf_free(tx_buf);
+
+				read_tx = false;
+				recv_resp = false;
+				recv_stat = false;
+				options.s_state = STATE_SENT;
+			}
+		}
+	}
+}
+
+/*
+ * Entry point of the polling lcore.
+ *
+ * In master mode the run is bracketed by a SYNC/SYNC_ACK exchange before and
+ * a FIN/FIN_ACK exchange after the probe loop; the FIN_ACK payloads are
+ * summed into the s_slave_* counters.  The start and end times of pkt_loop()
+ * are recorded in options.  Always returns 0.
+ */
+static int
+locore_main(void *tif __rte_unused)
+{
+	struct rte_mbuf *fin_acks[MAX_SLAVES];
+	uint32_t lcore = rte_lcore_id();
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d running...\n",
+	    lcore);
+
+	// Handshake with the slaves before generating load.
+	if (options.master_mode == 1) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+		    "locore_main: sending SYNC ...\n");
+		send_all_slaves(PKT_TYPE_SYNC);
+		ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+		    "locore_main: waiting for SYNC_ACK ...\n");
+		wait_for_slaves(PKT_TYPE_SYNC_ACK, nullptr);
+	}
+
+	options.s_start_time.store(topo_uptime_ns());
+	pkt_loop();
+	options.s_end_time.store(topo_uptime_ns());
+
+	if (options.master_mode != 1) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: exited\n");
+		return 0;
+	}
+
+	// Tell the slaves to stop and collect their reports.
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+	    "locore_main: sending FIN ...\n");
+	send_all_slaves(PKT_TYPE_FIN);
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+	    "locore_main: waiting for FIN_ACK ...\n");
+	wait_for_slaves(PKT_TYPE_FIN_ACK, fin_acks);
+
+	// aggregate slave QPS; the packets were already validated by
+	// wait_for_slaves
+	const unsigned int nb_slaves = options.slaves.size();
+	unsigned int i = 0;
+	while (i < nb_slaves) {
+		struct rte_mbuf *ack = fin_acks[i];
+		auto hdr = rte_pktmbuf_mtod(ack, struct pkt_hdr *);
+		auto report = (struct pkt_payload_qps *)hdr->payload;
+		uint32_t qps = rte_be_to_cpu_32(report->qps);
+		uint32_t recved = rte_be_to_cpu_32(report->recved_pkts);
+		uint32_t loss = rte_be_to_cpu_32(report->lost_pkts);
+
+		ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+		    "locore_main: received qps %d from client %d\n",
+		    qps, i);
+		options.s_slave_qps.fetch_add(qps);
+		options.s_slave_loss.fetch_add(loss);
+		options.s_slave_recved.fetch_add(recved);
+		rte_pktmbuf_free(ack);
+		i++;
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: exited\n");
+
+	return 0;
+}
+
+// Log the effective client configuration at INFO level, followed by one line
+// per configured slave (IP and MAC address).
+static void
+dump_options()
+{
+	const auto verbosity = ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING;
+	const auto nthreads = CPU_COUNT(&options.cpu_set);
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+	    "Configuration:\n"
+	    "    verbosity = +%d\n"
+	    "    run time = %d\n"
+	    "    warmup time = %d\n"
+	    "    output file = %s\n"
+	    "    number of threads = %d\n"
+	    "    interarrival dist = %s\n"
+	    "    target qps = %d\n"
+	    "    host IP = 0x%x\n"
+	    "    pkt loss time = %u\n"
+	    "    pkt loss failure threshold = %u\n"
+	    "    portid = %d\n",
+	    verbosity, options.run_time, options.warmup_time, options.output,
+	    nthreads, options.ia_gen_str, options.target_qps,
+	    options.s_host_spec.ip, options.pkt_loss_time_ms,
+	    options.pkt_loss_failure_threshold, options.portid);
+
+	for (const auto *peer : options.slaves) {
+		const auto &mac = peer->mac_addr.addr_bytes;
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+		    "    slave = 0x%x@%x:%x:%x:%x:%x:%x\n", peer->ip, mac[0],
+		    mac[1], mac[2], mac[3], mac[4], mac[5]);
+	}
+}
+
+// Print the command-line help to stdout.
+// Keep this list in sync with the getopt() option string in main().
+static void
+usage()
+{
+	fprintf(stdout,
+	    "Usage:\n"
+	    "    -v(vv): verbose mode\n"
+	    "    -s: server net spec\n"
+	    "    -S: slave(rat)'s net spec (also turns on master mode)\n"
+	    "    -t: run time\n"
+	    "    -T: warmup time\n"
+	    "    -h: display the information\n"
+	    "    -o: output filename\n"
+	    "    -A: affinity mask\n"
+	    "    -i: inter-arrival time distribution\n"
+	    "    -q: target qps\n"
+	    "    -H: host net spec\n"
+	    "    -L: pkt loss failure threshold\n"
+	    "    -l: pkt loss time threshold\n"
+	    "    -p: port id\n");
+}
+
+// Client entry point.
+//
+// Initializes the EAL, parses the remaining command-line options, initializes
+// libtopo and libnms, configures the port and mbuf pool, launches
+// locore_main() on the single selected core, drives the warmup/run timer from
+// the main thread, and finally writes the aggregate stats and every valid
+// data point to the output file.
+//
+// Returns 0 on success; every failure path terminates through rte_exit().
+int
+main(int argc, char *argv[])
+{
+	std::ofstream log_file;
+	bool has_host_spec = false;
+
+	ntr_init();
+
+	// init dpdk
+	int ret = rte_eal_init(argc, argv);
+	if (ret < 0) {
+		rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n");
+	}
+
+	// skip the arguments consumed by the EAL
+	argc -= ret;
+	argv += ret;
+
+	// set warning level
+	ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
+	{
+		int c;
+		// parse arguments
+		struct net_spec *ns;
+		while ((c = getopt(argc, argv, "vs:S:t:T:ho:A:i:q:H:L:l:p:")) !=
+		    -1) {
+			switch (c) {
+			case 'v':
+				ntr_set_level(NTR_DEP_USER1,
+				    ntr_get_level(NTR_DEP_USER1) + 1);
+				break;
+			case 's':
+				if (str_to_netspec(optarg,
+					&options.server_spec) != 0) {
+					rte_exit(EXIT_FAILURE,
+					    "invalid server net spec.\n");
+				}
+				break;
+			case 'S':
+				ns = new struct net_spec;
+				if (str_to_netspec(optarg, ns) != 0) {
+					rte_exit(EXIT_FAILURE,
+					    "invalid client net spec\n");
+				}
+				options.slaves.push_back(ns);
+				options.master_mode = 1;
+				if (options.slaves.size() > MAX_SLAVES) {
+					rte_exit(EXIT_FAILURE,
+					    "too many rats.\n");
+				}
+				break;
+			case 't':
+				options.run_time = strtol(optarg, nullptr, 10);
+				break;
+			case 'T':
+				options.warmup_time = strtol(optarg, nullptr,
+				    10);
+				break;
+			case 'h':
+				usage();
+				rte_exit(EXIT_SUCCESS, "\n");
+			case 'o':
+				strncpy(options.output, optarg,
+				    sizeof(options.output) - 1);
+				break;
+			case 'A':
+				cpulist_to_cpuset(optarg, &options.cpu_set);
+				break;
+			case 'i':
+				strncpy(options.ia_gen_str, optarg,
+				    sizeof(options.ia_gen_str) - 1);
+				break;
+			case 'q':
+				options.target_qps = strtoul(optarg, nullptr,
+				    10);
+				break;
+			case 'H':
+				has_host_spec = true;
+				if (str_to_netspec(optarg,
+					&options.s_host_spec) != 0) {
+					rte_exit(EXIT_FAILURE,
+					    "invalid host net spec.\n");
+				}
+				break;
+			case 'L':
+				options.pkt_loss_failure_threshold =
+				    strtoul(optarg, nullptr, 10);
+				break;
+			case 'l':
+				options.pkt_loss_time_ms = strtoul(optarg,
+				    nullptr, 10);
+				// 0 is mapped to UINT32_MAX
+				if (options.pkt_loss_time_ms == 0) {
+					options.pkt_loss_time_ms = UINT32_MAX;
+				}
+				break;
+			case 'p':
+				options.portid = strtol(optarg, nullptr, 10);
+				break;
+			default:
+				usage();
+				// getopt() returns '?' for anything it rejects;
+				// the offending option character is in optopt
+				rte_exit(EXIT_FAILURE, "unknown argument: %c\n",
+				    optopt);
+			}
+		}
+	}
+
+	if (!has_host_spec) {
+		rte_exit(EXIT_FAILURE, "must specify host IP\n");
+	}
+
+	// init libtopo
+	if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
+	    0) {
+		rte_exit(EXIT_FAILURE, "libtopo init failed!\n");
+	}
+
+	// init nms
+	if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
+		rte_exit(EXIT_FAILURE, "failed to init libnms!\n");
+	}
+
+	// cat runs a single worker thread; from here on cpu_set holds exactly
+	// one core, so no further truncation of the set is needed
+	if (CPU_COUNT(&options.cpu_set) != 1) {
+		rte_exit(EXIT_FAILURE, "must specify exactly one core\n");
+	}
+	int core_id = CPU_FFS(&options.cpu_set) - 1;
+
+	dump_options();
+
+	// configure memory and port
+	struct port_conf pconf;
+	struct device_conf dconf;
+	struct mem_conf mconf;
+	portconf_get(options.portid, &pconf);
+
+	if (!pconf.timesync) {
+		options.s_hwtimestamp = false;
+		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
+		    "main: timesync disabled. hw timestamp unavailable.\n ");
+	}
+
+	dconf.mtu = MAX_STANDARD_MTU;
+	CPU_COPY(&options.cpu_set, &dconf.core_affinity);
+	dconf.portid = options.portid;
+	dconf.rss_hf = pconf.rss_hf;
+	dconf.rx_offloads = pconf.rxoffload;
+	dconf.tx_offloads = pconf.txoffload;
+	dconf.timesync = pconf.timesync;
+
+	dconf.rx_fn = rx_add_timestamp;
+	dconf.rx_user = nullptr;
+	dconf.rx_ring_sz = 2048;
+	dconf.tx_fn = tx_add_timestamp;
+	dconf.tx_user = nullptr;
+	dconf.tx_ring_sz = 2048;
+
+	mconf.cache_size = 64;
+	mconf.priv_size = 0;
+	mconf.num_elements = 4096;
+	mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_STANDARD_MTU;
+	mconf.max_pools = -1;
+
+	dpdk_init(&dconf, &mconf);
+
+	if (rte_eth_macaddr_get(options.portid,
+		&options.s_host_spec.mac_addr) != 0) {
+		rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
+		    options.portid);
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+	    "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n",
+	    options.portid, options.s_host_spec.mac_addr.addr_bytes[0],
+	    options.s_host_spec.mac_addr.addr_bytes[1],
+	    options.s_host_spec.mac_addr.addr_bytes[2],
+	    options.s_host_spec.mac_addr.addr_bytes[3],
+	    options.s_host_spec.mac_addr.addr_bytes[4],
+	    options.s_host_spec.mac_addr.addr_bytes[5]);
+
+	// create default generator
+	options.s_iagen = createGenerator(options.ia_gen_str);
+	if (options.s_iagen == nullptr) {
+		rte_exit(EXIT_FAILURE, "invalid generator string %s\n",
+		    options.ia_gen_str);
+	}
+	options.s_iagen->set_lambda((double)options.target_qps);
+
+	// open log file for writing
+	log_file.open(options.output, std::ofstream::out);
+	if (!log_file) {
+		rte_exit(EXIT_FAILURE, "failed to open log file %s\n",
+		    options.output);
+	}
+
+	sleep(INIT_DELAY);
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+	    "main: launching thread on core %d\n", core_id);
+	if (rte_eal_remote_launch(locore_main, nullptr, core_id) != 0) {
+		rte_exit(EXIT_FAILURE, "failed to launch function on locore\n");
+	}
+
+	// XXX: poor man's timer
+	// ticks once per usleep(S2US); enables recording once the warmup
+	// period has elapsed and requests a stop after warmup + run time
+	uint32_t second = 0;
+	while (true) {
+		if (second >= options.warmup_time) {
+			options.s_record.store(1);
+		}
+		if (second >= options.run_time + options.warmup_time) {
+			options.s_stop.store(true);
+			break;
+		}
+		usleep(S2US);
+		second++;
+	}
+
+	if (rte_eal_wait_lcore(core_id) < 0)
+		rte_exit(EXIT_FAILURE, "failed to wait for job completion\n");
+
+	// calculate QPS
+	// local QPS = received packets / elapsed seconds, plus the QPS
+	// reported by the slaves
+	uint32_t qps = (double)options.s_recved_pkts.load() /
+	    (((double)(options.s_end_time.load() -
+		  options.s_start_time.load()) /
+		(double)S2NS));
+	qps += options.s_slave_qps.load();
+
+	// dump stats
+	log_file << qps << ',' << options.s_recved_pkts.load() << ','
+		 << options.s_pkt_loss.load() << ','
+		 << options.s_slave_recved.load() << ','
+		 << options.s_slave_loss.load() << std::endl;
+
+	// one CSV line per valid data point; every data point is freed
+	for (auto it : options.s_data) {
+		if (it->valid) {
+			log_file << it->clt_sw_rx << ',' << it->clt_sw_tx << ','
+				 << it->clt_hw_rx << ',' << it->clt_hw_tx << ','
+				 << it->srv_sw_rx << ',' << it->srv_sw_tx << ','
+				 << it->srv_hw_rx << ',' << it->srv_hw_tx
+				 << std::endl;
+		}
+		delete it;
+	}
+	log_file.close();
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+	    "qps = %d, recved = %d, loss = %d, slave recved = %d, slave loss = %d\n",
+	    qps, options.s_recved_pkts.load(), options.s_pkt_loss.load(),
+	    options.s_slave_recved.load(), options.s_slave_loss.load());
+
+	// clean up
+	dpdk_cleanup(&dconf);
+
+	return 0;
+}
--- a/net/khat.cc
+++ b/net/khat.cc
@ -0,0 +1,701 @@
+#include <atomic>
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <vector>
+#include <unistd.h>
+
+#include <sys/cpuset.h>
+#include <sys/endian.h>
+#include <sys/sched.h>
+#include <sys/types.h>
+
+#include <topo.h>
+
+#include <rte_common.h>
+#include <rte_config.h>
+#include <rte_cycles.h>
+#include <rte_eal.h>
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+#include <rte_launch.h>
+#include <rte_lcore.h>
+#include <rte_mbuf.h>
+
+#include "ntr.h"
+
+//#include "gen.hh"
+#include "net/netsup.hh"
+#include "net/pkt.hh"
+#include "nms.h"
+#include "rte_byteorder.h"
+
+constexpr static unsigned int BURST_SIZE = 32; // max packets per rx burst
+constexpr static unsigned int CACHELINE_SIZE = 64; // bytes per cache line
+constexpr static uint16_t THREAD_LOAD_BUFFER_SZ = 16384; // bytes in each thread's load buffer
+
+// State of the single in-flight probe: who sent it, which epoch it carries,
+// and the timestamps collected for it so far.
+struct probe_state_t {
+	struct net_spec dst; // filled from the PROBE packet header in locore_main
+	struct conn_spec cspec {
+		.dst = &dst
+	}; // cspec.dst always points at the embedded dst above
+	uint64_t last_sw_rx; // software rx timestamp (ns), set in rx_add_timestamp
+	uint64_t last_sw_tx; // software tx timestamp (ns), set in tx_add_timestamp
+	uint64_t last_hw_rx; // hardware rx timestamp (ns), set in rx_add_timestamp
+	uint32_t epoch; // epoch copied from the PROBE payload (host byte order)
+};
+
+// Keep track of the probe state.
+// When a probe packet first arrives, this state is set to "in flux" and the
+// rte_mbuf's userdata is set to PROBE_MAGIC, which prevents other probe packets
+// from being processed. When the server sends the probe stats back to the
+// user, "in flux" is released. This guarantees that the server only processes
+// one probe packet at a time.
+// XXX: this could also be attached to the mbuf itself and processed by the
+// lcore thread.
+//      I kept this global because globally there can be only one pending
+//      probe request, and rx_add_timestamp can save its results here too.
+// Per-worker-thread context; one instance is allocated in main() for every
+// core in the cpu set and handed to locore_main().
+struct thread_info {
+	int tid; // 0-based thread index, assigned in cpu-set order
+	int rxqid; // rx queue polled by this thread (set to tid in main)
+	int txqid; // tx queue used by this thread (set to tid in main)
+	int lcore_id; // lcore the thread is launched on
+	int node_id; // socket id of lcore_id; selects the mempool
+	void *cache_lines; // CACHELINE_SIZE * thread_cacheline_cnt bytes read by memory loads
+	void *load_buffer; // THREAD_LOAD_BUFFER_SZ bytes of scratch space for LOAD packets
+};
+
+// Global server configuration (first group) and runtime state (second group).
+struct options_t {
+	// config
+	int num_threads { 1 };
+	cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 2nd core
+	bool jumbo_frame_enabled {
+		false
+	}; // setting this to true changes mbuf size and mtu
+	int port_mtu { MAX_STANDARD_MTU };
+	int thread_cacheline_cnt = { 1600 }; // cache lines per thread: 1600 * 64 B = 100 KiB
+	uint16_t portid { 0 };
+
+	// states
+	struct net_spec s_host_spec { };
+	std::vector<struct thread_info *> s_thr_info; // indexed by thread_info::tid
+	int probe_state_offset { 0 }; // mbuf dynfield offset of the "probe valid" flag
+	bool s_hwtimestamp { true }; // cleared in main() when the port has no timesync
+
+	struct probe_state_t s_probe_info;
+	std::atomic<bool> is_probing { false }; // true while a probe is in flight
+};
+
+struct options_t options;
+
+// Read the per-mbuf "probe valid" flag stored in the dynamic field located at
+// options.probe_state_offset.
+static bool
+mbuf_is_probe_valid(struct rte_mbuf *pkt)
+{
+	const bool *flag = RTE_MBUF_DYNFIELD(pkt, options.probe_state_offset,
+	    bool *);
+
+	return *flag;
+}
+
+// Write the per-mbuf "probe valid" flag stored in the dynamic field located at
+// options.probe_state_offset.
+static void
+mbuf_set_probe_valid(struct rte_mbuf *pkt, bool b)
+{
+	bool *flag = RTE_MBUF_DYNFIELD(pkt, options.probe_state_offset,
+	    bool *);
+
+	*flag = b;
+}
+
+// RX callback (installed through dconf.rx_fn): timestamps incoming PROBE
+// packets.
+//
+// For every valid PROBE packet the "probe valid" dynfield is first cleared.
+// If no other probe is in flight (is_probing flips false -> true), the
+// software rx time is recorded in options.s_probe_info and, when hardware
+// timestamping is enabled, the hardware rx time is read too. The packet is
+// flagged valid only when all required timestamps were obtained; if the
+// hardware read fails, is_probing is released again.
+// Invalid and non-PROBE packets are only logged.
+//
+// NOTE(review): `port` is annotated __rte_unused but is used for the hardware
+// timestamp read below.
+//
+// Returns nb_pkts unchanged.
+static uint16_t
+rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
+    struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused,
+    void *_ __rte_unused)
+{
+	int rc = 0;
+	// one software timestamp is shared by the whole burst
+	uint64_t now = topo_uptime_ns();
+	struct timespec ts { };
+	struct pkt_hdr *pkt_data;
+	for (int i = 0; i < nb_pkts; i++) {
+		pkt_data = check_valid_packet(pkts[i],
+		    &options.s_host_spec.mac_addr);
+
+		if (pkt_data == nullptr) {
+			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+			    "rx_add_timestamp: ignoring invalid packet %p.\n",
+			    (void *)pkts[i]);
+			continue;
+		}
+
+		if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE) {
+			bool cmp = false;
+			// default to "not valid" until every timestamp is in
+			mbuf_set_probe_valid(pkts[i], false);
+			// claim the single probe slot; fails if one is pending
+			if (options.is_probing.compare_exchange_strong(cmp,
+				true)) {
+				options.s_probe_info.last_sw_rx = now;
+				if (options.s_hwtimestamp) {
+					if ((rc = rte_eth_timesync_read_rx_timestamp(
+						 port, &ts,
+						 pkts[i]->timesync & 0x3)) ==
+					    0) {
+						options.s_probe_info
+						    .last_hw_rx = ts.tv_nsec +
+						    ts.tv_sec * S2NS;
+						ntr(NTR_DEP_USER1,
+						    NTR_LEVEL_DEBUG,
+						    "rx_add_timestamp: tagged packet %p with sw rx: %lu hw rx:%lu.\n",
+						    (void *)pkts[i],
+						    options.s_probe_info
+							.last_sw_rx,
+						    options.s_probe_info
+							.last_hw_rx);
+						mbuf_set_probe_valid(pkts[i],
+						    true);
+					} else {
+						// release the slot so a later probe
+						// can be accepted
+						options.is_probing.store(false);
+						ntr(NTR_DEP_USER1,
+						    NTR_LEVEL_WARNING,
+						    "rx_add_timestamp: packet %p not tagged - failed to read hw rx timestamp: %d.\n",
+						    (void *)pkts[i], rc);
+					}
+				} else {
+					mbuf_set_probe_valid(pkts[i], true);
+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+					    "rx_add_timestamp: tagged packet %p with sw rx only: %lu.\n",
+					    (void *)pkts[i], now);
+				}
+			} else {
+				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+				    "rx_add_timestamp: packet %p not tagged - server is probing.\n",
+				    (void *)pkts[i]);
+			}
+		} else {
+			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+			    "rx_add_timestamp: packet %p not tagged - not PROBE packet: type %d.\n",
+			    (void *)pkts[i], rte_be_to_cpu_16(pkt_data->type));
+		}
+	}
+
+	return nb_pkts;
+}
+
+// TX callback (installed through dconf.tx_fn): records the software tx
+// timestamp of outgoing PROBE_RESP packets in options.s_probe_info.
+// Invalid packets and packets of other types are only logged.
+//
+// Returns nb_pkts unchanged.
+static uint16_t
+tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
+    struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
+{
+	// one software timestamp is shared by the whole burst
+	uint64_t now = topo_uptime_ns();
+	struct pkt_hdr *pkt_data;
+
+	for (int i = 0; i < nb_pkts; i++) {
+
+		pkt_data = check_valid_packet(pkts[i],
+		    &options.s_host_spec.mac_addr);
+
+		if (pkt_data == nullptr) {
+			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+			    "tx_add_timestamp: ignoring invalid packet %p.\n",
+			    (void *)pkts[i]);
+			continue;
+		}
+
+		// the type field is stored in network byte order
+		const uint16_t type = rte_be_to_cpu_16(pkt_data->type);
+
+		if (type == PKT_TYPE_PROBE_RESP) {
+			// this packet is the response to PROBE packets
+
+			// at this time the packet is not sent to the NIC yet so
+			// the state must be waiting stats
+			assert(options.is_probing.load() &&
+			    mbuf_is_probe_valid(pkts[i]));
+
+			options.s_probe_info.last_sw_tx = now;
+
+			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+			    "tx_add_timestamp: tagged packet %p with sw tx %lu\n",
+			    (void *)pkts[i], options.s_probe_info.last_sw_tx);
+		} else {
+			// log the host-order type, as rx_add_timestamp does
+			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+			    "tx_add_timestamp: packet %p not tagged - type %d\n",
+			    (void *)pkts[i], type);
+		}
+	}
+
+	return nb_pkts;
+}
+
+// Busy-wait on topo_uptime_ns() until at least `us` microseconds have passed.
+static void
+worker_cpu_load(unsigned long us)
+{
+	const uint64_t start = topo_uptime_ns();
+	const auto budget_ns = us * 1000;
+
+	while (topo_uptime_ns() - start < budget_ns) {
+		// spin
+	}
+}
+
+// Generate memory traffic: read `load` consecutive cache lines, one 32-bit
+// word from each, and store every word into the calling thread's load buffer.
+//
+// tid:   index of the calling thread in options.s_thr_info
+// which: global cache-line index; wrapped modulo the total number of cache
+//        lines across all threads to select the owning thread and the first
+//        line within that thread's region
+// load:  number of cache lines to read; reads wrap around inside the target
+//        thread's region
+static void
+worker_memory_load(int tid, uint32_t which, uint32_t load)
+{
+	uint32_t start_cacheline = which % (options.thread_cacheline_cnt * options.s_thr_info.size());
+	uint32_t thrd = start_cacheline / options.thread_cacheline_cnt;
+	uint32_t start = start_cacheline % options.thread_cacheline_cnt;
+	struct thread_info *cur = options.s_thr_info.at(tid);
+	struct thread_info *tgt = options.s_thr_info.at(thrd);
+
+	// Both sides are volatile so the compiler must perform every load and
+	// store. Without it, all but the last iteration are dead stores to the
+	// same location and may legally be optimized away.
+	const volatile char *src = (const volatile char *)tgt->cache_lines;
+	volatile uint32_t *sink = (volatile uint32_t *)cur->load_buffer;
+
+	for (uint32_t i = 0; i < load; i++) {
+		*sink = *(const volatile uint32_t *)(src +
+		    ((start + i) % options.thread_cacheline_cnt) * CACHELINE_SIZE);
+	}
+}
+
+// Server worker loop; runs forever on the lcore it was launched on.
+//
+// Each iteration polls up to BURST_SIZE packets from the thread's rx queue:
+//   - PROBE packets flagged valid by rx_add_timestamp get a PROBE_RESP queued
+//     and mark a probe as pending;
+//   - LOAD packets trigger the requested CPU or memory load and get a
+//     LOAD_RESP queued;
+//   - everything else is dropped.
+// The queued replies are sent in one burst. While a probe is pending, the
+// loop then tries to read the hardware tx timestamp (when enabled) and, once
+// available, sends a STAT packet with all collected timestamps and releases
+// options.is_probing.
+//
+// ti: the struct thread_info of this worker.
+// Never returns.
+static int
+locore_main(void *ti)
+{
+	auto tinfo = (struct thread_info *)ti;
+	struct rte_mbuf *bufs[BURST_SIZE];
+	// every received packet queues at most one reply, so BURST_SIZE slots
+	// suffice; the PKT_TYPE_STAT packet is sent separately below
+	struct rte_mbuf *tx_bufs[BURST_SIZE];
+	struct pkt_hdr *pkt_data;
+
+	bool pending_probe = false;
+
+	// socket 0 is a valid NUMA node, so only a negative (unknown) socket id
+	// may skip the locality check
+	if (rte_eth_dev_socket_id(options.portid) >= 0 &&
+	    rte_eth_dev_socket_id(options.portid) != (int)rte_socket_id()) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
+		    "locore_main <thread %d>: WARNING, port %d is on remote NUMA node to "
+		    "polling thread.\n\tPerformance will "
+		    "not be optimal.\n",
+		    tinfo->tid, options.portid);
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+	    "locore_main <thread %d>: running on locore %d with txqid %d and rxqid %d.\n",
+	    tinfo->tid, rte_lcore_id(), tinfo->txqid, tinfo->rxqid);
+
+	while (true) {
+		uint16_t nb_tx = 0;
+		const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
+		    tinfo->rxqid, bufs, BURST_SIZE);
+		struct rte_mbuf *pkt_buf;
+		struct pkt_hdr *tx_data;
+
+		for (int i = 0; i < nb_rx; i++) {
+			// XXX: optimization: in rx_add_timestamp every packet
+			// is already validated once can just mark valid packet
+			// with a value so we can avoid this redundant check
+			pkt_data = check_valid_packet(bufs[i],
+			    &options.s_host_spec.mac_addr);
+
+			if (pkt_data == nullptr) {
+				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+				    "locore_main <thread %d>: skipping invalid packet %p.\n",
+				    tinfo->tid, (void *)bufs[i]);
+				// dump_pkt(bufs[i]);
+				rte_pktmbuf_free(bufs[i]);
+				continue;
+			}
+
+			NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, pkt_data,
+			    "locore_main <thread %d>: received packet ", tinfo->tid);
+			switch (rte_be_to_cpu_16(pkt_data->type)) {
+			case PKT_TYPE_PROBE: {
+				// only the probe accepted by rx_add_timestamp
+				// is answered; others are dropped silently
+				if (mbuf_is_probe_valid(bufs[i])) {
+					// send back probe_resp pkt to probe for
+					// return latency
+					pending_probe = true;
+
+					// book keep probe results
+					options.s_probe_info.epoch =
+					    rte_be_to_cpu_32(
+						((struct pkt_payload_epoch *)
+							pkt_data->payload)
+						    ->epoch);
+
+					pkt_hdr_to_netspec(pkt_data,
+					    &options.s_probe_info.dst,
+					    &options.s_probe_info.cspec
+						 .dst_port,
+					    nullptr,
+					    &options.s_probe_info.cspec
+						 .src_port);
+
+					options.s_probe_info.cspec.src =
+					    &options.s_host_spec;
+
+					if (alloc_pkt_hdr(mempool_get(
+							      tinfo->node_id),
+						PKT_TYPE_PROBE_RESP,
+						&options.s_probe_info.cspec, 0,
+						&pkt_buf, &tx_data) != 0) {
+						rte_exit(EXIT_FAILURE,
+						    "failed to allocate pkt\n");
+					}
+
+					// echo the epoch back to the sender
+					rte_memcpy(tx_data->payload,
+					    pkt_data->payload,
+					    sizeof(struct pkt_payload_epoch));
+
+					// tx_add_timestamp asserts on this flag
+					mbuf_set_probe_valid(pkt_buf, true);
+
+					// queue for burst send
+					NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, tx_data,
+					    "locore_main <thread %d>: sending packet ", tinfo->tid);
+					tx_bufs[nb_tx++] = pkt_buf;
+				}
+				break;
+			}
+			case PKT_TYPE_LOAD: {
+				struct conn_spec cspec;
+				struct net_spec src;
+				struct net_spec dst;
+
+				// touch the unused data to pretend that we read
+				// those dummy fields
+				memcpy(tinfo->load_buffer, pkt_data->payload,
+				    MIN(bufs[i]->data_len -
+					    sizeof(struct pkt_hdr),
+					THREAD_LOAD_BUFFER_SZ));
+
+				// perform the load
+				auto pld = (struct pkt_payload_load *)
+					       pkt_data->payload;
+
+				uint32_t load_type = rte_be_to_cpu_32(pld->type);
+				uint32_t load_arg0 = rte_be_to_cpu_32(pld->arg0);
+				uint32_t load_arg1 = rte_be_to_cpu_32(pld->arg1);
+
+				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+				    "locore_main <thread %d>: LOAD type %d, arg0 %d, arg1 %d\n",
+				    tinfo->tid, load_type, load_arg0, load_arg1);
+
+				if (load_type == LOAD_TYPE_CPU) {
+					worker_cpu_load(load_arg0);
+				} else if (load_type == LOAD_TYPE_MEM) {
+					worker_memory_load(tinfo->tid, load_arg0, load_arg1);
+				} else {
+					ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
+					    "locore_main <thread %d>: unknown LOAD type %d, ignoring...\n",
+					    tinfo->tid, load_type);
+					// no reply; the packet is freed after
+					// the switch
+					break;
+				}
+
+				// reply
+				// swap endpoints: the sender becomes the
+				// destination of the response
+				pkt_hdr_to_netspec(pkt_data, &src,
+				    &cspec.dst_port, &dst, &cspec.src_port);
+				cspec.dst = &src;
+				cspec.src = &dst;
+
+				// printf("LOAD PKT SIZE: %d\n",
+				// bufs[i]->data_len); we reply to load packet
+				// regardless of the server state
+				if (alloc_pkt_hdr(mempool_get(tinfo->node_id),
+					PKT_TYPE_LOAD_RESP, &cspec, 0, &pkt_buf,
+					&tx_data) != 0) {
+					rte_exit(EXIT_FAILURE,
+					    "failed to allocate pkt\n");
+				}
+
+				rte_memcpy(tx_data->payload, pkt_data->payload,
+				    sizeof(struct pkt_payload_load));
+
+				// queue for burst send
+				NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, tx_data,
+				    "locore_main <thread %d>: sending packet ", tinfo->tid);
+				tx_bufs[nb_tx++] = pkt_buf;
+				break;
+			}
+			default:
+				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+				    "locore_main <thread %d>: ignoring packet %p with unknown type %d.\n",
+				    tinfo->tid, (void *)bufs[i],
+				    rte_be_to_cpu_16(pkt_data->type));
+				break;
+			}
+			rte_pktmbuf_free(bufs[i]);
+		}
+
+		// send all packets
+		tx_burst_all(options.portid, tinfo->txqid, tx_bufs, nb_tx);
+
+		// we wanna check every loop not only when there are packets
+		if (pending_probe) {
+			assert(options.is_probing.load());
+			struct timespec ts { };
+			struct pkt_payload_stat *stat;
+			// stays 0 when hw timestamping is disabled, so the
+			// STAT packet is sent right away in that case
+			int status = 0;
+			if (options.s_hwtimestamp) {
+				if ((status = rte_eth_timesync_read_tx_timestamp(
+					options.portid, &ts)) == 0) {
+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+					    "locore_main <thread %d>: obtained hw tx timestamp %lu.\n",
+					    tinfo->tid,
+					    (ts.tv_sec * S2NS + ts.tv_nsec));
+				} else {
+					// retried on the next loop iteration
+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+					    "locore_main <thread %d>: failed to obtain hw tx timestamp: %d.\n",
+					    tinfo->tid, status);
+				}
+			}
+			if (status == 0) {
+				// now we have everything we need
+
+				if (alloc_pkt_hdr(mempool_get(tinfo->node_id),
+					PKT_TYPE_STAT, &options.s_probe_info.cspec, 0,
+					&pkt_buf, &tx_data) != 0) {
+					rte_exit(EXIT_FAILURE,
+					    "failed to alloc pkt_buf\n");
+				}
+
+				// populate stats
+				stat = (struct pkt_payload_stat *)tx_data->payload;
+				stat->epoch = rte_cpu_to_be_32(
+				    options.s_probe_info.epoch);
+				if (options.s_hwtimestamp) {
+					stat->hw_rx = rte_cpu_to_be_64(
+					    options.s_probe_info.last_hw_rx);
+					stat->hw_tx = rte_cpu_to_be_64(
+					    ts.tv_nsec + ts.tv_sec * S2NS);
+				} else {
+					stat->hw_rx = 0;
+					stat->hw_tx = 0;
+				}
+				stat->sw_rx = rte_cpu_to_be_64(
+				    options.s_probe_info.last_sw_rx);
+				stat->sw_tx = rte_cpu_to_be_64(
+				    options.s_probe_info.last_sw_tx);
+
+				// send the packet
+				tx_burst_all(options.portid, tinfo->txqid, &pkt_buf, 1);
+
+				// release flux
+				pending_probe = false;
+				options.is_probing.store(false);
+			}
+		}
+	}
+}
+
+// Print the command-line help to stdout and flush it.
+// Lists exactly the options accepted by the getopt() string in main()
+// ("hvA:H:Jp:"); the memory-load-generator options that used to be shown here
+// are not parsed by this program.
+static void
+usage()
+{
+	fprintf(stdout,
+	    "Usage:\n"
+	    "    -v(vv): verbose mode\n"
+	    "    -h: seek help\n"
+	    "    -A: cpu list for worker threads\n"
+	    "    -H: host spec\n"
+	    "    -J: enable jumbo frames\n"
+	    "    -p: port id\n");
+	fflush(stdout);
+}
+
+// Log the effective server configuration at INFO level.
+static void
+dump_options()
+{
+	const auto verbosity = ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING;
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+	    "main: khat configuration:\n"
+	    "          verbosity: +%d\n"
+	    "          thread count: %d\n"
+	    "          ip: 0x%x\n"
+	    "          jumbo frame: %d\n"
+	    "          port id: %d\n",
+	    verbosity, options.num_threads, options.s_host_spec.ip,
+	    options.jumbo_frame_enabled, options.portid);
+}
+
+// Server entry point.
+//
+// Initializes the EAL, parses the remaining command-line options, initializes
+// libtopo and libnms, registers the "probe valid" mbuf dynamic field,
+// configures the port and mbuf pools, builds one thread_info per core in the
+// cpu set and launches locore_main() on each of them. The main thread then
+// sleeps forever; the cleanup code at the end is not expected to run.
+//
+// Every failure path terminates through rte_exit().
+int
+main(int argc, char *argv[])
+{
+	bool has_host_spec { false };
+	struct mem_conf mconf;
+	struct device_conf dconf;
+
+	ntr_init();
+
+	// init dpdk
+	int ret = rte_eal_init(argc, argv);
+	if (ret < 0) {
+		rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n");
+	}
+
+	// skip the arguments consumed by the EAL
+	argc -= ret;
+	argv += ret;
+
+	// set warning level
+	ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
+	{
+		int c;
+		// parse arguments
+		while ((c = getopt(argc, argv, "hvA:H:Jp:")) != -1) {
+			switch (c) {
+			case 'v':
+				ntr_set_level(NTR_DEP_USER1,
+				    ntr_get_level(NTR_DEP_USER1) + 1);
+				break;
+			case 'h':
+				usage();
+				rte_exit(EXIT_SUCCESS, "\n");
+			case 'A':
+				cpulist_to_cpuset(optarg, &options.cpu_set);
+				options.num_threads = CPU_COUNT(
+				    &options.cpu_set);
+				if (options.num_threads == 0) {
+					rte_exit(EXIT_FAILURE,
+					    "must run at least one thread\n");
+				}
+				break;
+			case 'H':
+				if (str_to_netspec(optarg,
+					&options.s_host_spec) != 0) {
+					rte_exit(EXIT_FAILURE,
+					    "invalid host spec\n");
+				}
+				has_host_spec = true;
+				break;
+			case 'J':
+				options.jumbo_frame_enabled = true;
+				options.port_mtu = MAX_JUMBO_MTU;
+				break;
+			case 'p':
+				options.portid = atoi(optarg);
+				break;
+			default:
+				usage();
+				// a bad option is an error: exit non-zero and
+				// report the offending character (optopt), not
+				// the '?' that getopt() returns
+				rte_exit(EXIT_FAILURE, "unknown argument: %c\n",
+				    optopt);
+			}
+		}
+	}
+
+	if (!has_host_spec) {
+		rte_exit(EXIT_FAILURE, "Must specify host spec\n");
+	}
+
+	// init libtopo
+	if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
+	    0) {
+		rte_exit(EXIT_FAILURE, "libtopo init failed!\n");
+	}
+
+	// init libnms
+	if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
+		rte_exit(EXIT_FAILURE, "libnms init failed!\n");
+	}
+
+	dump_options();
+
+	// register dynamic field
+	struct rte_mbuf_dynfield rte_mbuf_dynfield_probe_flag = {
+		.name = "rte_mbuf_dynfield_probe_valid",
+		.size = sizeof(bool),
+		.align = __alignof__(uint32_t),
+		.flags = 0
+	};
+	options.probe_state_offset = rte_mbuf_dynfield_register(
+	    &rte_mbuf_dynfield_probe_flag);
+	if (options.probe_state_offset == -1) {
+		rte_exit(EXIT_FAILURE, "failed to register dynamic field: %d\n",
+		    rte_errno);
+	}
+
+	// configure memory and port
+	struct port_conf pconf;
+	portconf_get(options.portid, &pconf);
+	if (!pconf.timesync) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
+		    "main: timesync disabled. hw timestamp unavailable.\n ");
+		options.s_hwtimestamp = false;
+	}
+	dconf.mtu = options.port_mtu;
+	CPU_COPY(&options.cpu_set, &dconf.core_affinity);
+	dconf.portid = options.portid;
+	dconf.rss_hf = pconf.rss_hf;
+	dconf.rx_offloads = pconf.rxoffload;
+	dconf.tx_offloads = pconf.txoffload;
+	dconf.timesync = pconf.timesync;
+
+	dconf.rx_fn = rx_add_timestamp;
+	dconf.rx_user = nullptr;
+	dconf.rx_ring_sz = 2048;
+	dconf.tx_fn = tx_add_timestamp;
+	dconf.tx_user = nullptr;
+	dconf.tx_ring_sz = 2048;
+
+	mconf.cache_size = 512;
+	mconf.priv_size = 0;
+	mconf.num_elements = (dconf.rx_ring_sz + dconf.tx_ring_sz) *
+	    rte_lcore_count() / rte_socket_count();
+	mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_JUMBO_MTU -
+	    MAX_STANDARD_MTU;
+	mconf.max_pools = -1;
+
+	dpdk_init(&dconf, &mconf);
+
+	if (rte_eth_macaddr_get(options.portid,
+		&options.s_host_spec.mac_addr) != 0) {
+		rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
+		    options.portid);
+	}
+
+	// init threads
+	// walk the cpu set from the lowest core up; CPU_FFS is 1-based and
+	// returns 0 once the set is empty. The set is consumed by this loop.
+	uint32_t cpu_idx = CPU_FFS(&options.cpu_set);
+	uint32_t tid = 0;
+	while (cpu_idx != 0) {
+		uint32_t lcore_id = cpu_idx - 1;
+		uint32_t node_id = rte_lcore_to_socket_id(lcore_id);
+		auto *tinfo = (struct thread_info *)nms_malloc(node_id,
+		    sizeof(struct thread_info));
+		// NOTE(review): assumes nms_malloc reports failure by
+		// returning nullptr - confirm against libnms
+		if (tinfo == nullptr) {
+			rte_exit(EXIT_FAILURE,
+			    "failed to allocate thread info for locore %u\n",
+			    lcore_id);
+		}
+		tinfo->cache_lines = nms_malloc(node_id,
+		    CACHELINE_SIZE * options.thread_cacheline_cnt);
+		tinfo->load_buffer = nms_malloc(node_id,
+		    THREAD_LOAD_BUFFER_SZ);
+		if (tinfo->cache_lines == nullptr ||
+		    tinfo->load_buffer == nullptr) {
+			rte_exit(EXIT_FAILURE,
+			    "failed to allocate load buffers for locore %u\n",
+			    lcore_id);
+		}
+		tinfo->tid = tid;
+		tinfo->lcore_id = lcore_id;
+		tinfo->node_id = node_id;
+		// one rx/tx queue pair per thread
+		tinfo->rxqid = tid;
+		tinfo->txqid = tid;
+		options.s_thr_info.push_back(tinfo);
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+		    "main: thread %d assigned to cpu %d, node %d\n", tinfo->tid,
+		    tinfo->lcore_id, topo_core_to_numa(lcore_id));
+
+		tid++;
+		CPU_CLR(cpu_idx - 1, &options.cpu_set);
+		cpu_idx = CPU_FFS(&options.cpu_set);
+	}
+
+	sleep(INIT_DELAY);
+
+	for (int i = 0; i < options.num_threads; i++) {
+		struct thread_info *tinfo = options.s_thr_info.at(i);
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+		    "main: launching thread %d on locore %d\n", tinfo->tid,
+		    tinfo->lcore_id);
+		if (rte_eal_remote_launch(locore_main,
+			(void *)options.s_thr_info.at(i),
+			tinfo->lcore_id) != 0) {
+			rte_exit(EXIT_FAILURE,
+			    "failed to launch function on locore %d\n",
+			    tinfo->lcore_id);
+		}
+	}
+
+	// the workers never return; park the main thread
+	while (true) {
+		usleep(S2US);
+	}
+
+	// shouldn't get here
+	// clean up
+	for (int i = 0; i < options.num_threads; i++) {
+		struct thread_info *tinfo = options.s_thr_info.at(i);
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+		    "main: waiting for locore %d...\n", tinfo->lcore_id);
+		if (rte_eal_wait_lcore(tinfo->lcore_id) != 0) {
+			rte_exit(EXIT_FAILURE, "failed to wait for locore %d\n",
+			    tinfo->lcore_id);
+		}
+	}
+
+	dpdk_cleanup(&dconf);
+
+	return 0;
+}
--- a/net/libnetsup/dpdk.cc
+++ b/net/libnetsup/dpdk.cc
@ -0,0 +1,204 @@
+#include "net/netsup.hh"
+#include <cstdlib>
+
+#include "rte_build_config.h"
+#include "rte_common.h"
+#include "rte_config.h"
+#include "rte_ether.h"
+#include "rte_lcore.h"
+#include "rte_mempool.h"
+#include "rte_mbuf.h"
+#include "rte_errno.h"
+#include "rte_ethdev.h"
+
+#include "ntr.h"
+
+// Per-NUMA-node mbuf pools, indexed by node id; filled in by mempool_init().
+static struct rte_mempool *g_mempools[MAX_NUMA_NODES] = {nullptr};
+// Number of pools created so far; mempool_get() rejects node ids >= this.
+static unsigned int g_mempool_sz = 0;
+
+/*
+ * Create one packet mbuf pool per NUMA socket using the sizes in mconf and
+ * record each pool in g_mempools. Exits the process if any pool cannot be
+ * created.
+ */
+static void
+mempool_init(struct mem_conf *mconf)
+{
+	const int nr_sockets = (int)rte_socket_count();
+
+	for (int idx = 0; idx < nr_sockets; idx++) {
+		const uint32_t node = idx;
+		char pool_name[64];
+
+		// each pool is named after the node it is allocated on
+		snprintf(pool_name, sizeof(pool_name), "net_mempool_%d", node);
+
+		struct rte_mempool *pool = rte_pktmbuf_pool_create(pool_name,
+		    mconf->num_elements, mconf->cache_size, mconf->priv_size,
+		    mconf->data_room_size, node);
+		if (pool == nullptr) {
+			rte_exit(EXIT_FAILURE, "cannot create mbuf pool: %d\n", rte_errno);
+		}
+
+		g_mempools[node] = pool;
+		g_mempool_sz++;
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "mempool_init: created mempool for node %d\n", node);
+	}
+}
+
+/*
+ * Return the mbuf pool created for NUMA node `nodeid`, or nullptr when no
+ * pool exists for that id.
+ */
+struct rte_mempool *
+mempool_get(int nodeid)
+{
+	const unsigned int idx = (unsigned int)nodeid;
+
+	// a negative nodeid becomes a huge unsigned value and is rejected here
+	return idx < g_mempool_sz ? g_mempools[idx] : nullptr;
+}
+
+/*
+ * Configure and start the Ethernet port described by dconf.
+ *
+ * One RX queue and one TX queue are created per CPU set in
+ * dconf->core_affinity; queue ids are assigned in ascending core order and
+ * each RX queue draws from the mbuf pool of its core's socket. Optional
+ * timesync and RX/TX callbacks are enabled when requested in dconf.
+ * Any failure terminates the process through rte_exit().
+ */
+static void
+port_init(struct device_conf *dconf)
+{
+	struct rte_ether_addr addr;
+	struct rte_eth_dev_info dev_info {
+	};
+	struct rte_eth_conf port_conf;
+	struct rte_eth_txconf txconf {
+	};
+	struct rte_eth_rxconf rxconf {
+	};
+	int ret;
+
+	int num_threads = CPU_COUNT(&dconf->core_affinity);
+	if (rte_eth_dev_count_avail() == 0) {
+		rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
+	}
+
+	if (!rte_eth_dev_is_valid_port(dconf->portid)) {
+		rte_exit(EXIT_FAILURE, "cannot find port %d\n", dconf->portid);
+	}
+
+	if ((ret = rte_eth_macaddr_get(dconf->portid, &addr)) != 0) {
+		rte_exit(EXIT_FAILURE, "cannot get mac address of port: %d\n", ret);
+	}
+
+	ret = rte_eth_dev_info_get(dconf->portid, &dev_info);
+	if (ret != 0) {
+		rte_exit(EXIT_FAILURE, "failed to get dev info: %d\n", ret);
+	}
+
+	memset(&port_conf, 0, sizeof(struct rte_eth_conf));
+	port_conf.rxmode.mtu = dconf->mtu;
+	port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_RSS;
+	port_conf.rx_adv_conf.rss_conf.rss_key = nullptr;
+	port_conf.rx_adv_conf.rss_conf.rss_hf = dconf->rss_hf;
+
+	port_conf.rxmode.offloads = dconf->rx_offloads;
+	port_conf.txmode.offloads = dconf->tx_offloads;
+
+	/* Configure the Ethernet device: one rx and one tx queue per thread. */
+	ret = rte_eth_dev_configure(dconf->portid, num_threads, num_threads, &port_conf);
+	if (ret != 0)
+		rte_exit(EXIT_FAILURE, "failed to configure port: %d\n", ret);
+
+	/* The driver may adjust the requested ring sizes; dconf is updated in place. */
+	ret = rte_eth_dev_adjust_nb_rx_tx_desc(dconf->portid, &dconf->rx_ring_sz, &dconf->tx_ring_sz);
+	if (ret != 0)
+		rte_exit(EXIT_FAILURE, "failed to set rx tx queue size: %d\n", ret);
+
+	/* Allocate and set up 1 RX queue per thread per Ethernet port. */
+	rxconf = dev_info.default_rxconf;
+	rxconf.offloads = port_conf.rxmode.offloads;
+	rxconf.rx_nseg = 0;
+	rxconf.rx_seg = nullptr;
+	txconf = dev_info.default_txconf;
+	txconf.offloads = port_conf.txmode.offloads;
+
+	int core;
+	int qid = 0;
+	CPU_FOREACH_ISSET(core, &dconf->core_affinity) {
+		int socket = rte_lcore_to_socket_id(core);
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "port_init: setting up rx & tx queue for core %d (socket %d)...\n", core, socket);
+		ret = rte_eth_rx_queue_setup(dconf->portid, qid, dconf->rx_ring_sz, socket, &rxconf, mempool_get(socket));
+		if (ret < 0)
+			rte_exit(EXIT_FAILURE, "failed to setup rx queue for core %d: %d\n", core, ret);
+
+		ret = rte_eth_tx_queue_setup(dconf->portid, qid, dconf->tx_ring_sz, socket, &txconf);
+		if (ret < 0)
+			rte_exit(EXIT_FAILURE, "failed to setup tx queue for core %d: %d\n", core, ret);
+
+		qid++;
+	}
+
+	// set mtu
+	ret = rte_eth_dev_set_mtu(dconf->portid, dconf->mtu);
+	if (ret != 0)
+		rte_exit(EXIT_FAILURE, "failed to set mtu: %d\n", ret);
+
+	ret = rte_eth_dev_start(dconf->portid);
+	if (ret < 0)
+		rte_exit(EXIT_FAILURE, "failed to start port: %d\n", ret);
+
+	if (dconf->timesync) {
+		ret = rte_eth_timesync_enable(dconf->portid);
+		if (ret != 0)
+			rte_exit(EXIT_FAILURE, "failed to enable timesync: %d\n", ret);
+	}
+
+	/* Enable RX in promiscuous mode for the Ethernet device. */
+	ret = rte_eth_promiscuous_enable(dconf->portid);
+	if (ret != 0)
+		rte_exit(EXIT_FAILURE, "failed to enable promiscuous mode: %d\n", ret);
+
+	/* Attach the optional user callbacks to every queue. */
+	for (int i = 0; i < num_threads; i++) {
+		if (dconf->tx_fn != nullptr) {
+			if (rte_eth_add_tx_callback(dconf->portid, i, dconf->tx_fn, dconf->tx_user) == nullptr) {
+				rte_exit(EXIT_FAILURE, "failed to attach callback to tx queue %d\n", i);
+			}
+		}
+
+		if (dconf->rx_fn != nullptr) {
+			if (rte_eth_add_rx_callback(dconf->portid, i, dconf->rx_fn, dconf->rx_user) == nullptr) {
+				rte_exit(EXIT_FAILURE, "failed to attach callback to rx queue %d\n", i);
+			}
+		}
+	}
+
+	// sync_port_clock(portid);
+
+	// %02x keeps leading zeros so every octet of the MAC is two digits
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+	    "port_init: configured port %d on socket %d with mac addr %02x:%02x:%02x:%02x:%02x:%02x\n",
+	    dconf->portid, rte_eth_dev_socket_id(dconf->portid),
+	    addr.addr_bytes[0],
+	    addr.addr_bytes[1],
+	    addr.addr_bytes[2],
+	    addr.addr_bytes[3],
+	    addr.addr_bytes[4],
+	    addr.addr_bytes[5]);
+}
+
+/*
+ * Library entry point: validate the NUMA layout, then create the per-socket
+ * mbuf pools and bring up the port. Exits the process when there are more
+ * sockets than MAX_NUMA_NODES or when socket index i does not have id i.
+ */
+void
+dpdk_init(struct device_conf *dconf, struct mem_conf *mconf)
+{
+	if (rte_socket_count() > (int)MAX_NUMA_NODES) {
+		rte_exit(EXIT_FAILURE, "too many numa nodes\n");
+	}
+
+	// the pool table is indexed by socket id, so index and id must coincide
+	const int nr_sockets = (int)rte_socket_count();
+	for (int idx = 0; idx < nr_sockets; idx++) {
+		const int sid = rte_socket_id_by_idx(idx);
+		if (sid == idx) {
+			continue;
+		}
+		rte_exit(EXIT_FAILURE, "socket %d has id %d instead.\n", idx, sid);
+	}
+
+	mempool_init(mconf);
+
+	port_init(dconf);
+}
+
+/*
+ * Stop and close the port in dconf, then release every mbuf pool created by
+ * mempool_init(). The pool table is reset afterwards so that mempool_get()
+ * returns nullptr instead of a pointer to freed memory.
+ */
+void
+dpdk_cleanup(struct device_conf * dconf)
+{
+	rte_eth_dev_stop(dconf->portid);
+	rte_eth_dev_close(dconf->portid);
+
+	// free only the pools that were actually created
+	for (unsigned int i = 0; i < g_mempool_sz; i++) {
+		rte_mempool_free(g_mempools[i]);
+		g_mempools[i] = nullptr;
+	}
+	g_mempool_sz = 0;
+}
--- a/net/libnetsup/portconf.cc
+++ b/net/libnetsup/portconf.cc
@ -0,0 +1,66 @@
+#include "rte_ethdev.h"
+#include "net/netsup.hh"
+#include <cstdlib>
+
+// Per-driver port settings (offload flags, RSS hash fields, timesync),
+// looked up by DPDK driver name in portconf_get().
+static struct port_conf port_confs[] = {
+	{
+		.driver_name = "net_cxgbe",
+		.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM,
+		.txoffload = RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
+		.rss_hf = RTE_ETH_RSS_UDP | RTE_ETH_RSS_FRAG_IPV4,
+		.timesync = false
+	},
+	{
+		.driver_name = "net_i40e",
+		.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM,
+		.txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
+		.rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD,
+		.timesync = false
+	},
+	{
+		.driver_name = "net_ice",
+		.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM | RTE_ETH_RX_OFFLOAD_TIMESTAMP,
+		.txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
+		.rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD,
+		.timesync = false
+	},
+	{
+		.driver_name = "net_ixgbe",
+		.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM,
+		.txoffload = RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
+		.rss_hf = RTE_ETH_RSS_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP,
+		.timesync = true
+	}
+};
+
+// Fallback settings handed out by portconf_get() when the port's driver name
+// matches none of the entries above.
+static struct port_conf default_conf = {
+	.driver_name = "default",
+	.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM | RTE_ETH_RX_OFFLOAD_TIMESTAMP,
+	.txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
+	.rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD,
+	.timesync = true
+};
+
+// Number of entries in port_confs.
+static const int port_size = sizeof(port_confs) / sizeof(port_confs[0]);
+
+/*
+ * Copy the settings for the driver behind `portid` into *out.
+ *
+ * Returns 0 when the driver name matches an entry in port_confs. Otherwise
+ * a notice is printed, *out receives default_conf and -1 is returned.
+ * Exits the process if the device info cannot be queried.
+ */
+int
+portconf_get(int portid, struct port_conf * out)
+{
+	struct rte_eth_dev_info info {};
+
+	if (rte_eth_dev_info_get(portid, &info) != 0) {
+		rte_exit(EXIT_FAILURE, "failed to obtain device info for port %d\n", portid);
+	}
+
+	struct port_conf *const end = port_confs + port_size;
+	for (struct port_conf *cur = port_confs; cur != end; ++cur) {
+		if (strcmp(cur->driver_name, info.driver_name) != 0) {
+			continue;
+		}
+		memcpy(out, cur, sizeof(*out));
+		return 0;
+	}
+
+	fprintf(stdout, "portconf_get: unable to find matching conf for port %d:%s, returning default conf.\n", portid, info.driver_name);
+	memcpy(out, &default_conf, sizeof(*out));
+	return -1;
+}
--- a/net/rat.cc
+++ b/net/rat.cc
@ -0,0 +1,909 @@
+#include <atomic>
+#include <cstddef>
+#include <cstdlib>
+#include <list>
+#include <map>
+#include <mutex>
+#include <random>
+#include <vector>
+
+#include <sys/endian.h>
+
+#include <topo.h>
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_config.h>
+#include <rte_eal.h>
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+#include <rte_launch.h>
+#include <rte_lcore.h>
+#include <rte_mbuf.h>
+#include <unistd.h>
+
+#include "ntr.h"
+
+#include "gen.hh"
+#include "net/netsup.hh"
+#include "net/pkt.hh"
+#include "nms.h"
+
+// Size of the per-thread rx/tx mbuf arrays and the maximum number of packets
+// requested from rte_eth_rx_burst() or queued for sending in one iteration.
+constexpr static unsigned int BURST_SIZE = 32;
+
+/*
+ * Pack a thread id and a per-thread sequence number into one epoch tag:
+ * the id goes above bit 24, the sequence number occupies the low 24 bits.
+ *
+ * The sequence number is masked to 24 bits so that a long run wraps around
+ * instead of spilling into the id field, which would make epoch_get_id()
+ * return the wrong thread.
+ */
+static unsigned int
+epoch_mk(unsigned int id, unsigned int epoch)
+{
+	return (id << 24) | (epoch & 0x00FFFFFF);
+}
+
+/* Extract the thread id stored above the low 24 bits of an epoch tag. */
+static unsigned int
+epoch_get_id(unsigned int epoch)
+{
+	constexpr unsigned int id_shift = 24;
+
+	return epoch >> id_shift;
+}
+
+/* Extract the sequence number held in the low 24 bits of an epoch tag. */
+static unsigned int
+epoch_get_epoch(unsigned int epoch)
+{
+	constexpr unsigned int seq_mask = (1u << 24) - 1;
+
+	return epoch & seq_mask;
+}
+
+// Bookkeeping record for one load packet, used both for packets sent and for
+// responses received.
+struct epoch_info {
+	unsigned int epoch; // full epoch tag (thread id + sequence number)
+	uint64_t ts;	    // topo_uptime_ns() value when the packet was sent or received
+};
+
+// Per-worker-thread state; one instance is created per CPU in main().
+struct thread_info {
+	unsigned int id { 0 };	     // index of this thread, also embedded in epoch tags
+	unsigned int lcore_id { 0 }; // DPDK lcore the thread is launched on
+	unsigned int rxqid { 0 };    // rx queue polled by this thread
+	unsigned int txqid { 0 };    // tx queue used by this thread
+	int socket_id;		     // socket of lcore_id; set in main() before launch
+	// these counters are read by calc_stats(), possibly from another thread
+	std::atomic<int> recved_pkts { 0 };
+	std::atomic<int> lost_pkts { 0 };
+
+	Generator *ia_gen { nullptr };	  // inter-arrival time generator
+	Generator *load_gen0 { nullptr }; // generator for workload arg0
+	Generator *load_gen1 { nullptr }; // generator for workload arg1
+
+	std::mutex
+	    mtx; // protects recved_epochs, which other worker threads append to
+	std::list<struct epoch_info *> recved_epochs;
+
+	thread_info() = default;
+};
+
+// Values of options_t::s_state, shared by all threads.
+constexpr static int STATE_SYNC = 0;	 // waiting for SYNC
+constexpr static int STATE_SYNC_ACK = 1; // SYNC received, SYNC_ACK not yet sent
+constexpr static int STATE_RUNNING = 2;	 // Running
+constexpr static int STATE_FIN = 3;	 // FIN received or run time elapsed
+
+// Number of workload argument generators (load_gen entries) that can be set.
+constexpr static int WORKLOAD_MAX_ARGS = 2;
+
+// Command-line parameters plus the runtime state shared by all threads.
+struct options_t {
+	// -t: run time; main() counts it in one-usleep(S2US) steps
+	unsigned int run_time { 5 };
+	// parameters
+	int slave_mode { 0 };			// -S: 1 when driven by a master ("cat")
+	uint32_t rage_quit_time { UINT32_MAX }; // -r: rage quit time (in ms)
+	char ia_gen[256] { "fixed:0" };		// -i: inter-arrival distribution spec
+	char load_gen[WORKLOAD_MAX_ARGS][256] = {{"fixed:0"}, {"fixed:0"}};
+	uint32_t workload_type {LOAD_TYPE_CPU};
+	uint32_t target_qps { 0 };  // -q: target QPS, split evenly across threads
+	uint32_t depth { 1 };	    // -D: max number of packets in flight per thread
+	struct net_spec server_spec { };
+	// NOTE(review): mask 0x2 looks like CPU index 1 (zero-based) rather than
+	// "core 2" — confirm against CPUSET_T_INITIALIZER.
+	cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 1 thread
+	uint32_t pkt_loss_delay_ms { UINT32_MAX }; // -l: packet loss time threshold
+	bool jumbo_frame_enabled { false };
+	int pkt_pad_sz { 0 };
+	int port_mtu { MAX_STANDARD_MTU };
+	int portid { 0 };
+
+	// states
+	unsigned int s_num_threads { 1 }; // 1 thread
+	struct net_spec s_host_spec { };
+	struct net_spec s_master_spec { };
+	// connection to the master; both ends point at the specs above
+	struct conn_spec s_master_cspec {
+		.src = &s_host_spec, .src_port = DEFAULT_RAT_PORT,
+		.dst = &s_master_spec, .dst_port = DEFAULT_RAT_PORT,
+	};
+	std::vector<struct thread_info *> s_thr_info;
+	std::atomic<int> s_state { STATE_RUNNING }; // -S changes this to STATE_SYNC
+
+	// states for qps
+	std::atomic<uint64_t> s_ts_begin { 0 }; // start timestamp used by calc_stats()
+};
+
+// The single global options/state instance.
+static struct options_t options;
+
+/*
+ * Sum the per-thread counters and derive the achieved QPS.
+ *
+ * now:        current timestamp in ns, compared against options.s_ts_begin
+ * qps:        out, received packets per second since s_ts_begin (may be null)
+ * recved_pkt: out, total received packets over all threads (may be null)
+ * total_loss: out, total lost packets over all threads (may be null)
+ *
+ * When `now` is not later than s_ts_begin the QPS is reported as 0 rather
+ * than dividing by a zero or wrapped-around interval.
+ */
+static inline void
+calc_stats(uint64_t now, uint32_t *qps, uint32_t *recved_pkt,
+    uint32_t *total_loss)
+{
+	uint32_t recv = 0;
+	uint32_t loss = 0;
+
+	for (auto i : options.s_thr_info) {
+		recv += i->recved_pkts.load();
+		loss += i->lost_pkts.load();
+	}
+
+	if (recved_pkt != nullptr) {
+		*recved_pkt = recv;
+	}
+
+	if (total_loss != nullptr) {
+		*total_loss = loss;
+	}
+
+	if (qps != nullptr) {
+		const uint64_t begin = options.s_ts_begin.load();
+		if (now > begin) {
+			*qps = (uint32_t)((double)(recv) /
+			    ((double)(now - begin) / (double)S2NS));
+		} else {
+			// no time elapsed (or begin was re-stamped after `now`)
+			*qps = 0;
+		}
+	}
+}
+
+/*
+ * Slave-mode handshake: poll this thread's rx queue until the shared state
+ * leaves STATE_SYNC. The thread that wins the SYNC -> SYNC_ACK transition
+ * records the master's net spec from the SYNC packet, replies with SYNC_ACK
+ * and then moves the state to STATE_RUNNING, releasing all threads.
+ */
+static void
+proto_loop(struct thread_info *tinfo)
+{
+	struct rte_mbuf *rx_bufs[BURST_SIZE];
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+	    "proto_loop <thread %d>: waiting for SYNC from cat\n", tinfo->id);
+
+	while (options.s_state.load() == STATE_SYNC) {
+		const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
+		    tinfo->rxqid, rx_bufs, BURST_SIZE);
+
+		for (int i = 0; i < nb_rx; i++) {
+			struct rte_mbuf *mbuf = rx_bufs[i];
+			struct pkt_hdr *hdr = check_valid_packet(mbuf,
+			    &options.s_host_spec.mac_addr);
+
+			// not one of our packets
+			if (hdr == nullptr) {
+				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+				    "proto_loop <thread %d>: ignoring invalid packet %p.\n",
+				    tinfo->id, (void *)mbuf);
+				rte_pktmbuf_free(mbuf);
+				continue;
+			}
+
+			// only SYNC is meaningful during the handshake
+			const uint16_t type = rte_be_to_cpu_16(hdr->type);
+			if (type != PKT_TYPE_SYNC) {
+				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+				    "proto_loop <thread %d>: ignoring invalid packet %p type %d.\n",
+				    tinfo->id, (void *)mbuf, type);
+				rte_pktmbuf_free(mbuf);
+				continue;
+			}
+
+			ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+			    "proto_loop <thread %d>: received SYNC from cat\n",
+			    tinfo->id);
+
+			int expected = STATE_SYNC;
+			if (!options.s_state.compare_exchange_strong(expected,
+				STATE_SYNC_ACK)) {
+				// someone barged in, listen to that guy
+				ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
+				    "proto_loop <thread %d>: failed to cmpxchg sync_recv.\n",
+				    tinfo->id);
+				rte_pktmbuf_free(mbuf);
+				continue;
+			}
+
+			// we own the handshake: remember the master and answer it
+			pkt_hdr_to_netspec(hdr, &options.s_master_spec, nullptr,
+			    nullptr, nullptr);
+
+			struct rte_mbuf *ack_buf;
+			struct pkt_hdr *ack_hdr;
+			if (alloc_pkt_hdr(mempool_get(tinfo->socket_id),
+				PKT_TYPE_SYNC_ACK, &options.s_master_cspec, 0,
+				&ack_buf, &ack_hdr) != 0) {
+				rte_exit(EXIT_FAILURE,
+				    "failed to alloc pkt hdr\n");
+			}
+
+			tx_burst_all(options.portid, tinfo->txqid, &ack_buf, 1);
+
+			// we've done our job, set off the threads
+			expected = STATE_SYNC_ACK;
+			if (!options.s_state.compare_exchange_strong(expected,
+				STATE_RUNNING)) {
+				rte_exit(EXIT_FAILURE,
+				    "state unexpectedly changed\n");
+			}
+
+			ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+			    "proto_loop <thread %d>: sent SYNC_ACK to cat\n",
+			    tinfo->id);
+
+			rte_pktmbuf_free(mbuf);
+		}
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+	    "proto_loop <thread %d>: exiting loop...\n", tinfo->id);
+}
+
+/*
+ * Main load-generation loop of one worker thread; runs while the shared
+ * state is STATE_RUNNING. Each iteration:
+ *   1. drains the rx queue: LOAD_RESP packets are handed to the thread whose
+ *      id is encoded in the epoch tag, a FIN from the master triggers a
+ *      FIN_ACK carrying the aggregated stats and moves the state to FIN;
+ *   2. dequeues epochs queued for this thread and matches them against the
+ *      epochs it sent, counting received packets;
+ *   3. expires sent epochs older than the packet loss threshold;
+ *   4. sends new LOAD packets while the schedule, the in-flight depth and
+ *      BURST_SIZE allow;
+ *   5. exits the process if no response arrived within the rage quit time.
+ */
+static void
+pkt_loop(struct thread_info *tinfo)
+{
+	struct rte_mbuf *tx_bufs[BURST_SIZE];
+	struct rte_mbuf *rx_bufs[BURST_SIZE];
+	std::vector<struct epoch_info *> recved_epochs;
+	std::map<unsigned int, struct epoch_info *> sent_epochs;
+	uint64_t cur_epoch = 0;
+	uint64_t next_ts;
+	uint64_t last_recv_ts = 0;
+	struct conn_spec srv_cspec;
+	rdport_generator src_port_gen(MIN_RANDOM_PORT);
+	rdport_generator dst_port_gen(MIN_RANDOM_PORT);
+
+	// Widen to 64 bits before scaling to ns: both values default to
+	// UINT32_MAX ("never"), which must not wrap in 32-bit arithmetic.
+	const uint64_t loss_timeout_ns = (uint64_t)options.pkt_loss_delay_ms *
+	    MS2NS;
+	const uint64_t rage_quit_ns = (uint64_t)options.rage_quit_time * MS2NS;
+
+	srv_cspec.src = &options.s_host_spec;
+	srv_cspec.dst = &options.server_spec;
+
+	next_ts = topo_uptime_ns();
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "pkt_loop <thread %d>: entering\n",
+	    tinfo->id);
+
+	while (options.s_state.load() == STATE_RUNNING) {
+		uint64_t now = topo_uptime_ns();
+		// always pop incoming packets
+		const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
+		    tinfo->rxqid, rx_bufs, BURST_SIZE);
+
+		for (int i = 0; i < nb_rx; i++) {
+			struct pkt_hdr *each = check_valid_packet(rx_bufs[i],
+			    &options.s_host_spec.mac_addr);
+
+			if (each == nullptr) {
+				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+				    "pkt_loop <thread %d>: ignoring invalid packet %p.\n",
+				    tinfo->id, (void *)rx_bufs[i]);
+				rte_pktmbuf_free(rx_bufs[i]);
+				continue;
+			}
+
+			uint16_t type = rte_be_to_cpu_16(each->type);
+			NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, each,
+			    "locore_main <thread %d>: ", tinfo->id);
+			struct pkt_payload_epoch *pld_epoch;
+			struct epoch_info *einfo;
+			uint32_t epoch;
+			uint32_t id;
+			struct thread_info *other_t;
+			int int_expected = STATE_RUNNING;
+			switch (type) {
+			case PKT_TYPE_LOAD_RESP:
+				pld_epoch = (struct pkt_payload_epoch *)
+						each->payload;
+				epoch = rte_be_to_cpu_32(pld_epoch->epoch);
+				id = epoch_get_id(epoch);
+
+				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+				    "pkt_loop <thread %d>: packet %p epoch 0x%x id %d.\n",
+				    tinfo->id, (void *)rx_bufs[i], epoch, id);
+
+				if (id >= options.s_num_threads) {
+					ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
+					    "pkt_loop <thread %d>: packet %p invalid id %d.\n",
+					    tinfo->id, (void *)rx_bufs[i], id);
+					break;
+				}
+
+				einfo = new struct epoch_info;
+				einfo->epoch = epoch;
+				einfo->ts = now;
+
+				// RSS may deliver the response to any queue:
+				// hand it to the thread that sent the request
+				other_t = options.s_thr_info.at(id);
+				other_t->mtx.lock();
+				other_t->recved_epochs.push_back(einfo);
+				other_t->mtx.unlock();
+
+				break;
+			case PKT_TYPE_FIN:
+				if (rte_is_same_ether_addr(
+					&each->eth_hdr.src_addr,
+					&options.s_master_spec.mac_addr)) {
+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+					    "pkt_loop <thread %d>: recved FIN from cat.\n",
+					    tinfo->id);
+					// master told us to stop!
+					if (!options.s_state
+						 .compare_exchange_strong(
+						     int_expected, STATE_FIN)) {
+						ntr(NTR_DEP_USER1,
+						    NTR_LEVEL_WARNING,
+						    "pkt_loop <thread %d>: failed to cmpxchg state.\n",
+						    tinfo->id);
+					}
+
+					uint32_t qps;
+					uint32_t total_recv;
+					uint32_t total_loss;
+
+					calc_stats(now, &qps, &total_recv,
+					    &total_loss);
+
+					struct pkt_hdr *pkt_hdr;
+					if (alloc_pkt_hdr(
+						mempool_get(tinfo->socket_id),
+						PKT_TYPE_FIN_ACK,
+						&options.s_master_cspec, 0,
+						&tx_bufs[0], &pkt_hdr) != 0) {
+						rte_exit(EXIT_FAILURE,
+						    "failed to allocate pkt hdr\n");
+					}
+
+					auto pld_qps = (struct pkt_payload_qps *)
+							   pkt_hdr->payload;
+					pld_qps->qps = rte_cpu_to_be_32(qps);
+					pld_qps->recved_pkts = rte_cpu_to_be_32(
+					    total_recv);
+					pld_qps->lost_pkts = rte_cpu_to_be_32(
+					    total_loss);
+
+					tx_burst_all(options.portid,
+					    tinfo->txqid, &tx_bufs[0], 1);
+
+					options.s_state.store(STATE_FIN);
+
+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+					    "pkt_loop <thread %d>: sent FIN_ACK to cat. QPS = %d.\n",
+					    tinfo->id, qps);
+				} else {
+					ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
+					    "pkt_loop <thread %d>: invalid FIN packet from a different cat.\n",
+					    tinfo->id);
+				}
+				break;
+			default:
+				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+				    "pkt_loop: ignoring packet %p with unknown type %d.\n",
+				    (void *)rx_bufs[i], type);
+			}
+
+			rte_pktmbuf_free(rx_bufs[i]);
+		}
+
+		// dequeue received epochs
+		struct epoch_info *einfo;
+		tinfo->mtx.lock();
+		while (!tinfo->recved_epochs.empty()) {
+			// only dequeue, process later
+			einfo = tinfo->recved_epochs.front();
+			tinfo->recved_epochs.pop_front();
+
+			// XXX: might call into the allocator
+			// otherwise we need to have an array and do batching
+			// => complex code and don't think it's worth it
+			recved_epochs.push_back(einfo);
+		}
+		tinfo->mtx.unlock();
+
+		if (!recved_epochs.empty())
+			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+			    "pkt_loop <thread %d>: dequeued %lu received epochs\n",
+			    tinfo->id, recved_epochs.size());
+
+		// process epochs
+		while (!recved_epochs.empty()) {
+			einfo = recved_epochs.back();
+			recved_epochs.pop_back();
+
+			auto it = sent_epochs.find(einfo->epoch);
+			if (it != sent_epochs.end()) {
+				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+				    "pkt_loop <thread %d>: received epoch 0x%x\n",
+				    tinfo->id, epoch_get_epoch(einfo->epoch));
+
+				if (einfo->ts > last_recv_ts) {
+					last_recv_ts = einfo->ts;
+				}
+				delete it->second;
+				sent_epochs.erase(it);
+				tinfo->recved_pkts.fetch_add(1);
+			} else {
+				// we recved an epoch we never sent
+				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+				    "pkt_loop <thread %d>: received epoch 0x%x but never sent it. Packet loss?\n",
+				    tinfo->id, einfo->epoch);
+			}
+			delete einfo;
+		}
+
+		// handle packet loss
+		for (auto it = sent_epochs.begin(); it != sent_epochs.end();) {
+			einfo = it->second;
+			if (now - einfo->ts > loss_timeout_ns) {
+				// timed out
+				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+				    "pkt_loop <thread %d>: epoch 0x%x is lost after not receiving for too long\n",
+				    tinfo->id, einfo->epoch);
+
+				delete it->second;
+				it = sent_epochs.erase(it);
+				tinfo->lost_pkts.fetch_add(1);
+			} else {
+				++it;
+			}
+		}
+
+		// check to send the next packet
+		uint32_t total_send = 0;
+		while (now >= next_ts && sent_epochs.size() < options.depth &&
+		    total_send < BURST_SIZE) {
+			struct pkt_payload_load *pld_load;
+			struct pkt_hdr *pkt_data;
+			// 64-bit so that gaps longer than ~2.1s do not overflow
+			next_ts += (int64_t)(tinfo->ia_gen->generate() * S2NS);
+
+			// change dst port for every packet for RSS
+			srv_cspec.dst_port = dst_port_gen.next();
+			srv_cspec.src_port = src_port_gen.next();
+			if (alloc_pkt_hdr(mempool_get(tinfo->socket_id),
+				PKT_TYPE_LOAD, &srv_cspec, options.pkt_pad_sz,
+				&tx_bufs[total_send], &pkt_data) != 0) {
+				rte_exit(EXIT_FAILURE,
+				    "failed to allocate pkt hdr\n");
+			}
+
+			pld_load = (struct pkt_payload_load *)pkt_data->payload;
+			pld_load->type = rte_cpu_to_be_32(options.workload_type);
+			pld_load->arg0 = rte_cpu_to_be_32((uint32_t)tinfo->load_gen0->generate());
+			pld_load->arg1 = rte_cpu_to_be_32((uint32_t)tinfo->load_gen1->generate());
+			unsigned int epoch = epoch_mk(tinfo->id, cur_epoch);
+			pld_load->epoch = rte_cpu_to_be_32(epoch);
+			cur_epoch++;
+
+			einfo = new struct epoch_info;
+			einfo->epoch = epoch;
+			einfo->ts = now;
+			sent_epochs.insert({ epoch, einfo });
+
+			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+			    "pkt_loop <thread %d>: sending packet %p with epoch 0x%x\n",
+			    tinfo->id, (void *)tx_bufs[total_send], epoch);
+
+			total_send++;
+		}
+
+		tx_burst_all(options.portid, tinfo->txqid, tx_bufs, total_send);
+
+		// the rage-quit clock starts on the first pass through the loop
+		if (last_recv_ts == 0) {
+			last_recv_ts = topo_uptime_ns();
+		}
+		if (topo_uptime_ns() > rage_quit_ns + last_recv_ts) {
+			rte_exit(EXIT_FAILURE,
+			    "rat: thread %d waiting too long for resp. I F QUIT!\n",
+			    tinfo->id);
+		}
+	}
+
+	// clean up the epochs that were still in flight
+	for (auto it = sent_epochs.begin(); it != sent_epochs.end();) {
+		delete it->second;
+		++it;
+	}
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+	    "pkt_loop <thread %d>: exiting loop...\n", tinfo->id);
+}
+
+/*
+ * Entry point of a worker lcore (launched via rte_eal_remote_launch).
+ *
+ * tif: the thread's struct thread_info.
+ *
+ * Warns when the port sits on a different NUMA socket than this lcore, runs
+ * the SYNC handshake in slave mode, waits for STATE_RUNNING, stamps the
+ * start time and then runs pkt_loop() until the state changes. Returns 0.
+ */
+static int
+locore_main(void *tif)
+{
+	auto tinfo = (struct thread_info *)tif;
+	uint32_t core_id = rte_lcore_id();
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+	    "locore_main <thread %d>: running on core %d rxqid %d txqid %d...\n", tinfo->id,
+	    core_id, tinfo->rxqid, tinfo->txqid);
+
+	// Socket 0 is a valid socket id, so the check must be >= 0; a negative
+	// value means the port's socket is unknown.
+	const int port_socket = rte_eth_dev_socket_id(options.portid);
+	if (port_socket >= 0 && port_socket != (int)rte_socket_id()) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
+		    "locore_main <thread %d>: WARNING, port %d is on remote NUMA node to "
+		    "polling thread.\n\tPerformance will "
+		    "not be optimal.\n",
+		    tinfo->id, options.portid);
+	}
+
+	if (options.slave_mode == 1) {
+		// perform rat protocol
+		proto_loop(tinfo);
+	}
+
+	// wait for the primary thread sending SYNC_ACK
+	while (options.s_state.load() != STATE_RUNNING) {
+	}
+	// store the current timestamp
+	options.s_ts_begin.store(topo_uptime_ns());
+	pkt_loop(tinfo);
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main <thread %d>: exited\n",
+	    tinfo->id);
+
+	return 0;
+}
+
+/*
+ * Log the effective configuration at INFO level. Unsigned options are
+ * printed with %u; the previous "%ul" printed the value followed by a
+ * literal 'l'.
+ */
+static void
+dump_options()
+{
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+	    "Configuration:\n"
+	    "    verbosity = +%d\n"
+	    "    run time = %u\n"
+	    "    num threads = %u\n"
+	    "    rage quit time = %u\n"
+	    "    slave mode = %d\n"
+	    "    interarrival dist = %s\n"
+	    "    workload type = %u\n"
+	    "    workload arg0 = %s\n"
+	    "    workload arg1 = %s\n"
+	    "    qps = %u\n"
+	    "    host IP = 0x%x\n"
+	    "    depth = %u\n"
+	    "    packet loss time threshold = %u\n"
+	    "    jumbo frame = %d\n"
+	    "    packet pad size = %d\n"
+	    "    portid = %d\n",
+	    ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.run_time,
+	    options.s_num_threads, options.rage_quit_time, options.slave_mode,
+	    options.ia_gen, options.workload_type, options.load_gen[0], options.load_gen[1], options.target_qps,
+	    options.s_host_spec.ip, options.depth, options.pkt_loss_delay_ms,
+	    options.jumbo_frame_enabled, options.pkt_pad_sz, options.portid);
+}
+
+/*
+ * Print the command-line help to stdout. The repeated -w entries are
+ * indented with spaces like every other line (they used to embed a tab,
+ * which rendered misaligned).
+ */
+static void
+usage()
+{
+	fprintf(stdout,
+	    "Usage:\n"
+	    "    -v(vv): verbose mode\n"
+	    "    -h: display the information\n"
+	    "    -t: run time\n"
+	    "    -s: server net spec\n"
+	    "    -S: slave(rat) mode\n"
+	    "    -A: affinity mask\n"
+	    "    -i: inter-arrival time distribution\n"
+	    "    -w: workload type\n"
+	    "    -w (repeated): workload arg0 distribution\n"
+	    "    -w (repeated): workload arg1 distribution\n"
+	    "    -r: rage quit time (in ms)\n"
+	    "    -q: target QPS\n"
+	    "    -H: host net spec\n"
+	    "    -D: max number of packets in flight\n"
+	    "    -l: packet loss time threshold\n"
+	    "    -J: enable jumbo frame\n"
+	    "    -P: pad load packets to this size\n"
+	    "    -p: portid\n");
+}
+
+/*
+ * rat entry point: initialise EAL, parse the rat-specific options that
+ * follow the EAL arguments, bring up libtopo/libnms and the DPDK port,
+ * create one worker per CPU in the affinity set, run until the state
+ * reaches STATE_FIN (run time elapsed, or FIN from the master in slave
+ * mode), then join the workers, log the stats and clean up.
+ */
+int
+main(int argc, char *argv[])
+{
+	// the epoch tag stores the thread id above bit 24 of a 32-bit value,
+	// which leaves 8 bits for the id
+	constexpr unsigned int MAX_THREADS = 1u << 8;
+	struct thread_info *tinfo;
+	bool has_host_spec = false;
+
+	ntr_init();
+
+	// init dpdk
+	int ret = rte_eal_init(argc, argv);
+	if (ret < 0) {
+		rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n");
+	}
+
+	// skip the arguments consumed by EAL
+	argc -= ret;
+	argv += ret;
+
+	// set warning level
+	ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
+	{
+		int c;
+		int num_of_ws = 0;
+		// parse arguments
+		while ((c = getopt(argc, argv,
+			    "vht:s:SA:i:w:r:q:H:D:l:JP:p:")) != -1) {
+			switch (c) {
+			case 'v':
+				ntr_set_level(NTR_DEP_USER1,
+				    ntr_get_level(NTR_DEP_USER1) + 1);
+				break;
+			case 'h':
+				usage();
+				rte_exit(EXIT_SUCCESS, "\n");
+			case 't':
+				options.run_time = strtol(optarg, nullptr, 10);
+				break;
+			case 's':
+				if (str_to_netspec(optarg,
+					&options.server_spec) != 0) {
+					rte_exit(EXIT_FAILURE,
+					    "invalid server net spec\n");
+				}
+				break;
+			case 'S':
+				options.slave_mode = 1;
+				options.s_state =
+				    STATE_SYNC; // set state to wait for SYNC
+				break;
+			case 'A':
+				cpulist_to_cpuset(optarg, &options.cpu_set);
+				options.s_num_threads = CPU_COUNT(
+				    &options.cpu_set);
+				if (options.s_num_threads == 0) {
+					rte_exit(EXIT_FAILURE,
+					    "invalid cpu mask %s\n", optarg);
+				}
+				if (options.s_num_threads > MAX_THREADS) {
+					rte_exit(EXIT_FAILURE,
+					    "too many threads in cpu mask %s (max %u)\n",
+					    optarg, MAX_THREADS);
+				}
+				break;
+			case 'i':
+				strncpy(options.ia_gen, optarg,
+				    sizeof(options.ia_gen) - 1);
+				break;
+			case 'w':
+				// 1st -w: workload type; 2nd/3rd: arg0/arg1
+				// distributions; further ones are ignored
+				if (num_of_ws == 0) {
+					options.workload_type = strtol(optarg, NULL, 10);
+					if (options.workload_type >= LOAD_TYPE_MAX) {
+						rte_exit(EXIT_FAILURE,
+						    "invalid workload type %s\n", optarg);
+					}
+				} else if (num_of_ws <= WORKLOAD_MAX_ARGS) {
+					strncpy(options.load_gen[num_of_ws - 1], optarg, 255);
+				}
+
+				num_of_ws++;
+				break;
+			case 'r':
+				options.rage_quit_time = strtol(optarg, nullptr,
+				    10);
+				break;
+			case 'q':
+				options.target_qps = strtol(optarg, nullptr,
+				    10);
+				break;
+			case 'H':
+				has_host_spec = true;
+				if (str_to_netspec(optarg,
+					&options.s_host_spec) != 0) {
+					rte_exit(EXIT_FAILURE,
+					    "invalid host net spec.\n");
+				}
+				break;
+			case 'D':
+				options.depth = strtol(optarg, nullptr, 10);
+				if (options.depth == 0) {
+					options.depth = UINT32_MAX;
+				}
+				break;
+			case 'l':
+				options.pkt_loss_delay_ms = strtol(optarg,
+				    nullptr, 10);
+				if (options.pkt_loss_delay_ms == 0) {
+					options.pkt_loss_delay_ms = UINT32_MAX;
+				}
+				break;
+			case 'J':
+				options.jumbo_frame_enabled = true;
+				options.port_mtu = MAX_JUMBO_MTU;
+				break;
+			case 'P':
+				options.pkt_pad_sz = strtol(optarg, nullptr,
+				    10);
+				break;
+			case 'p':
+				options.portid = strtol(optarg, nullptr, 10);
+				break;
+			default:
+				usage();
+				// getopt() returns '?' here; the offending
+				// option character is in optopt
+				rte_exit(EXIT_FAILURE, "unknown argument: %c\n",
+				    optopt);
+			}
+		}
+	}
+
+	if (options.pkt_pad_sz != 0 &&
+	    options.pkt_pad_sz > mtu_to_pkt_size(options.port_mtu)) {
+		rte_exit(EXIT_FAILURE, "pkt_pad_sz is too large for mtu %d\n",
+		    options.port_mtu);
+	}
+
+	if (!has_host_spec) {
+		rte_exit(EXIT_FAILURE, "Must specify host IP.\n");
+	}
+
+	// init libtopo
+	if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
+	    0) {
+		rte_exit(EXIT_FAILURE, "libtopo init failed!\n");
+	}
+
+	if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
+	    0) {
+		rte_exit(EXIT_FAILURE, "libnms init failed!\n");
+	}
+
+	dump_options();
+
+	// configure memory and port
+	struct port_conf pconf;
+	struct device_conf dconf;
+	struct mem_conf mconf;
+	// on an unknown driver pconf receives the default configuration
+	portconf_get(options.portid, &pconf);
+	if (!pconf.timesync) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
+		    "main: timesync disabled. hw timestamp unavailable.\n ");
+	}
+	dconf.mtu = options.port_mtu;
+	CPU_COPY(&options.cpu_set, &dconf.core_affinity);
+	dconf.portid = options.portid;
+	dconf.rss_hf = pconf.rss_hf;
+	dconf.rx_offloads = pconf.rxoffload;
+	dconf.tx_offloads = pconf.txoffload;
+	dconf.timesync = pconf.timesync;
+
+	dconf.rx_fn = nullptr;
+	dconf.rx_user = nullptr;
+	dconf.rx_ring_sz = 2048;
+	dconf.tx_fn = nullptr;
+	dconf.tx_user = nullptr;
+	dconf.tx_ring_sz = 2048;
+
+	mconf.cache_size = 512;
+	mconf.priv_size = 0;
+	mconf.num_elements = (dconf.rx_ring_sz + dconf.tx_ring_sz) *
+	    rte_lcore_count() / rte_socket_count();
+	mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_JUMBO_MTU -
+	    MAX_STANDARD_MTU;
+	mconf.max_pools = -1;
+
+	dpdk_init(&dconf, &mconf);
+
+	if (rte_eth_macaddr_get(options.portid,
+		&options.s_host_spec.mac_addr) != 0) {
+		rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
+		    options.portid);
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+	    "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n",
+	    options.portid, options.s_host_spec.mac_addr.addr_bytes[0],
+	    options.s_host_spec.mac_addr.addr_bytes[1],
+	    options.s_host_spec.mac_addr.addr_bytes[2],
+	    options.s_host_spec.mac_addr.addr_bytes[3],
+	    options.s_host_spec.mac_addr.addr_bytes[4],
+	    options.s_host_spec.mac_addr.addr_bytes[5]);
+
+	// Create one thread_info per CPU in the set. CPU_FFS() is 1-based and
+	// returns 0 once the set is empty; the set is consumed as we go.
+	unsigned int cpuset_idx = CPU_FFS(&options.cpu_set);
+	unsigned int tid = 0;
+	while (cpuset_idx != 0) {
+		unsigned int lcore_id = cpuset_idx - 1;
+		tinfo = new thread_info;
+		tinfo->ia_gen = createGenerator(options.ia_gen);
+		tinfo->load_gen0 = createGenerator(options.load_gen[0]);
+		tinfo->load_gen1 = createGenerator(options.load_gen[1]);
+		if (tinfo->ia_gen == nullptr || tinfo->load_gen0 == nullptr || tinfo->load_gen1 == nullptr) {
+			rte_exit(EXIT_FAILURE,
+			    "invalid ia_gen or ld_gen string\n");
+		}
+		// the target QPS is split evenly across the threads
+		tinfo->ia_gen->set_lambda((double)options.target_qps /
+		    (double)(options.s_num_threads));
+		tinfo->id = tid;
+		tinfo->lcore_id = lcore_id;
+		tinfo->socket_id = rte_lcore_to_socket_id(lcore_id);
+		tinfo->rxqid = tid;
+		tinfo->txqid = tid;
+		options.s_thr_info.push_back(tinfo);
+
+		tid++;
+		CPU_CLR(lcore_id, &options.cpu_set);
+		cpuset_idx = CPU_FFS(&options.cpu_set);
+	}
+
+	sleep(INIT_DELAY);
+
+	for (unsigned int i = 0; i < options.s_num_threads; i++) {
+		tinfo = options.s_thr_info.at(i);
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+		    "main: launching thread %d on locore %d\n", tinfo->id,
+		    tinfo->lcore_id);
+		if (rte_eal_remote_launch(locore_main,
+			(void *)options.s_thr_info.at(i),
+			tinfo->lcore_id) != 0) {
+			rte_exit(EXIT_FAILURE,
+			    "failed to launch function on locore %d\n",
+			    tinfo->lcore_id);
+		}
+	}
+
+	// poor man's timer
+	uint32_t second = 0;
+	// this loop exit is signaled by SYNC_FIN in slave mode and by itself in
+	// non slave mode
+	while (options.s_state.load() != STATE_FIN) {
+		if (options.slave_mode != 1) {
+			if (second >= options.run_time) {
+				options.s_state.store(STATE_FIN);
+				break;
+			}
+			usleep(1 * S2US);
+			second++;
+		}
+	}
+
+	for (unsigned int i = 0; i < options.s_num_threads; i++) {
+		tinfo = options.s_thr_info.at(i);
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+		    "main: waiting for locore %d...\n", tinfo->lcore_id);
+		if (rte_eal_wait_lcore(tinfo->lcore_id) != 0) {
+			rte_exit(EXIT_FAILURE, "failed to wait for locore %d\n",
+			    tinfo->lcore_id);
+		}
+	}
+
+	uint32_t qps;
+	uint32_t total_recv;
+	uint32_t total_loss;
+	calc_stats(topo_uptime_ns(), &qps, &total_recv, &total_loss);
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "qps = %d, recv = %d, loss = %d\n",
+	    qps, total_recv, total_loss);
+
+	for (auto each : options.s_thr_info) {
+		delete each->load_gen0;
+		delete each->load_gen1;
+		delete each->ia_gen;
+		delete each;
+	}
+
+	// clean up
+	dpdk_cleanup(&dconf);
+
+	return 0;
+}
--- a/scripts/cc_pin.py
+++ b/scripts/cc_pin.py
@ -0,0 +1,50 @@
+import os
+import sys
+import getopt
+import subprocess
+
+options = getopt.getopt(sys.argv[1:], 'b:s:d:p:')[0]
+
+base=0
+stride=2
+num = 0
+port = 0
+
+for opt, arg in options:
+    if opt == '-b':
+        base = int(arg)
+    elif opt == '-s':
+        stride = int(arg)
+    elif opt == '-d':
+        num = int(arg)
+    elif opt == '-p':
+        port = int(arg)
+result = subprocess.run("sysctl -a", shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+lines = result.stdout.decode().split('\n')
+cclines : list[str] = []
+for line in lines:
+    if ("irq" in line) and (f"t6nex{num}" in line) and (f"{port}a" in line):
+        cclines.append(line)
+
+if len(cclines) == 0:
+    print(f"No t6nex {num}a lines from sysctl.\n")
+    exit(1)
+
+irqs = []
+for line in cclines:
+    eles = line.split(' ')
+    irq = eles[0]
+    if (irq.startswith("irq") and irq.endswith(":")):
+        irq = irq[3:-1]
+        irqs.append(int(irq))
+    else:
+        print(f"Unknown line format: f{line}")
+
+print(f"Detected {len(irqs)} irqs:\n{str(irqs)}")
+
+for irq in irqs:
+    print(f"Setting irq{irq}'s affinity to core {base}...")
+    subprocess.run(f"cpuset -l {base} -x {irq}", check=True, shell=True)
+    base = base + stride
+
+exit(0)
--- a/scripts/copy-mount.sh
+++ b/scripts/copy-mount.sh
@ -0,0 +1,9 @@
+#!/bin/sh
+scp -P77 mount.sh oscar@icelake1-int.rcs.uwaterloo.ca:~/
+scp -P77 mount_small.sh oscar@icelake1-int.rcs.uwaterloo.ca:~/
+scp -P77 mount.sh oscar@milan1-int.rcs.uwaterloo.ca:~/
+scp -P77 mount_small.sh oscar@milan1-int.rcs.uwaterloo.ca:~/
+scp -P77 mount.sh oscar@icelake2-int.rcs.uwaterloo.ca:~/
+scp -P77 mount_small.sh oscar@icelake2-int.rcs.uwaterloo.ca:~/
+scp -P77 mount.sh oscar@milan2-int.rcs.uwaterloo.ca:~/
+scp -P77 mount_small.sh oscar@milan2-int.rcs.uwaterloo.ca:~/
--- a/scripts/dpdk.py
+++ b/scripts/dpdk.py
@ -0,0 +1,230 @@
+from cgi import test
+from site import abs_paths
+import subprocess as sp
+import time
+import select
+import os
+import datetime
+import pwd
+import sys
+import getopt
+import numpy as np
+import re
+
+import libpar as par
+import libtc as tc
+import libmechspec as mechspec
+import netexp
+
+only_max_qps = True
+# [[counter names], counting mode (0 = sampling, 1 = counting)]
+pmc_counters = [
+	"",
+	# [["mem_load_l3_miss_retired.local_dram"], 1],
+	# [["mem_load_l3_miss_retired.remote_dram"], 1],
+	# [["mem_load_l3_miss_retired.remote_hitm"], 1],
+	# [["mem_load_l3_miss_retired.remote_fwd"], 1]
+	# [["mem_trans_retired.load_latency_gt_8"], 0],
+	# [["mem_trans_retired.load_latency_gt_16"], 0],
+	# [["mem_trans_retired.load_latency_gt_32"], 0],
+	# [["mem_trans_retired.load_latency_gt_64"], 0],
+	# [["mem_trans_retired.load_latency_gt_128"], 0],
+	# [["mem_trans_retired.load_latency_gt_256"], 0],
+	# [["mem_trans_retired.load_latency_gt_512"], 0],
+	#[["mem_trans_retired.load_latency_gt_8", ""], 0],
+]
+
+# pkt_pad
+clt_pkt_pads = [
+	0,
+	# 256,
+	# 512,
+	# 1024,
+	# 2048,
+	# 4096,
+	# 8192
+]
+
+clt_pkt_pads_depth = {}
+clt_pkt_pads_depth[0] = 8
+clt_pkt_pads_depth[256] = 6
+clt_pkt_pads_depth[512] = 6
+clt_pkt_pads_depth[1024] = 4
+clt_pkt_pads_depth[1518] = 4
+clt_pkt_pads_depth[2048] = 2
+clt_pkt_pads_depth[4096] = 2
+clt_pkt_pads_depth[8192] = 1
+clt_pkt_pads_depth[9018] = 1
+
+# clt_load
+clt_wrkld = [
+	[0, "fixed:0", "fixed:0"],
+	# [0, "uniform:1000", "fixed:0"],
+	# [0, "uniform:100", "fixed:0"],
+	# [0, "uniform:10", "fixed:0"],
+	# [1, "uniform:480", "uniform:1024"],
+	# [1, "uniform:480", "uniform:256"],
+	# [1, "uniform:480", "uniform:64"]
+]
+
+# paths
+file_dir = os.path.dirname(os.path.realpath(__file__))
+root_dir = os.path.join(file_dir,"..")
+
+# [srv_affinity, OPTIONAL( memgen_affinity, iteration, buffer_size, target_dom )]
+server_affinity = [
+	["1,3,5,7,9,11,13,15,17,19,21,23"],
+	["25,27,29,31,33,35,37,39,41,43,45,47"],
+	#["1,3,5,7,9,11,13,15,17,19,21,23", "26,28,30,32,34,36,38,40,42,44,46", -1, 512*1024*1024, 0],
+	#["25,27,29,31,33,35,37,39,41,43,45,47", "2,4,6,8,10,12,14,16,18,20,22", -1, 512*1024*1024, 1],
+	
+	# "65,67,69,71,73,75,77,79,81,83,85,87,89,91,93,95,97,99,101,103,105,107,109,111,113,115,117,119,121,123,125,127",
+	# "1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63",
+	# "1,3,5,7,9,11,13,15",
+	# "17,19,21,23,25,27,29,31",
+	# "33,35,37,39,41,43,45,47",
+	# "49,51,53,55,57,59,61,63"
+]
+
+def flush_netresult(conf : netexp.NetExpConf, result : netexp.NetExpResult):
+	sample_out = tc.get_odir() + "/" + str(result.parser.qps) + ".txt"
+
+	with open(sample_out, "w") as f:
+		f.write(result.sample)
+
+	if conf.enable_pmc:
+		pmc_out = tc.get_odir() + "/" + str(result.parser.qps) + ".pmc"
+		if conf.pmc_mode != 0:
+			with open(pmc_out, "w") as f:
+				f.write(result.pmc_parser.raw)
+		else:
+			with open(pmc_out, "wb") as f:
+				f.write(result.pmc_parser[0])
+			with open(pmc_out + "_parsed", "w") as g:
+				g.write(result.pmc_parser[1])
+
+	tc.log_print("=== Summary - qps: " + str(result.parser.qps) + " master loss: " + str(float(result.parser.master_loss) / float(result.parser.master_recv + result.parser.master_loss) * 100.00) + "% slave loss: " + str(float(result.parser.slave_loss) / float(result.parser.slave_recv + result.parser.slave_loss) * 100.0) + "%" )
+	tc.log_print("=== Server HW:")
+	tc.log_print(par.mutilate_data.build_mut_output(result.parser.srv_hwlat, [result.parser.qps]) + "\n")
+	tc.log_print("=== Server SW:")
+	tc.log_print(par.mutilate_data.build_mut_output(result.parser.srv_swlat, [result.parser.qps]) + "\n")
+	tc.log_print("=== Client HW:")
+	tc.log_print(par.mutilate_data.build_mut_output(result.parser.clt_hwlat, [result.parser.qps]) + "\n")
+	tc.log_print("=== Client SW:")
+	tc.log_print(par.mutilate_data.build_mut_output(result.parser.clt_swlat, [result.parser.qps]) + "\n")
+	if conf.enable_pmc:
+		if conf.pmc_mode != 0:
+			tc.log_print("=== PMC:")
+			tc.log_print("counter: " + result.pmc_parser.counter + " count: " + str(result.pmc_parser.count) + " cores: " + str(result.pmc_parser.cores))
+
+def main():
+	tc.set_ssh_param("-o StrictHostKeyChecking=no -p77")
+	tc.set_ssh_user("oscar")
+	output_dirname = "run"
+
+	conf = netexp.NetExpConf()
+	conf.srv_mechspec = mechspec.LAB.SKYLAKE1_10G
+	conf.clt_mechspecs = [mechspec.LAB.SKYLAKE3_10G, mechspec.LAB.SKYLAKE5_10G]
+	conf.mst_mechspec = mechspec.LAB.SKYLAKE2_10G
+	conf.finalize_mechspecs()
+	conf.root_dir = "/numam.d/build/bin"
+
+	# server fixed configs
+	conf.srv_port = 0
+	
+	# client fixed configs
+	conf.clt_ia = "exponential"
+	conf.clt_affinity = "1,3,5,7,9,11,13,15,17,19,21,23"
+	conf.clt_port = 0
+	conf.clt_pkt_loss_lat = 5000
+	conf.clt_rage_quit_lat = 5000
+
+	# master fixed configs
+	conf.mst_port = 0
+	conf.mst_warmup = 5
+	conf.mst_duration = 20
+	conf.mst_qps = 100
+	conf.mst_ia = "exponential"
+	conf.mst_pkt_loss_lat = 5000
+	conf.mst_pkt_loss_max = 100
+	conf.mst_affinity = "2"
+
+	# pmc stuff
+	conf.pmc_sampling_rate = 4096
+	conf.pmc_counting_interval = 0.1
+
+	options = getopt.getopt(sys.argv[1:], 'scSD')[0]
+	for opt, arg in options:
+		if opt in ('-s'):
+			netexp.stop_all(conf)
+			return
+		elif opt in ('-c'):
+			conf.enable_client_only=True
+		elif opt in ('-S'):
+			netexp.setup(conf, bench = True, dpdk = False)
+			return
+		elif opt in ('-D'):
+			netexp.setup(conf, bench=False, dpdk=True)
+			return
+
+	tc.init("~/results.d/numam_neo/" + output_dirname + "_" + datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
+	cpcmd = "cp " + __file__ + " " + tc.get_odir() + "/"
+	tc.log_print(cpcmd)
+	sp.check_call(cpcmd, shell=True)
+
+	for eaff in server_affinity:
+		conf.srv_affinity = eaff[0]
+		conf.enable_memgen = False
+		if len(eaff) > 1:
+			conf.enable_memgen = True
+			conf.memgen_affinity = eaff[1]
+			conf.memgen_iteration = eaff[2]
+			conf.memgen_size = eaff[3]
+			conf.memgen_tgtdom = eaff[4]
+		for epad in clt_pkt_pads:
+			conf.clt_pkt_pad = 0
+			conf.clt_pkt_depth = clt_pkt_pads_depth[conf.clt_pkt_pad]
+			for eload in clt_wrkld:
+				conf.clt_wrkld = eload[0]
+				conf.clt_wrkarg0 = eload[1]
+				conf.clt_wrkarg1 = eload[2]
+				for epmc in pmc_counters:
+					conf.enable_pmc = False
+					if len(epmc) > 0:
+						conf.enable_pmc = True
+						conf.pmc_counters = epmc[0]
+						conf.pmc_mode = epmc[1]
+
+					test_name = "affinity" + eaff[0] + "_pad" + str(epad) + "_load" + str(eload[0]) + "," + str(eload[1]) + "," + str(eload[2])
+					if (conf.enable_memgen):
+						test_name += "_memload" + str(eaff[1]) + "," + str(eaff[2]) + "," + str(eaff[3]) + "," + str(eaff[4])
+					if (conf.enable_pmc):
+						test_name += "_pmc" + str(epmc[1]) + "_" + conf.get_pmc_str()
+					tc.begin(test_name)
+					
+					conf.clt_qps = 0
+					tc.log_print("============ " + test_name + " QPS: MAX ============")
+					result : netexp.NetExpResult = netexp.run(conf)
+					flush_netresult(conf, result)
+					max_qps = result.parser.qps
+
+					if conf.enable_client_only:
+						return
+
+					if only_max_qps:
+						continue
+
+					finish = (int)(max_qps - max(conf.mst_qps, 0.01 * max_qps))
+					step = (int)(finish / 10)
+					cur_qps = step
+					while cur_qps <= finish:
+						tc.log_print("============ " + test_name + " QPS: " + str(cur_qps) + " ============")
+						conf.clt_qps = cur_qps
+						result : netexp.NetExpResult = netexp.run(conf)
+						flush_netresult(result)
+						cur_qps += step
+						tc.log_print("")
+			tc.end()
+
+	netexp.stop_all(conf)
+main()
--- a/scripts/graph.py
+++ b/scripts/graph.py
@ -0,0 +1,132 @@
+#!/usr/bin/env python3.6
+
+import pandas as pd
+import matplotlib.pyplot as plt
+from matplotlib import ticker
+import numpy as np
+import sys
+import re
+import os
+import json
+import libpar as par
+import getopt
+import math
+import concurrent.futures as CF
+
+def process_dir(rootdir):
+    ret = []
+    print("Processing directory " + rootdir + " ...")
+    for subdir in os.listdir(rootdir):
+        each_dir = os.path.join(rootdir, subdir)
+        if os.path.isfile(each_dir) and each_dir.endswith(".txt"):
+            output = None
+            try:
+                with open(each_dir, 'r') as f:
+                    if len(f.readlines()) <= 1:
+                        print("Skipping empty file - " + each_dir)
+                        continue
+
+                with open(each_dir, 'r') as f:
+                    output = f.read()
+                    parser = par.khat_parser() 
+                    parser.parse(output)
+                    print("Processed raw data - " + each_dir)
+                    ret.append(parser)
+            except:
+                print("Unrecognized format - " + subdir)
+        
+    print("")
+    return ret
+
+
+marker_map = ["o", "P", "s", "v", "*", "+", "^", "1", "2", "d", "X", "o", "P", "s", "v", "*", "+", "^", "1", "2", "d", "X"]
+color_map = ["xkcd:black", "xkcd:red", "xkcd:blue", "xkcd:green", "xkcd:cyan", "xkcd:purple", "xkcd:orange", "xkcd:salmon", "xkcd:lightgreen", "xkcd:indigo", "xkcd:brown", "xkcd:bubblegum", "xkcd:lavender", "xkcd:maroon", "xkcd:fern", "xkcd:sky", "xkcd:orchid", "xkcd:sienna"]
+parser_idx_labels = ["srv_hw", "srv_sw", "clt_hw", "clt_sw"]
+
+def add_curve(eax, label : str, qps_arr : [], lat_arr : [], marker : str, color : str):
+    df_dict = {}
+    df_dict['qps'] = qps_arr
+    df_dict['lat'] = lat_arr
+
+    df = pd.DataFrame(df_dict)
+    df = df.sort_values('qps')
+    eax.plot('qps', 'lat', data = df, label=label, marker=marker, color=color, markersize=8)
+
+# adds curves (avg and 99th percentile) for a specific parser idx
+def add_curves(rax, label : str, parsers : [], parser_idx : int, marker : str, color : str):
+    qps_arr = []
+    avg_arr = []
+    p99_arr = []
+
+    for parser in parsers:
+        qps_arr.append(parser.qps)
+        each_lat_arr = []
+        each_lat_arr.extend(parser.get_stat_arr(parser_idx))
+        avg_arr.append(np.mean(each_lat_arr))
+        p99_arr.append(np.percentile(each_lat_arr, 99))
+
+    add_curve(rax[0], label, qps_arr, avg_arr, marker, color)
+    add_curve(rax[1], label, qps_arr, p99_arr, marker, color)
+
+
+# generate the graphs for a parser index
+def generate_graph(aff_to_parser : {}, parser_idx : int, fn : str):
+    marker_idx = 0
+    color_idx = 0
+
+    fig, rax = plt.subplots(2, 1)
+    rax[0].set_yscale("log")
+    rax[0].set_title("Average")
+    rax[0].set_xlabel("QPS")
+    rax[0].set_ylabel("Latency (ns)")
+    rax[0].xaxis.get_major_formatter().set_scientific(False)
+    rax[0].yaxis.set_minor_formatter(ticker.ScalarFormatter())
+    rax[1].set_yscale("log")
+    rax[1].set_title("99th percentile")
+    rax[1].set_xlabel("QPS")
+    rax[1].set_ylabel("Latency (ns)")
+    rax[1].xaxis.get_major_formatter().set_scientific(False)
+    rax[1].yaxis.set_minor_formatter(ticker.ScalarFormatter())
+
+    print("Generating graph => " + fn + "...")
+    for aff in aff_to_parser:
+        # each affinity gets a different marker type
+        marker_type = marker_map[marker_idx]
+        color_type = color_map[color_idx]
+        marker_idx += 1
+        color_idx += 1
+
+        print("    Processing affinity " + aff + "...")
+
+        add_curves(rax, aff, aff_to_parser[aff], parser_idx, marker_type, color_type)
+
+    rax[0].legend()
+    rax[1].legend()
+    fig.set_size_inches(23.4, 16.5)
+    plt.savefig(fn, dpi=150)
+    plt.close()
+
+def main():
+    datdir = None
+    options = getopt.getopt(sys.argv[1:], 'd:')[0]
+
+    for opt, arg in options:
+        if opt in ('-d'):
+            datdir = arg
+
+    if datdir == None:
+        raise Exception("Must specify -d parameter")
+
+    dat = {}
+
+    for subdir in os.listdir(datdir):
+        each_dir = os.path.join(datdir, subdir)
+        if not os.path.isfile(each_dir):
+            dat[subdir] = process_dir(each_dir)
+
+    for i in range(len(parser_idx_labels)):
+        generate_graph(dat, i, datdir + "/" + parser_idx_labels[i])
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/histo.py
+++ b/scripts/histo.py
@ -0,0 +1,105 @@
+	
+import pandas as pd
+import matplotlib.pyplot as plt
+import matplotlib.mlab as mlab
+import numpy as np
+import sys
+import re
+import os
+import json
+import getopt
+import math
+import concurrent.futures as CF
+import libpar as par
+
+num_bins = 250  # number of histogram bins passed to plt.hist() in saveplot()
+extra_pct = []  # not referenced anywhere else in this script
+
+def saveplot(fp : str, data : [], title : str):
+    plt.hist(data, num_bins)
+    plt.xlabel("Delay")
+    plt.title(title)
+    plt.ylabel("Frequency")
+    f = plt.gcf()
+    f.set_size_inches(11.69, 8.27)
+    f.savefig(fp + "_" + title + "_" + ".png", dpi=160)
+    plt.clf()
+    print("Generated - " + fp + "_" + title + "_" + ".png")
+
+executor = CF.ProcessPoolExecutor(max_workers=int(os.cpu_count()))  # created at import time; this script only calls executor.shutdown() in main() and never submits work to it
+
+def clean_data(dat: []):
+    ret = []
+    arr = np.array(dat)
+    cutoff = np.percentile(arr, 99)
+    for i in arr:
+        if i <= cutoff:
+            ret.append(i)
+    return ret
+
+def process_file(each_dir):
+    try:
+        print("Processing " + each_dir + " ...")
+        with open(each_dir, 'r') as f:
+            parser = par.khat_parser()
+            parser.parse(f.read())
+
+        sh = []
+        ss = []
+        ch = []
+        cs = []
+        for pt in parser.datapt:
+            sh.append(pt.s_htx - pt.s_hrx)
+            ss.append(pt.s_stx - pt.s_srx)
+            ch.append(pt.c_hrx - pt.c_htx)
+            cs.append(pt.c_srx - pt.c_stx)
+        
+        sh = clean_data(sh)
+        ss = clean_data(ss)
+        ch = clean_data(ch)
+        cs = clean_data(cs)
+
+        saveplot(each_dir, sh, "server_hw_delay")
+        saveplot(each_dir, ss, "server_sw_delay")
+        saveplot(each_dir, ch, "client_hw_delay")
+        saveplot(each_dir, cs, "client_sw_delay")
+
+        # output median, etc.
+        with open(each_dir + "_" + "stats.txt", 'w') as f:
+            f.write("===================== SERVER HW ====================\n")
+            f.write(par.mutilate_data.build_mut_output(sh, [len(sh)]))
+            f.write("\n===================== SERVER SW ====================\n")
+            f.write(par.mutilate_data.build_mut_output(ss, [len(ss)]))
+            f.write("\n===================== CLIENT HW ====================\n")
+            f.write(par.mutilate_data.build_mut_output(ch, [len(ch)]))
+            f.write("\n===================== CLIENT SW ====================\n")
+            f.write(par.mutilate_data.build_mut_output(cs, [len(cs)]))
+
+    except Exception:
+        print("Unexpected error:", sys.exc_info())
+
+def process_dir(rootdir):
+    for subdir in os.listdir(rootdir):
+        each_dir = os.path.join(rootdir, subdir)
+        if os.path.isfile(each_dir):
+            if each_dir.endswith(".txt") or each_dir.endswith(".sample"):
+                process_file(each_dir)
+        else:
+            process_dir(each_dir)
+
+def main():    
+    datdir = None
+    options = getopt.getopt(sys.argv[1:], 'd:')[0]
+
+    for opt, arg in options:
+        if opt in ('-d'):
+            datdir = arg
+
+    if datdir == None:
+        raise Exception("Must specify -d parameter")
+
+    process_dir(datdir)
+    executor.shutdown()
+
+if __name__ == "__main__":
+    main()
--- a/scripts/libs/libmechspec.py
+++ b/scripts/libs/libmechspec.py
@ -0,0 +1,25 @@
+
+class NetSpec:
+    def __init__(self, fqdn, ip, mac) -> None:
+        self.mac = mac
+        self.ip = ip
+        self.fqdn = fqdn
+        self.netspec = ip + "@" + mac
+
+
+class LabNetSpecs:
+    def __init__(self) -> None:
+        self.SKYLAKE1_10G = NetSpec(fqdn = "skylake1.rcs.uwaterloo.ca",ip = "192.168.123.11", mac = "3c:15:fb:62:9b:28")
+        self.SKYLAKE2_10G = NetSpec(fqdn = "skylake2.rcs.uwaterloo.ca",ip = "192.168.123.12", mac = "3c:15:fb:c9:f3:36")
+        self.SKYLAKE3_10G = NetSpec(fqdn = "skylake3.rcs.uwaterloo.ca",ip = "192.168.123.13", mac = "3c:15:fb:c9:f3:4b")
+        self.SKYLAKE4_10G = NetSpec(fqdn = "skylake4.rcs.uwaterloo.ca",ip = "192.168.123.14", mac = "")
+        self.SKYLAKE5_10G = NetSpec(fqdn = "skylake5.rcs.uwaterloo.ca",ip = "192.168.123.15", mac = "3c:15:fb:c9:f3:28")
+        self.SKYLAKE6_10G = NetSpec(fqdn = "skylake6.rcs.uwaterloo.ca",ip = "192.168.123.16", mac = "3c:15:fb:62:9b:2f")
+        self.SKYLAKE7_10G = NetSpec(fqdn = "skylake7.rcs.uwaterloo.ca",ip = "192.168.123.17", mac = "3c:15:fb:c9:f3:44")
+        self.SKYLAKE8_10G = NetSpec(fqdn = "skylake8.rcs.uwaterloo.ca",ip = "192.168.123.18", mac = "3c:15:fb:62:9c:be")
+        self.MILAN1_100G = NetSpec(fqdn = "milan1-int.rcs.uwaterloo.ca",ip = "192.168.123.19", mac = "")
+        self.MILAN1_10G = NetSpec(fqdn = "milan1-int.rcs.uwaterloo.ca",ip = "192.168.123.19", mac = "a0:42:3f:4d:cb:bc")
+        self.ICELAKE2_100G = NetSpec(fqdn = "icelake2-int.rcs.uwaterloo.ca",ip = "192.168.123.20", mac = "")
+        self.ICELAKE2_10G = NetSpec(fqdn = "icelake2-int.rcs.uwaterloo.ca",ip = "192.168.123.20", mac = "")
+
+LAB = LabNetSpecs()
--- a/scripts/libs/libpar.py
+++ b/scripts/libs/libpar.py
@ -0,0 +1,196 @@
+import json
+import numpy as np
+
+class iperf_json_parser:
+    def __init__(self, inputs):
+        self.aggregate_egress_bps = 0
+        self.jsonobjs = []
+        for input in inputs:
+            jsobj = json.loads(input)
+            self.jsonobjs.append(jsobj)
+            each_bps = jsobj['end']['sum_sent']['bits_per_second']
+            self.aggregate_egress_bps += each_bps
+
+class memloadgen_parser:
+    def __init__(self, input, min, max):
+        lines = input.split('\n')
+        if max > len(lines):
+            max = len(lines)
+        if len(lines) <= min:
+            raise Exception("Not enough lines!")
+        if min > max:
+            min = max
+        arr = []
+        for i in range(min, max):
+            arr.append(int(lines[i]))
+        self.bps = np.mean(arr)
+
+
+class pmc_parser:
+    def __init__(self, input):
+        self.raw = input
+        lines = input.split('\n')
+        if len(lines) < 2:
+            raise Exception("Invalid pmc file format")
+        
+        spec = lines[0].strip()
+        if (spec[0] != '#'):
+            raise Exception("Invalid pmc file spec line: \"" + lines[0] + "\"")
+        spec = spec.split(' ')
+        self.cores = len(spec) - 1
+        elements = spec[1].split('/')
+        if (len(elements) != 3):
+            raise Exception("Invalid pmc file spec line: \"" + lines[0] + "\"")
+        self.counter = elements[2].strip()
+
+        last_line = lines[-1]
+        elements = last_line.split(' ')
+        total = 0
+        for e in elements:
+            if (len(e) > 0):
+                total += int(e)
+        self.count = total
+
+class khat_parser:
+    class pt:
+        def __init__(self):
+            self.s_htx = 0
+            self.s_hrx = 0
+            self.s_stx = 0
+            self.s_srx = 0
+            self.c_htx = 0
+            self.c_hrx = 0
+            self.c_stx = 0
+            self.c_srx = 0
+            self.master_total = 0
+            self.master_loss = 0
+            self.slave_total = 0
+            self.slave_loss = 0
+            self.qps = 0
+
+    def __init__(self):
+        self.datapt = []
+        self.srv_hwlat = []
+        self.srv_swlat = []
+        self.clt_hwlat = []
+        self.clt_swlat = []
+        self.lat_idx_arr = []
+        self.lat_idx_arr.append(self.srv_hwlat)
+        self.lat_idx_arr.append(self.srv_swlat)
+        self.lat_idx_arr.append(self.clt_hwlat)
+        self.lat_idx_arr.append(self.clt_swlat)
+
+    def get_stat_arr(self, idx : int):
+        return self.lat_idx_arr[idx]
+
+    def parse(self, output : str):
+        first = True
+        for line in output.splitlines():
+            # the first line is qps
+            cells = line.split(',')
+            if (first):
+                if len(cells) != 5:
+                    raise Exception("Invalid headline:" + line)
+                self.qps = int(cells[0])
+                self.master_recv = int(cells[1])
+                self.master_loss = int(cells[2])
+                self.slave_recv = int(cells[3])
+                self.slave_loss = int(cells[4])
+                first = False
+                continue
+            if len(cells) != 8:
+                raise Exception("Invalid line:" + line)
+            pt = self.pt()
+            pt.c_srx = int(cells[0])
+            pt.c_stx = int(cells[1])
+            pt.c_hrx = int(cells[2])
+            pt.c_htx = int(cells[3])
+            pt.s_srx = int(cells[4])
+            pt.s_stx = int(cells[5])
+            pt.s_hrx = int(cells[6])
+            pt.s_htx = int(cells[7])
+            self.datapt.append(pt)
+            self.srv_hwlat.append(pt.s_htx - pt.s_hrx)
+            self.srv_swlat.append(pt.s_stx - pt.s_srx)
+            self.clt_hwlat.append(pt.c_hrx - pt.c_htx)
+            self.clt_swlat.append(pt.c_srx - pt.c_stx)
+            
+
+class mutilate_data:
+    def __init__(self):
+        self.dat = {}
+        self.qps = 0
+
+    def to_string(self):
+        ret = "Throughput: " + str(self.qps) + "\n" + json.dumps(self.dat)
+        return ret
+
+    @staticmethod
+    def parse_mut_output(output):
+        ret = mutilate_data()
+        succ_qps = False
+        succ_read = False
+        table = [None, "avg", "std", "min", "5th", "10th", "50th", "90th", "95th", "99th"]
+        table_legacy = [None, "avg", "std", "min", "5th", "10th", "90th", "95th", "99th"]
+        for line in output.splitlines():
+            if line.find("Total QPS") != -1:
+                spl = line.split()
+                if len(spl) == 7:
+                    ret.qps = float(spl[3])
+                    succ_qps = True
+                else:
+                    break
+            elif line.find("read") != -1:
+                spl = line.split()
+                if len(spl) == 10:
+                    for i in range(1, len(spl)):
+                        ret.dat[table[i]] = float(spl[i])
+                    succ_read = True
+                elif len(spl) == 9:
+                    for i in range(1, len(spl)):
+                        ret.dat[table_legacy[i]] = float(spl[i])
+                    succ_read = True
+                else:
+                    break
+        
+        if not (succ_qps and succ_read):
+            raise Exception("Failed to parse data")
+
+        return ret
+
+    @staticmethod
+    def parse_mut_sample(fn):
+        f = open(fn, "r")
+        qps = []
+        lat = []
+        lines = f.readlines()
+        for line in lines:
+            entry = line.split()
+            if len(entry) != 2:
+                raise Exception("Unrecognized line: " + line)
+            qps.append(float(entry[0]))
+            lat.append(float(entry[1]))
+        f.close()
+        return qps, lat
+
+
+    # generate mutilate output format
+    @staticmethod
+    def build_mut_output(lat_arr, qps_arr):
+        output = '{0: <10}'.format('#type') + '{0: >10}'.format('avg') + '{0: >10}'.format('std') + \
+                        '{0: >10}'.format('min') + '{0: >10}'.format('5th') + '{0: >10}'.format('10th') + \
+                        '{0: >10}'.format('50th') + '{0: >10}'.format('90th')  + '{0: >10}'.format('95th') + '{0: >10}'.format('99th') + "\n"
+        
+        output += '{0: <10}'.format('read') + '{0: >10}'.format("{:.1f}".format(np.mean(lat_arr))) + ' ' + \
+                        '{0: >10}'.format("{:.1f}".format(np.std(lat_arr))) + ' ' + \
+                        '{0: >10}'.format("{:.1f}".format(np.min(lat_arr))) + ' ' + \
+                        '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 5))) + ' ' + \
+                        '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 10))) + ' ' + \
+                        '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 50))) + ' ' + \
+                        '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 90))) + ' ' + \
+                        '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 95))) + ' ' + \
+                        '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 99))) + ' ' + "\n" \
+
+        output += "\n" + "Total QPS = " + "{:.1f}".format(np.mean(qps_arr)) + " (0 / 0s)"
+
+        return output
--- a/scripts/libs/libtc.py
+++ b/scripts/libs/libtc.py
@ -0,0 +1,189 @@
+import subprocess as sp
+import time
+import select
+import os
+import pwd
+import sys
+import datetime
+import random
+import re
+from threading import Thread 
+
+tc_logfile = None
+	
+def log_print(info):
+	print(info)
+	if tc_logfile != None:
+		tc_logfile.write(info + "\n")
+		tc_logfile.flush()
+
+tc_output_dir=""
+tc_cur_test = ""
+tc_test_id = 0
+
+def init(odir = "./results.d/"):
+	global tc_output_dir
+	tc_output_dir = odir
+	tc_output_dir = os.path.expanduser(tc_output_dir)
+	os.system("mkdir -p " + tc_output_dir)
+	global tc_logfile
+	tc_logfile = open(tc_output_dir + "/log.txt", "w+")
+
+def begin(name):
+	global tc_test_id
+	global tc_cur_test
+	tc_cur_test = name
+	tc_test_id += 1
+	os.system("mkdir -p " + get_odir())
+	log_print("\n===== Test #" + str(tc_test_id) + " - " + tc_cur_test + " started =====")
+
+def end():
+	global tc_cur_test
+	log_print("\n===== Test #" + str(tc_test_id) + " - " + tc_cur_test + " completed =====")
+	tc_cur_test = ""
+
+def get_odir():
+	return tc_output_dir + "/" + tc_cur_test
+
+SCHED_QUEUE = 1
+SCHED_CPU = 2
+SCHED_BEST = 4
+SCHED_FEAT_WS = 1
+def make_sched_flag(sched, args, feat = 0, fargs = 0):
+	return (sched & 0xFF) | (args & 0xFF) << 8 | (feat & 0xFF) << 16 | (fargs & 0xFF) << 24
+
+TUNE_RTSHARE = 2
+TUNE_TFREQ = 1
+def make_tune_flag(obj, val):
+	return (obj & 0xFFFF) | (val & 0xFFFF) << 16 
+
+def get_username():
+    return pwd.getpwuid( os.getuid() )[0]
+
+ssh_param = ""
+def set_ssh_param(para):
+	global ssh_param
+	ssh_param = para
+
+def get_ssh_param():
+	global ssh_param
+	return ssh_param
+
+ssh_user = None
+def set_ssh_user(user):
+	global ssh_user
+	ssh_user = user
+
+def get_ssh_user():
+	global ssh_user
+	return ssh_user
+
+def remote_exec(srv : list[str], cmd : str, blocking=True, check=True) -> sp.Popen:
+	sub = []
+	for s in srv:
+		p = sp.Popen(["ssh " + ssh_param + " " + ((ssh_user + "@") if ssh_user != None else "") + s + " \"" + cmd +"\""], shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
+		sub.append(p)
+	
+	if blocking:
+		for p in sub:
+			p.wait()
+			if check and p.returncode != 0:
+				raise Exception("Command failed " + cmd)
+
+	return sub
+
+
+def check_stderr(p, sel, exclude = []):# -> tuple[bool, list[str]]:
+	max_stderr_rd = 10
+	err = []
+	while sel.poll(1) and max_stderr_rd > 0:
+		err.append(p.stderr.readline().decode().strip())
+		max_stderr_rd = max_stderr_rd - 1
+
+
+	good = True
+	for e in err:
+		e = e.strip()
+		if len(e) == 0:
+			continue
+		
+		good = False
+		for exc in exclude:
+			if exc in e:
+				good = True
+				break
+			
+	return good, err
+
+# stderr threads
+errthr_objs = []  # Thread objects queued by errthr_create() and cleared by errthr_stop()
+errthr_sigstop = False  # set True by errthr_stop() so that thr_check_stderr() loops exit
+errthr_failed = False  # set True by thr_check_stderr() once a watched stderr reports an error
+
+def errthr_get_failed():  # returns the module-level failure flag
+	return errthr_failed
+
+def thr_check_stderr(p : sp.Popen, name: str, exclude):
+	global errthr_failed
+	sel = select.poll()
+	sel.register(p.stderr, select.POLLIN)
+	local_failed = False
+	while(not errthr_sigstop):
+		if (not local_failed):
+			status, err = check_stderr(p, sel, exclude=exclude)
+			if not status:
+				errthr_failed = True
+				local_failed = True
+				log_print("Error detected in \"" + name + "\":")
+				for e in err:
+					log_print("        \"" + e + "\"")
+				log_print("")
+		time.sleep(random.uniform(0.001, 0.1))
+
+def errthr_start():
+	global errthr_sigstop
+	global errthr_failed
+	errthr_sigstop = False
+	errthr_failed = False
+	for thr in errthr_objs:
+		thr.daemon = True
+		thr.start()
+
+def errthr_create(cp, name, exclude = None):
+	global errthr_objs
+	for i in range(len(cp)):
+		errthr_objs.append(Thread(target = thr_check_stderr, args=(cp[i], name[i], exclude)))
+
+def errthr_stop():
+	global errthr_objs
+	global errthr_sigstop
+	errthr_sigstop = True
+	for thr in errthr_objs:
+		thr.join()
+	errthr_objs.clear()
+
+def parse_hostfile(fp):
+	ret = {}
+	fh = open(fp, "r")
+	content = fh.readlines()
+	fh.close()
+	content = [x.strip() for x in content]
+	for line in content:
+		spl = line.split(" ")
+		if len(spl) >= 2:
+			ret[spl[0]] = spl[1]
+			log_print("Parsed: hostname \"" + spl[0] + "\" -> \"" + spl[1] + "\"")
+	return ret
+
+def process_hostnames(names, hosts):
+	ret = []
+	for line in names:
+		if line in hosts:
+			ret.append(hosts[line])
+		else:
+			ret.append(line)
+	return ret
+
+def get_cpuset_core(threads):
+	ret = "cpuset -l 0-" + str(threads * 2 - 1) + " "
+	return ret
--- a/scripts/netexp.py
+++ b/scripts/netexp.py
@ -0,0 +1,340 @@
+import time
+import subprocess as sp
+import os
+
+import libpar as par
+import libtc as tc
+import libmechspec as mechspec
+
+class NetExpResult:
+	"""Container for the artefacts collected after one successful experiment run."""
+	def __init__(self):
+		# khat_parser fed with the sample file contents (set by __keep_result).
+		self.parser = None
+		# PMC data: a par.pmc_parser, or a [raw bytes, processed text] pair; None when PMC is disabled.
+		self.pmc_parser = None
+		# Raw text of the sample file copied back from the master.
+		self.sample = None
+		
+
+class NetExpConf:
+	def __init__(self):
+		self.root_dir = ""
+
+		self.enable_client_only = False
+		self.enable_memgen = False
+
+		self.memgen_affinity = ""
+		self.memgen_iteration = -1
+		self.memgen_size = 512 * 1024 * 1024
+		self.memgen_tgtdom = 1
+
+		self.srv_affinity = ""
+		self.srv_mechspec = None
+		self.srv_port = 0
+		
+		self.clt_qps = 0
+		self.clt_mechspecs = []
+		self.clt_affinity = "1"
+		self.clt_wrkld = 0
+		self.clt_wrkarg0 = "fixed:0"
+		self.clt_wrkarg1 = "fixed:0"
+		self.clt_pkt_loss_lat = 1000
+		self.clt_rage_quit_lat = 1000
+		self.clt_port = 0
+		self.clt_pkt_pad = 0
+		self.clt_pkt_depth = 1
+		self.clt_ia = "exponential"
+
+		self.mst_mechspec = None
+		self.mst_affinity = "2"
+		self.mst_qps = 100
+		self.mst_port = 0
+		self.mst_pkt_loss_lat = 1000
+		self.mst_pkt_loss_max = 1000
+		self.mst_duration = 10
+		self.mst_warmup = 5
+		self.mst_ia = "exponential"
+
+		self.enable_pmc = False
+		self.pmc_counters = []
+		self.pmc_mode = 0 # 0 = sampling
+		self.pmc_sampling_rate = 8192
+		self.pmc_counting_interval = 0.1
+
+	def __build_fqdn_arr(self, ns):
+		ret = []
+		for n in ns:
+			if n != None:
+				ret.append(n.fqdn)
+		return ret
+	
+	def get_pmc_str(self):
+		ret = ""
+		for counter in self.pmc_counters:
+			ret = ret + counter + ","
+		return ret[:-1]
+
+	def calc_client_qps(self):
+		return 0 if self.clt_qps == 0 else (int)((self.clt_qps - self.mst_qps) / len(self.clt_mechspecs))
+
+	def finalize_mechspecs(self):
+		self.clt_fqdns = self.__build_fqdn_arr(self.clt_mechspecs)
+		self.srv_fqdns = self.__build_fqdn_arr([self.srv_mechspec])
+		self.mst_fqdns = self.__build_fqdn_arr([self.mst_mechspec])
+
+# File (under conf.root_dir) that the master is told to write its samples to via "-o".
+__SAMPLE_FN = "sample.txt.tmp"
+# File (under conf.root_dir) that pmcstat on the server writes its output to.
+__PMC_FN = "pmc.txt.tmp"
+
+def __keep_result(conf : NetExpConf):
+	result = NetExpResult()
+
+	target_scp_fn = tc.get_odir() + "/" + __SAMPLE_FN
+	scpcmd = "scp -P77 " + tc.get_ssh_user() + "@" + conf.mst_mechspec.fqdn + ":" + conf.root_dir + "/" + __SAMPLE_FN + " " + target_scp_fn
+	tc.log_print(scpcmd)
+	sp.check_call(scpcmd, shell=True)
+
+	result.parser = par.khat_parser()
+	with open(target_scp_fn, "r") as f:
+		result.sample = f.read()
+		result.parser.parse(result.sample)
+	
+	rmcmd = "rm " + target_scp_fn
+	tc.log_print(rmcmd)
+	sp.check_call(rmcmd, shell=True)
+
+	if conf.enable_pmc:
+		target_pmc_fn = tc.get_odir() + "/" + __PMC_FN
+
+		pmcscpcmd = "scp -P77 " + tc.get_ssh_user() + "@" + conf.srv_mechspec.fqdn + ":" + conf.root_dir + "/" + __PMC_FN + " " + target_pmc_fn
+		tc.log_print(pmcscpcmd)
+		sp.check_call(pmcscpcmd, shell=True)
+
+		if conf.pmc_mode == 0:
+			pmcproccmd = "sudo pmcstat -R " + conf.root_dir + "/" + __PMC_FN  + " -m " + conf.root_dir + "/" + __PMC_FN + ".proc"
+			tc.log_print(pmcproccmd)
+			tc.remote_exec(conf.srv_fqdns, pmcproccmd)
+
+			pmcscpcmd = "scp -P77 " + tc.get_ssh_user() + "@" + conf.srv_mechspec.fqdn + ":" + conf.root_dir + "/" + __PMC_FN + ".proc" + " " + target_pmc_fn + ".proc"
+			tc.log_print(pmcscpcmd)
+			sp.check_call(pmcscpcmd, shell=True)
+		
+			if conf.pmc_mode != 0:
+				with open(target_pmc_fn, "r") as f:
+					result.pmc_parser = par.pmc_parser(f.read())
+			else:
+				with open(target_pmc_fn, "rb") as f:
+					with open(target_pmc_fn + ".proc", "r") as g:
+						result.pmc_parser = [f.read(), g.read()]
+
+				rmcmd = "rm " + target_pmc_fn + ".proc"
+				tc.log_print(rmcmd)
+				sp.check_call(rmcmd, shell=True)
+		
+		rmcmd = "rm " + target_pmc_fn
+		tc.log_print(rmcmd)
+		sp.check_call(rmcmd, shell=True)
+
+	return result
+
+def stop_all(conf : NetExpConf):
+	# stop clients
+	tc.log_print("Stopping clients...")
+	tc.remote_exec(conf.clt_fqdns, "sudo killall -9 rat; sudo killall -9 cat; sudo killall -9 khat; sudo killall -9 memloadgen", check=False)
+
+	# stop master
+	tc.log_print("Stopping master...")
+	tc.remote_exec(conf.mst_fqdns, "sudo killall -9 rat; sudo killall -9 cat; sudo killall -9 khat; sudo killall -9 memloadgen", check=False)
+
+	if not conf.enable_client_only:
+		# stop server
+		tc.log_print("Stopping server...")
+		tc.remote_exec(conf.srv_fqdns, "sudo killall -9 rat; sudo killall -9 cat; sudo killall -9 khat; sudo killall -9 memloadgen", check=False)
+	
+	if conf.enable_pmc:
+		tc.log_print("Stopping server PMC...")
+		tc.remote_exec(conf.srv_fqdns, "sudo killall -9 pmcstat", check=False)
+
+
+def __run_setup_cmd(conf : NetExpConf, cmd : str, desc : str):
+	all = []
+	all.extend(conf.srv_fqdns)
+	all.extend(conf.clt_fqdns)
+	all.extend(conf.mst_fqdns)
+
+	ssrv : list[tuple[str, sp.Popen]] = []
+	for s in all:
+		tc.log_print(f"Running \'{desc}\' on {s}...")
+		ssrv.append((s, tc.remote_exec([s], cmd, blocking=False, check=False)[0]))
+
+	for p in ssrv:
+		_ , stderr = p[1].communicate()
+		if p[1].returncode != 0:
+			print(f"{ p[0] } \'{desc}\' failed. stderr:\n{stderr.decode()}\n")
+		else:
+			print(f"{ p[0] } \'{desc}\' succeeded")
+
+def setup(conf : NetExpConf, bench : False, dpdk : False):
+	libtopo_path = "/libtopo"
+	dpdk_path = "/dpdk"
+	bench_path = "/numam.d"
+	if dpdk:
+		setup_cmd = f'''sudo rm -rf {libtopo_path}; sudo rm -rf /usr/local/include/libtopo; 
+					sudo rm -rf /usr/local/lib/libtopo; 
+					sudo mkdir -p {libtopo_path}; 
+					sudo chmod 777 {libtopo_path}; 
+					cd {libtopo_path}; 
+					git clone https://git.quacker.org/d/libtopo; 
+					cd libtopo; 
+					mkdir build; 
+					cd build; 
+					cmake ../; 
+					sudo make install'''
+		__run_setup_cmd(conf, setup_cmd, "dpdk - libtopo")
+		setup_cmd = f'''sudo pkg install -y meson pkgconf py39-pyelftools; 
+					sudo rm -rf {dpdk_path} 
+					sudo mkdir -p {dpdk_path}; 
+					sudo chmod 777 {dpdk_path}; 
+					cd {dpdk_path}; 
+					git clone https://git.quacker.org/d/numam-dpdk; 
+					cd numam-dpdk; 
+					git checkout migration; 
+					CC=gcc CXX=g++ meson -Denable_kmods=true build; 
+					cd build; 
+					sudo ninja install'''
+		__run_setup_cmd(conf, setup_cmd, "dpdk - dpdk")
+	if bench:
+		setup_cmd = f'''sudo rm -rf {bench_path};
+						sudo mkdir -p {bench_path};
+						sudo chmod 777 {bench_path}'''
+		__run_setup_cmd(conf, setup_cmd, "bench - remove")
+		all = []
+		all.extend(conf.srv_fqdns)
+		all.extend(conf.clt_fqdns)
+		all.extend(conf.mst_fqdns)
+		dir = f"{os.path.dirname(__file__)}/../"
+		for clt in all:
+			print("Syncing files to " + clt + "...")
+			rsync_cmd = f"rsync -az --no-perms --rsync-path=\"sudo rsync\" --omit-dir-times -e \"ssh -p77\" {dir} {tc.get_ssh_user()}@{clt}:{bench_path}/"
+			sp.check_call(rsync_cmd, shell=True)
+		setup_cmd = f'''cd {bench_path};
+						sudo rm -rf build;
+						mkdir build;
+						cd build;
+						cmake ../;
+						make -j8 khat cat rat memloadgen'''
+		__run_setup_cmd(conf, setup_cmd, "bench - compile")
+
+def run(conf : NetExpConf):
+	stop_all(conf)
+	while True:
+		server_cmd = "sudo "
+		if conf.enable_pmc:
+			if conf.pmc_mode != 0:
+				pmc_cmd = "sudo pmcstat -C -w " + str(conf.pmc_counting_interval) + " -s " + conf.get_pmc_str() + " -o " + conf.root_dir + "/" + __PMC_FN
+			else:
+				pmc_cmd = "sudo pmcstat -n " + str(conf.pmc_sampling_rate) + " -S " + conf.get_pmc_str() + " -O " + conf.root_dir + "/" + __PMC_FN
+			tc.log_print("Starting server PMC...")
+			tc.log_print(pmc_cmd)
+			spmc = tc.remote_exec(conf.srv_fqdns, pmc_cmd, blocking=False)
+
+		server_cmd += conf.root_dir + "/khat --log-level lib.eal:err -- -A " + conf.srv_affinity + \
+				" -H " + conf.srv_mechspec.netspec + " -p " + str(conf.srv_port)
+		if int(conf.clt_pkt_pad) > 1518:
+			server_cmd += " -J "
+		if conf.enable_client_only:
+			ssrv = None
+			tc.log_print(server_cmd)
+		else:
+			# start server
+			tc.log_print("Starting server...")
+			tc.log_print(server_cmd)
+			ssrv = tc.remote_exec(conf.srv_fqdns, server_cmd, blocking=False)
+		
+		if conf.enable_memgen:
+			memgen_cmd = "sudo " + conf.root_dir + "/memloadgen -b " + str(conf.memgen_size) + " -s " + conf.memgen_affinity + \
+				" -i " + str(conf.memgen_iteration) + " -d " + str(conf.memgen_tgtdom)
+			tc.log_print("Starting memloadgen...")
+			tc.log_print(memgen_cmd)
+			smem = tc.remote_exec(conf.srv_fqdns, memgen_cmd, blocking=False)
+
+		# start clients
+		tc.log_print("Starting clients...")
+		sclt = []
+		sclt_name = []
+		for i in range(len(conf.clt_fqdns)):
+			client_cmd = "sudo " + conf.root_dir + "/rat --log-level lib.eal:err -- -S -A " + conf.clt_affinity + \
+				" -i " + conf.clt_ia + \
+				" -q " + str(conf.calc_client_qps()) + \
+				" -H " + conf.clt_mechspecs[i].netspec + \
+				" -s " + conf.srv_mechspec.netspec + \
+				" -r " + str(conf.clt_rage_quit_lat) + \
+				" -l " + str(conf.clt_pkt_loss_lat) + \
+				" -w " + str(conf.clt_wrkld) + \
+				" -w " + str(conf.clt_wrkarg0) + \
+				" -w " + str(conf.clt_wrkarg1) + \
+				" -P " + str(conf.clt_pkt_pad) + \
+				" -D " + str(conf.clt_pkt_depth) + \
+				" -p " + str(conf.clt_port)
+			if int(conf.clt_pkt_pad) > 1518:
+				client_cmd += " -J "
+			tc.log_print(client_cmd)
+			sclt.append(tc.remote_exec([conf.clt_fqdns[i]], client_cmd, blocking=False)[0])
+			sclt_name.append(conf.clt_fqdns[i])
+
+		time.sleep(5)
+		# start master
+		tc.remote_exec
+		tc.log_print("Starting master...")
+		master_cmd = "sudo " + conf.root_dir + "/cat --log-level lib.eal:err -- " + \
+							  " -s " + conf.srv_mechspec.netspec + \
+							  " -o " + conf.root_dir + "/" + __SAMPLE_FN + \
+							  " -t " + str(conf.mst_duration) + \
+							  " -T " + str(conf.mst_warmup) + \
+							  " -i " + conf.mst_ia + \
+							  " -q " + str(conf.mst_qps) + \
+							  " -l " + str(conf.mst_pkt_loss_lat) + \
+							  " -L " + str(conf.mst_pkt_loss_max) + \
+							  " -A " + conf.mst_affinity + \
+							  " -H " + conf.mst_mechspec.netspec + \
+							  " -p " + str(conf.mst_port)
+		for clt in conf.clt_mechspecs:
+			master_cmd += " -S " + clt.netspec
+		tc.log_print(master_cmd)
+		sp = tc.remote_exec(conf.mst_fqdns, master_cmd, blocking=False)
+		p = sp[0]
+
+		# launch stderr monitoring thread
+		exclude = ["Pseudo-terminal", "ice_", "i40e_"]
+		tc.errthr_create([p], conf.mst_fqdns, exclude)
+		if not conf.enable_client_only:
+			tc.errthr_create(ssrv, conf.srv_fqdns, exclude)
+		tc.errthr_create(sclt, sclt_name, exclude)
+		if conf.enable_memgen:
+			tc.errthr_create(smem, ["memloadgen"], exclude)
+		if conf.enable_pmc:
+			tc.errthr_create(spmc, ["pmcstat"], exclude)
+		tc.errthr_start()
+		success = False
+		cur = 0
+		# selec = select.poll()
+		# selec.register(p.stdout, select.POLLIN)
+		while True:
+			# either failed or timeout
+			# we use failure detection to save time for long durations
+			if tc.errthr_get_failed() or cur >= (conf.mst_warmup + conf.mst_duration) * 3:
+				break
+
+			# while selec.poll(1):
+			# 	print(p.stdout.readline())
+			
+			if p.poll() != None:
+				success = True
+				break
+
+			time.sleep(1)
+			cur = cur + 1
+
+		stop_all(conf)
+		tc.errthr_stop()
+		tc.log_print("Cooling down...")
+		time.sleep(5)
+
+		if success:
+			return __keep_result(conf)
--- a/scripts/storage/parse.py
+++ b/scripts/storage/parse.py
@ -0,0 +1,112 @@
+#!/usr/bin/env python3.6
+
+import numpy as np
+import sys
+import re
+import os
+import json
+import getopt
+import math
+import concurrent.futures as CF
+
+# One entry per reported metric:
+# (CSV header label, DatObj attribute name, format spec handed to format()).
+columns = [
+    ("Req per second", "rps", ".2f"),
+    ("Bytes per second", "bps", ".2f"),
+    ("Average Latency", "lat_avg", ".2f"),
+    ("50th Latency", "lat_50", ".0f"),
+    ("95th Latency", "lat_95", ".0f"),
+    ("99th Latency", "lat_99", ".0f"),
+    ("Latency stddev", "lat_std", ".2f")
+]
+
+# Divisor that turns a sample count into requests per second.
+# NOTE(review): presumably the measured run time in seconds (run time minus warm-up) — confirm against the test scripts.
+TIME = 30
+# Multiplier that turns requests per second into bytes per second.
+REQ_SZ = 4096
+
+class DatObj:
+    def __init__(self, raw : list, time : int, req_sz : int):
+        self.raw = raw
+        self.rps = len(raw) / time
+        self.bps = self.rps * req_sz
+        self.lat_avg = np.average(self.raw)
+        self.lat_99 = np.percentile(self.raw, 99)
+        self.lat_95 = np.percentile(self.raw, 95)
+        self.lat_50 = np.percentile(self.raw, 50)
+        self.lat_std = np.std(self.raw)
+
+def parse_file(lines : list, time : int, req_sz : int) -> DatObj :
+    raw = []
+    for line in lines:
+        if len(line) > 0:
+            raw.append(int(line))
+    return DatObj(raw, time, req_sz)
+
+def output_col():
+    ret = "Benchmark"
+    for name,_,_ in columns:
+        ret = ret + "," + name + "," + name + " (NUMA)" + "," + "% change"
+    return ret
+
+def get_attr_or_none(obj, attr):
+    if (obj != None):
+        val = getattr(obj, attr)
+    else:
+        val = None
+    return val
+
+def output_objs(name: str, obj : DatObj, obj_numa : DatObj):
+    ret = name
+    for _, attr, fmt in columns:
+        val = get_attr_or_none(obj, attr)
+        val_numa = get_attr_or_none(obj_numa, attr)
+ 
+        ret = ret + "," + (format(val, fmt) if val != None else "N/A") 
+        ret = ret + "," + (format(val_numa, fmt) if val_numa != None else "N/A") 
+        
+        if val == None or val_numa == None:
+            ret = ret + "," + "N/A"
+        else:
+            ret = ret + "," + format((val_numa - val) / val * 100, ".2f") + "%"
+    return ret
+
+def process_file(f : str, obj_map):
+    with open(f, "r") as fp:
+        lines = fp.readlines()
+    
+    bench_name = os.path.basename(f)
+    obj_map[bench_name] = parse_file(lines, TIME, REQ_SZ)
+    print("Processed file " + f + ". Benchmark name: " + bench_name)
+
+def process_dir(path : str, obj_map):
+    files = [os.path.abspath(os.path.join(path, x)) for x in os.listdir(path)]
+    for f in files:
+        if (".sh" in f):
+            continue
+        if (os.path.isfile(f)):
+            process_file(f, obj_map)
+
+def main():
+    datdir = None
+    options = getopt.getopt(sys.argv[1:], 'd:')[0]
+
+    for opt, arg in options:
+        if opt in ('-d'):
+            datdir = arg
+
+    if datdir == None:
+        raise Exception("Must specify -d parameter")
+
+    obj_map = dict()
+    process_dir(datdir, obj_map)
+
+    with open("results.csv", "w") as f:
+        f.write(output_col())
+        f.write("\n")
+
+        for bench in obj_map:
+            if bench.endswith("_numa"):
+                continue
+            f.write(output_objs(bench, obj_map[bench], obj_map.get(bench+"_numa")))
+            f.write("\n")
+
+if __name__ == "__main__":
+    main()
--- a/scripts/storage/test_posix.sh
+++ b/scripts/storage/test_posix.sh
@ -0,0 +1,19 @@
+# rand_read
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P R,100 -Q 3 -o rand_read
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P R,100 -Q 3 -o rand_read_numa
+
+# rand_write
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P R,0 -Q 3 -o rand_write
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P R,0 -Q 3 -o rand_write_numa
+
+# mono_read
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P M,100 -Q 3 -o mono_read
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P M,100 -Q 3 -o mono_read_numa
+
+# mono_write
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P M,0 -Q 3 -o mono_write
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P M,0 -Q 3 -o mono_write_numa
+
+# mixed
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P R,70 -Q 3 -o mixed_read
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P R,70 -Q 3 -o mixed_read_numa
--- a/scripts/storage/test_spdk_bdev.sh
+++ b/scripts/storage/test_spdk_bdev.sh
@ -0,0 +1,19 @@
+# rand_read
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,100 -Q 3 -o rand_read -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,100 -Q 3 -o rand_read_numa -k bdev
+
+# rand_write
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,0 -Q 3 -o rand_write -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,0 -Q 3 -o rand_write_numa -k bdev
+
+# mono_read
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P M,100 -Q 3 -o mono_read -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P M,100 -Q 3 -o mono_read_numa -k bdev
+
+# mono_write
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P M,0 -Q 3 -o mono_write -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P M,0 -Q 3 -o mono_write_numa -k bdev
+
+# mixed
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,70 -Q 3 -o mixed_read -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,70 -Q 3 -o mixed_read_numa -k bdev
--- a/scripts/storage/test_spdk_nvme.sh
+++ b/scripts/storage/test_spdk_nvme.sh
@ -0,0 +1,19 @@
+# rand_read
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,100 -Q 3 -o rand_read -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,100 -Q 3 -o rand_read_numa -k bdev
+
+# rand_write
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,0 -Q 3 -o rand_write -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,0 -Q 3 -o rand_write_numa -k bdev
+
+# mono_read
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P M,100 -Q 3 -o mono_read -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P M,100 -Q 3 -o mono_read_numa -k bdev
+
+# mono_write
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P M,0 -Q 3 -o mono_write -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P M,0 -Q 3 -o mono_write_numa -k bdev
+
+# mixed
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,70 -Q 3 -o mixed_read -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,70 -Q 3 -o mixed_read_numa -k bdev
--- a/storage/birb.cc
+++ b/storage/birb.cc
@ -0,0 +1,797 @@
+#include <sys/endian.h>
+#include <sys/errno.h>
+#include <sys/types.h>
+#include <x86/_stdint.h>
+#include <getopt.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <threads.h>
+#include <unistd.h>
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <chrono>
+#include <list>
+#include <set>
+
+#include "rte_lcore.h"
+#include "spdk/cpuset.h"
+#include "spdk/stdinc.h"
+#include "spdk/thread.h"
+#include "spdk/env.h"
+#include "spdk/event.h"
+#include "spdk/log.h"
+#include "spdk/string.h"
+
+#include "gen.hh"
+#include "ntr.h"
+#include "defs.hh"
+#include "nm.hh"
+#include "storage/io_gen.hh"
+#include "storage/drivers/driver.hh"
+#include "storage/drivers/bdev.hh"
+#include "storage/drivers/nvme.hh"
+
+/* Current std::chrono::high_resolution_clock reading, in nanoseconds since its epoch. */
+static inline uint64_t get_cur_ts_nano()
+{
+    using namespace std::chrono;
+    const auto since_epoch = high_resolution_clock::now().time_since_epoch();
+    return duration_cast<nanoseconds>(since_epoch).count();
+}
+
+/*
+ * Command-line options, shared by the main thread and every worker thread,
+ * together with the size limits of their fixed-length string fields.
+ */
+// Buffer sizes (terminator included) of the string options below.
+static constexpr unsigned long MAX_SPEC_LEN = 32;
+static constexpr unsigned long MAX_DEV_NAME_LEN = 32;
+static constexpr unsigned long MAX_OUTPUT_FILE_LEN = 256;
+struct options_t {
+	// args
+	int verbosity = NTR_LEVEL_DEFAULT;
+	// number of worker threads; derived from cpumask by the -a option
+	int num_threads = 1;
+	// worker thread spec given with -a
+	unsigned long cpumask = 1;
+	// IO request pattern (-P)
+	char pattern_spec[MAX_SPEC_LEN] = "R,100";
+	// inter-arrival time distribution (-I)
+	char ia_spec[MAX_SPEC_LEN] = "fixed";
+	
+	// total run time (-t) and warm-up time (-w)
+	unsigned int time = 5;
+	unsigned int warmup = 2;
+	// IO request queue depth (-Q)
+	unsigned int queue_depth = 1;
+	// device name (-D) and driver name (-k)
+	char dev_name[MAX_DEV_NAME_LEN] = "Malloc0";
+	char driver_name[MAX_DEV_NAME_LEN] = "bdev";
+	// read percentage and addressing mode handed to io_generator
+	unsigned int read_pct = 0;
+	io_generator_address_mode addr_mode = IOGEN_ADDR_UNIFORM_RANDOM;
+
+	// latency response output file (-o)
+	char output_file[MAX_OUTPUT_FILE_LEN] = "output.txt";
+
+	// IO request size (-b) and IO requests per second (-q)
+	unsigned long req_size = 4096;
+	unsigned long rps = 0;
+};
+
+// Callback state of the main thread.
+struct main_thread_cb_vars {
+	// incremented by cb_notify_main_init()
+	uint32_t worker_thread_init_cnt;
+	// incremented by cb_notify_main_stop()
+	uint32_t worker_thread_stop_cnt;
+};
+
+// Callback state of one worker thread.
+struct worker_thread_cb_vars {
+	// set to 1 by cb_notify_worker_start()
+	uint32_t worker_start;
+	// set to 1 by cb_notify_worker_stop()
+	uint32_t worker_stop;
+	// context of the worker that owns this state
+	struct thread_context * ctx;
+	// idle request objects; worker_io_complete() returns finished requests here
+	std::list<struct io_request *> * free_ios;
+};
+
+// Per-thread pointer to the callback state; callbacks cast it to
+// main_thread_cb_vars or worker_thread_cb_vars depending on which side they serve.
+static __thread void * cb_vars;
+// Program options, filled in by parse_arg().
+static struct options_t options;
+
+// Timestamps of one successfully completed IO, as taken with get_cur_ts_nano().
+struct io_record {
+	// time the request was issued
+	uint64_t start_ts;
+	// time the completion callback ran
+	uint64_t end_ts;
+};
+
+// One reusable IO request slot.
+struct io_request {
+	// issue time, copied into the io_record on completion
+	uint64_t start_ts;
+	// operation type of the request currently using this slot
+	io_generator_opcode op;
+	// buffer that completed reads are copied into
+	char * user_buf;
+	// buffer handed to the driver for the transfer
+	char * dma_buf;
+};
+
+// Per-worker context shared between the main thread and one worker thread.
+struct thread_context {
+	unsigned int tid;
+	// core the worker's SPDK thread is bound to
+	unsigned int coreid;
+	// socket used for the worker's buffer allocations
+	unsigned int sockid;
+	pthread_t sys_thread;
+	// SPDK thread of the main thread; target of the worker's notifications
+	struct spdk_thread * main_thread;
+	birb_driver * driver;
+
+	// offset added to every generated IO offset, and the length handed to io_generator
+	unsigned long start_region_offset;
+	unsigned long start_region_length;
+
+	/* modified by worker threads */
+	struct spdk_thread * sp_thread;
+	// one record per successfully completed IO
+	std::list<io_record *> *io_records;
+	// statistics of the time between consecutive worker loop iterations
+	uint64_t overhead_avg;
+	uint32_t overhead_cnt;
+	uint64_t overhead_max;
+	uint64_t overhead_min;
+};
+
+/*
+ * Log all current option values at INFO level.
+ *
+ * Each conversion specifier matches the type of its argument: unsigned
+ * fields use %u, and the addressing-mode enum is cast to int explicitly
+ * before being passed through the variadic call.
+ */
+static void dump_options()
+{
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: Options:\n"
+					"    dev name: %s\n"
+					"    driver name: %s\n"
+					"    worker threads: 0x%lx\n"
+					"    number of threads: %d\n"
+					"    IO request size: %lu\n"
+					"    IO requests per second: %lu\n"
+					"    IO pattern: %s\n"
+					"    IO queue depth: %u\n"
+					"    IO addressing mode: %d\n"
+					"    read percent: %u\n"
+					"    inter-arrival dist: %s\n"
+					"    run time: %u\n"
+					"    warmup time: %u\n"
+					"    output file: %s\n",
+					options.dev_name,
+					options.driver_name,
+					options.cpumask,
+					options.num_threads,
+					options.req_size,
+					options.rps,
+					options.pattern_spec,
+					options.queue_depth,
+					static_cast<int>(options.addr_mode),
+					options.read_pct,
+					options.ia_spec,
+					options.time,
+					options.warmup,
+					options.output_file
+	);
+}
+
+/* Print the list of supported command-line options to stdout. */
+static void usage()
+{
+	static const char help_text[] =
+		" -V(VV): verbose mode\n"
+		" -D: dev name\n"
+		" -k: driver to use (default bdev)\n"
+		" -a: worker threads spec (0x3 = spawn 2 threads on core 1 & 2)\n"
+		" -b: IO request size\n"
+		" -q: IO requests per second\n"
+		" -P: IO request pattern\n"
+		" -Q: IO request queue depth\n"
+		" -I: inter-arrival time distribution\n"
+		" -t: total run time\n"
+		" -w: warm up time\n"
+		" -o: latency response output file\n";
+
+	fputs(help_text, stdout);
+}
+
+/*
+ * Handle one command-line option and store its value in the global options.
+ *
+ * c   - option character.
+ * arg - option argument passed by the caller.
+ *
+ * Returns 0 on success and EINVAL for -h, for an unknown option, or for a
+ * worker mask that selects no CPU.
+ *
+ * String options are copied with snprintf so the destination is always
+ * NUL-terminated; over-long values are truncated to the buffer size
+ * instead of leaving an unterminated buffer as strncpy would.
+ *
+ * NOTE(review): as before, some cases read `arg` and others the global
+ * `optarg`; presumably the caller passes optarg as arg - confirm.
+ */
+static int parse_arg(int c, char *arg)
+{
+	switch (c) {
+	case 'V':
+		ntr_set_level(NTR_DEP_USER1,
+			ntr_get_level(NTR_DEP_USER1) + 1);
+		break;
+	case 'D':
+		snprintf(options.dev_name, sizeof(options.dev_name), "%s", arg);
+		break;
+	case 'k':
+		snprintf(options.driver_name, sizeof(options.driver_name), "%s", arg);
+		break;
+	case 'a':
+		options.cpumask = strtoull(optarg, nullptr, 16);
+		options.num_threads = cmask_get_num_cpus(
+			options.cpumask);
+
+		if (options.num_threads == 0) {
+			fprintf(stderr,
+				"must run at least one thread\n");
+			return EINVAL;
+		}
+		break;
+	case 'b':
+		options.req_size = strtoull(
+			optarg, nullptr, 10);
+		break;
+	case 'q':
+		options.rps = strtoull(
+			optarg, nullptr, 10);
+		break;
+	case 'Q':
+		options.queue_depth = strtoull(
+			optarg, nullptr, 10);
+		break;
+	case 'P':
+		snprintf(options.pattern_spec, sizeof(options.pattern_spec), "%s", optarg);
+		break;
+	case 'I':
+		snprintf(options.ia_spec, sizeof(options.ia_spec), "%s", optarg);
+		break;
+	case 't':
+		options.time = strtoull(
+			optarg, nullptr, 10);
+		break;
+	case 'w':
+		options.warmup = strtoull(
+			optarg, nullptr, 10);
+		break;
+	case 'o':
+		snprintf(options.output_file, sizeof(options.output_file), "%s", optarg);
+		break;
+	case 'h':
+	default:
+		return EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Instantiate the driver selected by driver_name ("bdev" or "nvme"),
+ * handing it context interpreted as a C string. Returns nullptr for any
+ * other name.
+ */
+static birb_driver * 
+birb_create_driver(const char * driver_name, void * context)
+{
+	const char * drv_arg = static_cast<const char *>(context);
+
+	if (strcmp(driver_name, "bdev") == 0) {
+		return new birb_bdev_driver(drv_arg);
+	}
+
+	if (strcmp(driver_name, "nvme") == 0) {
+		return new birb_nvme_driver(drv_arg);
+	}
+
+	return nullptr;
+}
+
+/*
+ * Create the per-thread context that matches the concrete type of driver.
+ * Returns nullptr when the driver type is neither bdev nor nvme.
+ */
+static birb_driver_thread_context * 
+birb_create_thread_context(birb_driver * driver)
+{
+	if (driver->get_type() == birb_driver::BIRB_DRV_BDEV) {
+		auto * bdev_drv = dynamic_cast<birb_bdev_driver *>(driver);
+		return new birb_bdev_thread_context(bdev_drv);
+	}
+
+	if (driver->get_type() == birb_driver::BIRB_DRV_NVME) {
+		auto * nvme_drv = dynamic_cast<birb_nvme_driver *>(driver);
+		return new birb_nvme_thread_context(nvme_drv);
+	}
+
+	return nullptr;
+}
+
+/* Release a driver obtained from birb_create_driver(); nullptr is accepted. */
+static void
+birb_destroy_driver(birb_driver * drv)
+{
+	delete drv;
+}
+
+/* Release a context obtained from birb_create_thread_context(); nullptr is accepted. */
+static void
+birb_destroy_thread_context(birb_driver_thread_context * ctx)
+{
+	delete ctx;
+}
+
+/*
+ * IO completion callback, run on the worker thread that issued the request.
+ *
+ * success - whether the IO completed without error.
+ * cb_arg  - the io_request that was issued.
+ *
+ * A successful IO gets an io_record appended to the worker's list, and a
+ * read additionally has its data copied from the DMA buffer into the user
+ * buffer. A failed IO is only logged. In both cases the request object
+ * goes back onto the worker's free list.
+ */
+static void
+worker_io_complete(bool success, void *cb_arg)
+{
+	auto * state = static_cast<struct worker_thread_cb_vars *>(cb_vars);
+	auto * io = static_cast<struct io_request *>(cb_arg);
+
+	const uint64_t done_ts = get_cur_ts_nano();
+
+	if (success) {
+		state->ctx->io_records->push_back(new io_record { io->start_ts, done_ts });
+
+		if (io->op == IOGEN_READ) {
+			memcpy(io->user_buf, io->dma_buf, options.req_size);
+		}
+
+		ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d <worker_io_complete>: completed io request type %d\n", state->ctx->tid, io->op);
+	} else {
+		// XXX: print warning for errors for now
+		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "thread %d <worker_io_complete>: io request failed\n", state->ctx->tid);
+	}
+
+	state->free_ios->push_back(io);
+}
+
+
+/* Message handler on the main thread: the worker described by arg finished initialising. */
+static void
+cb_notify_main_init(void * arg)
+{
+	auto * sender = static_cast<struct thread_context *>(arg);
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "cb_notify_main_init: from thread %d to main.\n", sender->tid);
+
+	auto * main_vars = static_cast<struct main_thread_cb_vars *>(cb_vars);
+	main_vars->worker_thread_init_cnt += 1;
+}
+
+/* Message handler on the main thread: the worker described by arg has stopped. */
+static void
+cb_notify_main_stop(void * arg)
+{
+	auto * sender = static_cast<struct thread_context *>(arg);
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "cb_notify_main_stop: from thread %d to main.\n", sender->tid);
+
+	auto * main_vars = static_cast<struct main_thread_cb_vars *>(cb_vars);
+	main_vars->worker_thread_stop_cnt += 1;
+}
+
+/* Message handler on a worker thread: raise the worker's start flag. */
+static void
+cb_notify_worker_start(void * arg)
+{
+	auto * target = static_cast<struct thread_context *>(arg);
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "cb_notify_worker_start: from main to thread %d.\n", target->tid);
+
+	auto * worker_vars = static_cast<struct worker_thread_cb_vars *>(cb_vars);
+	worker_vars->worker_start = 1;
+}
+
+/* Message handler on a worker thread: raise the worker's stop flag. */
+static void
+cb_notify_worker_stop(void * arg)
+{
+	auto * target = static_cast<struct thread_context *>(arg);
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "cb_notify_worker_stop: from main to thread %d.\n", target->tid);
+
+	auto * worker_vars = static_cast<struct worker_thread_cb_vars *>(cb_vars);
+	worker_vars->worker_stop = 1;
+}
+
+/* Reset both main-thread counters to zero. */
+static void 
+main_thread_cb_vars_init(struct main_thread_cb_vars * vars)
+{
+	*vars = main_thread_cb_vars { 0, 0 };
+}
+
+/* Clear the start/stop flags and attach the worker's context and free-request list. */
+static void
+worker_thread_cb_vars_init(struct worker_thread_cb_vars * vars, struct thread_context * ctx, 
+	std::list<struct io_request *> * free_ios)
+{
+	*vars = worker_thread_cb_vars { 0, 0, ctx, free_ios };
+}
+
+static void * 
+worker_thread_main(void * arg)
+{
+	int rc = 0;
+
+	constexpr static unsigned int SPDK_THREAD_NAME_SZ = 16;
+
+	struct worker_thread_cb_vars vars;
+	auto *ctx = (struct thread_context *)arg;
+	birb_driver_thread_context * driver_thread_ctx;
+	std::list<struct io_request *> free_ios;
+	char spdk_thread_name[SPDK_THREAD_NAME_SZ];
+	struct spdk_cpuset * cpuset;
+
+	Generator * ia_gen = nullptr;
+	io_generator * io_gen = nullptr;
+
+	struct io_generator_ctx io_ctx;
+	uint64_t next_ts;
+	uint64_t a_offset;
+	uint64_t last_loop_ts = 0;
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: init...\n", ctx->tid);
+
+	ctx->overhead_avg = 0;
+	ctx->overhead_cnt = 0;
+	ctx->overhead_max = 0;
+	ctx->overhead_min = UINT64_MAX;
+
+	// create spdk thread
+	cpuset = spdk_cpuset_alloc();
+	if (cpuset == nullptr) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to alloc cpuset\n");
+		rc = ENOMEM;
+		goto cleanup;
+	}
+	spdk_cpuset_zero(cpuset);
+	spdk_cpuset_set_cpu(cpuset, ctx->coreid, true);
+	snprintf(spdk_thread_name, SPDK_THREAD_NAME_SZ, "birb_worker_%u", ctx->tid);
+	ctx->sp_thread = spdk_thread_create(spdk_thread_name, cpuset);
+	if (ctx->sp_thread == nullptr) {
+		rc = ENOMEM;
+		goto cleanup;
+	}
+	spdk_set_thread(ctx->sp_thread);
+
+	// create thread context
+	driver_thread_ctx = birb_create_thread_context(ctx->driver);
+	if (driver_thread_ctx == nullptr || driver_thread_ctx->get_status() != birb_driver::BIRB_SUCCESS) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not create thread context!\n", ctx->tid);
+		rc = EINVAL;
+		goto cleanup;
+	}
+
+	// create io request objects
+	for (unsigned int i = 0; i < options.queue_depth; i++) {
+		auto dma_buf = (char *)spdk_dma_zmalloc_socket(options.req_size, ctx->driver->get_align(), NULL, ctx->sockid);
+		auto user_buf = (char *)nm_malloc(ctx->sockid, options.req_size);
+
+		if (dma_buf == nullptr || user_buf == nullptr) {
+			ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate buffers!\n", ctx->tid);
+			rc = ENOMEM;
+			goto cleanup;
+		}
+
+		auto io_req = new struct io_request;
+		io_req->dma_buf = dma_buf;
+		io_req->user_buf = user_buf;
+
+		free_ios.push_back(io_req);
+	}
+
+	// init thread local states
+	worker_thread_cb_vars_init(&vars, ctx, &free_ios);
+	cb_vars = &vars;
+	
+	ia_gen = createGenerator(options.ia_spec);
+	if (ia_gen == nullptr) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate ia generator!\n", ctx->tid);
+		rc = EINVAL;
+		goto cleanup;
+	}
+	ia_gen->set_lambda((double)options.rps / (double)(options.num_threads));
+
+	io_gen = new io_generator(options.req_size, ctx->start_region_length, options.read_pct, options.addr_mode);
+	if (io_gen == nullptr) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate ia generator!\n", ctx->tid);
+		rc = EINVAL;
+		goto cleanup;
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: init complete.\n", ctx->tid);
+
+	if ((rc = spdk_thread_send_msg(ctx->main_thread, cb_notify_main_init, ctx)) != 0) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not send message %d\n", ctx->tid, rc);
+		goto cleanup;
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: waiting for start...\n", ctx->tid);
+
+	while (vars.worker_start != 1) {
+		spdk_thread_poll(spdk_get_thread(), 0, 0);
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: started...\n", ctx->tid);
+	
+	/* random delay 0-100 us */
+	usleep(nm_get_uptime_ns() % 100);
+
+	next_ts = get_cur_ts_nano();
+	
+	while (true) {
+		uint64_t cur_loop_ts = get_cur_ts_nano();
+		if (last_loop_ts > 0) {
+			uint64_t overhead = cur_loop_ts - last_loop_ts;
+			if (ctx->overhead_max < overhead) {
+				ctx->overhead_max = overhead;
+			}
+
+			if (ctx->overhead_min > overhead) {
+				ctx->overhead_min = overhead;
+			}
+
+			ctx->overhead_avg = ctx->overhead_avg * ctx->overhead_cnt + overhead;
+			ctx->overhead_cnt++;
+			ctx->overhead_avg /= ctx->overhead_cnt;
+		}
+		last_loop_ts = cur_loop_ts;
+
+		spdk_thread_poll(spdk_get_thread(), 0, 0);
+		driver_thread_ctx->poll();
+
+		if (vars.worker_stop != 0) {
+			if (free_ios.size() >= options.queue_depth) {
+				break;
+			}
+		} else {
+			if (!free_ios.empty()) {
+				auto io_req = free_ios.front();
+
+				uint64_t cur_ts = get_cur_ts_nano();
+
+				if (cur_ts >= next_ts) {
+					io_gen->issue(&io_ctx, io_req->dma_buf);
+
+					a_offset = io_ctx.offset + ctx->start_region_offset;
+
+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: issuing IO type %d at offset 0x%lx size 0x%lx...\n", ctx->tid, io_ctx.op, a_offset, io_ctx.size);
+
+					io_req->start_ts = cur_ts;
+					io_req->op = io_ctx.op;
+
+					if(io_ctx.op == IOGEN_READ) {
+						rc = driver_thread_ctx->read(a_offset, io_ctx.size, io_req->dma_buf, worker_io_complete, io_req);
+					} else {
+						rc = driver_thread_ctx->write(a_offset, io_ctx.size, io_req->dma_buf, worker_io_complete, io_req);
+					}
+
+					if (rc != 0) {
+						ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "thread %d: failed to issue io %d, retrying...", ctx->tid, rc);
+					} else {
+						free_ios.pop_front();
+						next_ts = next_ts + ia_gen->generate() * S2NS;
+					}
+				}
+			}
+		}
+	}
+
+cleanup:
+	while (!free_ios.empty()) {
+		auto req = free_ios.front();
+		free_ios.pop_front();
+		spdk_dma_free(req->dma_buf);
+		nm_free(ctx->sockid, req->user_buf);
+	}
+
+	if (ia_gen != nullptr) {
+		delete ia_gen;
+	}
+
+	if (io_gen != nullptr) {
+		delete io_gen;
+	}
+
+	if (cpuset != nullptr) {
+		spdk_cpuset_free(cpuset);
+	}
+
+	if (driver_thread_ctx != nullptr) {
+		birb_destroy_thread_context(driver_thread_ctx);
+	}
+
+	if (rc == 0) {
+		if ((rc = spdk_thread_send_msg(ctx->main_thread, cb_notify_main_stop, ctx)) != 0) {
+			ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not send message %d\n", ctx->tid, rc);
+		}
+	}
+	
+	spdk_thread_exit(ctx->sp_thread);
+	
+	while (!spdk_thread_is_exited(ctx->sp_thread)) {
+		spdk_thread_poll(ctx->sp_thread, 0, 0);
+	};
+
+	if (ctx->sp_thread != nullptr) {
+		spdk_set_thread(nullptr);
+		spdk_thread_destroy(ctx->sp_thread);
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: stopped...\n", ctx->tid);
+
+	if (rc != 0) {
+		spdk_app_stop(rc);
+	}
+
+	return nullptr;
+}
+
+
+/*
+ * Parse an IO pattern spec of the form "<mode>,<read_pct>" in place.
+ *
+ * pattern:   mutable NUL-terminated spec; strtok() overwrites the ','
+ *            separators, so the caller's buffer is modified.
+ * read_pct:  receives the second field parsed as a base-10 integer. Left
+ *            untouched when the spec has no second field.
+ * addr_mode: set to IOGEN_ADDR_MONOTONIC_INCREASING when the first field is
+ *            exactly "M", otherwise to IOGEN_ADDR_UNIFORM_RANDOM.
+ */
+static void
+parse_pattern(char * pattern, unsigned int * read_pct, io_generator_address_mode * addr_mode)
+{
+	char * token = strtok(pattern, ",");
+
+	/* strtok() returns NULL for an empty spec; never hand that to strcmp(). */
+	if (token != nullptr && strcmp(token, "M") == 0) {
+		*addr_mode = IOGEN_ADDR_MONOTONIC_INCREASING;
+	} else {
+		*addr_mode = IOGEN_ADDR_UNIFORM_RANDOM;
+	}
+
+	/* Only look for the second field when the first one existed. */
+	if (token != nullptr) {
+		token = strtok(nullptr, ",");
+	}
+
+	if (token == nullptr) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "main: pattern spec has no read percentage, keeping %u\n", *read_pct);
+		return;
+	}
+
+	*read_pct = strtoull(token, nullptr, 10);
+}
+
+/*
+ * SPDK start callback (handed to spdk_app_start() by main()).
+ *
+ * Creates the device driver, spawns one pinned worker pthread per CPU in
+ * options.cpumask, waits for every worker to report init, sends the start
+ * message, sleeps for options.time ticks, sends the stop message, waits for
+ * every worker to report stop, then writes one latency (end_ts - start_ts)
+ * per line to the output file for requests issued after the warmup cutoff.
+ *
+ * The function terminates the process with exit(); see the note at the end.
+ */
+static void
+birb_main(void * arg1 UNUSED)
+{
+	int rc = 0;
+	std::list<struct thread_context *> worker_threads;
+	std::ofstream output_file;
+	struct main_thread_cb_vars vars;
+	birb_driver * drv = nullptr;
+
+	unsigned long record_cutoff_time = 0;
+	unsigned long current_s = 0;
+	/* Declared up here so that no `goto end` jumps over an initialisation. */
+	unsigned long measured_s = 0;
+	unsigned long bytes_per_s = 0;
+	unsigned int total_reqs = 0;
+	unsigned int tid = 0;
+	unsigned long per_thread_cap = 0;
+	int cur_core;
+
+	/* initialize driver */
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: initializing device driver for device %s\n", options.dev_name);
+	drv = birb_create_driver(options.driver_name, options.dev_name);
+	if (drv == nullptr || drv->get_status() != birb_driver::BIRB_SUCCESS) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to create device driver.\n");
+		rc = EINVAL;
+		goto end;
+	}
+	/* Each worker gets an equal, disjoint slice of the device. */
+	per_thread_cap = drv->get_capacity() / options.num_threads;
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: initialized device with capacity %zu bytes ~= %zu MB\n", drv->get_capacity(), drv->get_capacity() / 1024 / 1024);
+
+	/* misc init */
+	main_thread_cb_vars_init(&vars);
+	cb_vars = &vars;
+
+	parse_pattern(options.pattern_spec, &options.read_pct, &options.addr_mode);
+	dump_options();
+
+	output_file.open(options.output_file, std::ofstream::out);
+	if (!output_file) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to open output file %s\n", options.output_file);
+		rc = EINVAL;
+		goto end;
+	}
+
+	cur_core = cmask_get_next_cpu(&options.cpumask);
+	while(cur_core != NEXT_CPU_NULL) {
+		/* Plain `new` throws on failure, so there is no NULL result to check. */
+		auto * ctx = new struct thread_context;
+		memset(ctx, 0, sizeof(struct thread_context));
+
+		ctx->tid = tid++;
+		ctx->driver = drv;
+		ctx->main_thread = spdk_get_thread();
+		ctx->sockid = rte_lcore_to_socket_id(cur_core);
+		ctx->coreid = cur_core;
+		ctx->io_records = new std::list<struct io_record *>();
+		ctx->start_region_length = per_thread_cap;
+		ctx->start_region_offset = per_thread_cap * ctx->tid;
+		/*
+		 * The worker only lowers overhead_min when it sees a smaller sample;
+		 * starting from the memset() value of 0 would pin the minimum at 0.
+		 */
+		ctx->overhead_min = (uint64_t)-1;
+
+		// create sys thread
+		pthread_attr_t attr;
+		cpuset_t scpuset;
+		CPU_ZERO(&scpuset);
+		CPU_SET(cur_core, &scpuset);
+		pthread_attr_init(&attr);
+		pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &scpuset);
+		rc = pthread_create(&ctx->sys_thread, &attr, worker_thread_main, ctx);
+		/* The attribute object is no longer needed once the thread exists. */
+		pthread_attr_destroy(&attr);
+		if (rc != 0) {
+			ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to create sys thread: %d\n", rc);
+			/* ctx was never published in worker_threads; release it here. */
+			delete ctx->io_records;
+			delete ctx;
+			rc = EINVAL;
+			goto end;
+		}
+		worker_threads.push_back(ctx);
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: created worker thread %d on core %d socket %d offset 0x%lx length %ld\n",
+			ctx->tid, cur_core, ctx->sockid, ctx->start_region_offset, ctx->start_region_length);
+
+		cur_core = cmask_get_next_cpu(&options.cpumask);
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: waiting for worker thread init...\n");
+	while(vars.worker_thread_init_cnt < (uint32_t)options.num_threads) {
+		spdk_thread_poll(spdk_get_thread(), 0, 0);
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: starting worker threads...\n");
+	for (struct thread_context * tctx : worker_threads) {
+		rc = spdk_thread_send_msg(tctx->sp_thread, cb_notify_worker_start, tctx);
+
+		if (rc != 0) {
+			ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to send message %d\n", rc);
+			goto end;
+		}
+	}
+
+	/* main event loop: one tick per usleep(1 * S2US) */
+	while(current_s < options.time) {
+		if (current_s >= options.warmup && record_cutoff_time == 0) {
+			/* Requests issued before this timestamp are treated as warmup. */
+			record_cutoff_time = get_cur_ts_nano();
+		}
+		usleep(1 * S2US);
+		current_s++;
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: stopping worker threads...\n");
+	for (struct thread_context * tctx : worker_threads) {
+		rc = spdk_thread_send_msg(tctx->sp_thread, cb_notify_worker_stop, tctx);
+
+		if (rc != 0) {
+			ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to send message %d\n", rc);
+			goto end;
+		}
+	}
+
+	while(vars.worker_thread_stop_cnt < (uint32_t)options.num_threads) {
+		spdk_thread_poll(spdk_get_thread(), 0, 0);
+	}
+
+	// keep stats
+	for (struct thread_context * tctx : worker_threads) {
+		uint64_t last_ts = 0;
+		uint64_t processed = 0;
+		for (struct io_record * r : *tctx->io_records) {
+			if (r->start_ts >= record_cutoff_time) {
+				if (r->end_ts > last_ts) {
+					last_ts = r->end_ts;
+				}
+
+				processed++;
+				output_file << r->end_ts - r->start_ts << std::endl;
+				total_reqs++;
+			}
+		}
+
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: thread %d processed requests: %lu, last request %lu. Overhead - avg %lu min %lu max %lu\n",
+			tctx->tid, processed, last_ts, tctx->overhead_avg, tctx->overhead_min, tctx->overhead_max);
+	}
+
+	/*
+	 * Guard the throughput division: with warmup >= time the unsigned
+	 * subtraction would yield zero or wrap around.
+	 */
+	if (options.time > options.warmup) {
+		measured_s = options.time - options.warmup;
+	}
+	if (measured_s > 0) {
+		bytes_per_s = total_reqs * options.req_size / measured_s;
+	}
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: total requests: %u, bytes per second: %lu\n",
+		total_reqs, bytes_per_s);
+
+end:
+	if (drv != nullptr) {
+		birb_destroy_driver(drv);
+	}
+
+	output_file.close();
+
+	for (struct thread_context * tctx : worker_threads) {
+		for (struct io_record * r : *tctx->io_records) {
+			delete r;
+		}
+		delete tctx->io_records;
+		delete tctx;
+	}
+
+	/*
+	 * NOTE(review): the process is terminated here, so the spdk_app_stop()
+	 * call below is never reached. That looks like a deliberate shortcut
+	 * around SPDK shutdown - confirm. The exit status now carries rc so that
+	 * failures are visible to the caller instead of always reporting 0.
+	 */
+	exit(rc);
+	spdk_app_stop(rc);
+	return;
+}
+
+/*
+ * Process entry point for the SPDK build: sets up logging, lets SPDK parse
+ * the command line (our own flags go through parse_arg), then runs the
+ * application with birb_main as the start callback.
+ *
+ * Returns the value produced by spdk_app_start().
+ */
+int
+main(int argc, char **argv)
+{
+	struct spdk_app_opts opts = {};
+	int rc;
+
+	ntr_init();
+	ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_INFO);
+
+	/* Fill opts with SPDK's defaults before overriding the name. */
+	spdk_app_opts_init(&opts, sizeof(opts));
+	opts.name = "birb";
+
+	/* Built-in SPDK parameters and our custom ones are parsed together. */
+	rc = spdk_app_parse_args(argc, argv, &opts, "VD:k:a:b:q:Q:P:I:t:w:o:", NULL, parse_arg, usage);
+	if (rc != SPDK_APP_PARSE_ARGS_SUCCESS) {
+		exit(rc);
+	}
+
+	nm_init(options.verbosity);
+
+	/*
+	 * spdk_app_start() initializes the SPDK framework, calls birb_main(),
+	 * and blocks until spdk_app_stop() is called. If initialization fails
+	 * it returns an error without ever calling birb_main().
+	 */
+	rc = spdk_app_start(&opts, birb_main, NULL);
+	if (rc != 0) {
+		SPDK_ERRLOG("ERROR starting application\n");
+	}
+
+	/* Either way, shut the SPDK subsystems down before leaving. */
+	spdk_app_fini();
+	return rc;
+}
--- a/storage/birb_posix.cc
+++ b/storage/birb_posix.cc
@ -0,0 +1,585 @@
+#include <sys/endian.h>
+#include <sys/errno.h>
+#include <sys/signal.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <threads.h>
+#include <unistd.h>
+#include <aio.h>
+#include <getopt.h>
+#include <sys/ioctl.h>
+#include <sys/disk.h>
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <chrono>
+#include <list>
+#include <set>
+
+#include "gen.hh"
+#include "ntr.h"
+#include "defs.hh"
+#include "nm.hh"
+#include "storage/io_gen.hh"
+
+/* Current std::chrono::high_resolution_clock reading, in nanoseconds since that clock's epoch. */
+static inline uint64_t get_cur_ts_nano()
+{
+    using namespace std::chrono;
+
+    const auto since_epoch = high_resolution_clock::now().time_since_epoch();
+    return duration_cast<nanoseconds>(since_epoch).count();
+}
+
+/*
+ * Benchmark configuration: compile-time limits for the string-valued options,
+ * followed by the options structure and its default values.
+ */
+/* Buffer sizes (in bytes, including the terminating NUL) for string options. */
+static constexpr unsigned long MAX_SPEC_LEN = 32;
+static constexpr unsigned long MAX_DEV_NAME_LEN = 32;
+static constexpr unsigned long MAX_OUTPUT_FILE_LEN = 256;
+/* Run-time options; main() overwrites the defaults below from the command line. */
+struct options_t {
+	// args
+	/* Passed to nm_init(). */
+	int verbosity = NTR_LEVEL_DEFAULT;
+	/* Number of CPUs set in cpumask (recomputed by -a). */
+	int num_threads = 1;
+	/* -a: bit mask of cores; one worker thread is created per set bit. */
+	unsigned long cpumask = 1;
+	/* -P: "<mode>,<read_pct>", split in place by parse_pattern(). */
+	char pattern_spec[MAX_SPEC_LEN] = "R,100";
+	/* -I: inter-arrival distribution spec handed to createGenerator(). */
+	char ia_spec[MAX_SPEC_LEN] = "fixed";
+	
+	/* -t: number of main-loop ticks to run; each tick sleeps 1 * S2US. */
+	unsigned int time = 5;
+	/* -w: ticks to skip before latency samples start being reported. */
+	unsigned int warmup = 2;
+	/* -Q: number of io_request objects each worker allocates. */
+	unsigned int queue_depth = 1;
+	/* -D: device name; the POSIX build passes it to open(). */
+	char dev_name[MAX_DEV_NAME_LEN] = "Malloc0";
+	/* -k: driver name; in this file it is only printed by dump_options(). */
+	char driver_name[MAX_DEV_NAME_LEN] = "bdev";
+	/* Filled in by parse_pattern() from pattern_spec. */
+	unsigned int read_pct = 0;
+	io_generator_address_mode addr_mode = IOGEN_ADDR_UNIFORM_RANDOM;
+
+	/* -o: file that receives one latency value per line. */
+	char output_file[MAX_OUTPUT_FILE_LEN] = "output.txt";
+
+	/* -b: IO request size in bytes. */
+	unsigned long req_size = 4096;
+	/* -q: total IO requests per second; divided by num_threads per worker. */
+	unsigned long rps = 0;
+};
+
+
+/* Incremented once by each worker when its initialisation has completed. */
+std::atomic<int> worker_thread_init_cnt(0);
+/* Incremented once by each worker at the end of its cleanup path. */
+std::atomic<int> worker_thread_stop_cnt(0);
+/* Stored as 1 by the main thread to release workers from their start spin. */
+std::atomic<int> worker_start(0);
+/* Stored as 1 by the main thread to ask workers to drain and exit. */
+std::atomic<int> worker_stop(0);
+/* Process-wide options, populated by main() from the command line. */
+static struct options_t options;
+
+/* One completed IO: timestamps (from get_cur_ts_nano()) used to derive latency. */
+struct io_record {
+	/* Copied from io_request::start_ts, i.e. the time the IO was issued. */
+	uint64_t start_ts;
+	/* Timestamp of the worker loop iteration that observed the completion. */
+	uint64_t end_ts;
+};
+
+/* A reusable IO slot; each worker allocates options.queue_depth of these. */
+struct io_request {
+	/* Time the current IO was issued. */
+	uint64_t start_ts;
+	/* Operation chosen by the io_generator for the current IO. */
+	io_generator_opcode op;
+	/* Destination of the memcpy() performed when a read completes. */
+	char * user_buf;
+	/* Buffer handed to AIO (also stored in aio.aio_buf). */
+	char * dma_buf;
+	/* POSIX AIO control block submitted via aio_read()/aio_write(). */
+	struct aiocb aio;
+};
+
+/* Per-worker state, allocated and zero-filled by birb_main(). */
+struct thread_context {
+	/* Sequential worker index, starting at 0. */
+	unsigned int tid;
+	/* Core the worker's pthread affinity is set to. */
+	unsigned int coreid;
+	/* Value of nm_get_node_from_core(coreid); passed to nm_malloc()/nm_free(). */
+	unsigned int sockid;
+	pthread_t sys_thread;
+	/* Descriptor of the opened device, shared by all workers. */
+	int disk_fd;
+
+	/* This worker's slice of the device: tid * length, and length in bytes. */
+	unsigned long start_region_offset;
+	unsigned long start_region_length;
+
+	/* modified by worker threads */
+	/* One entry per successfully completed IO; freed by birb_main(). */
+	std::list<io_record *> *io_records;
+	/* Statistics over the time between consecutive worker loop iterations (ns). */
+	uint64_t overhead_avg;
+	uint32_t overhead_cnt;
+	uint64_t overhead_max;
+	uint64_t overhead_min;
+};
+
+/*
+ * Log the effective options at INFO level.
+ *
+ * Conversion specifiers match the argument types: unsigned int fields use
+ * %u and the addressing-mode enum is cast to int for %d.
+ */
+static void dump_options()
+{
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: Options:\n"
+					"    dev name: %s\n"
+					"    driver name: %s\n"
+					"    worker threads: 0x%lx\n"
+					"    number of threads: %d\n"
+					"    IO request size: %lu\n"
+					"    IO requests per second: %lu\n"
+					"    IO pattern: %s\n"
+					"    IO queue depth: %u\n"
+					"    IO addressing mode: %d\n"
+					"    read percent: %u\n"
+					"    inter-arrival dist: %s\n"
+					"    run time: %u\n"
+					"    warmup time: %u\n"
+					"    output file: %s\n",
+					options.dev_name,
+					options.driver_name,
+					options.cpumask,
+					options.num_threads,
+					options.req_size,
+					options.rps,
+					options.pattern_spec,
+					options.queue_depth,
+					(int)options.addr_mode,
+					options.read_pct,
+					options.ia_spec,
+					options.time,
+					options.warmup,
+					options.output_file
+	);
+}
+
+/* Write the command-line help text to stdout. */
+static void usage()
+{
+	static const char help_text[] =
+		" -V(VV): verbose mode\n"
+		" -D: dev name\n"
+		" -k: driver to use (default bdev)\n"
+		" -a: worker threads spec (0x3 = spawn 2 threads on core 1 & 2)\n"
+		" -b: IO request size\n"
+		" -q: IO requests per second\n"
+		" -P: IO request pattern\n"
+		" -Q: IO request queue depth\n"
+		" -I: inter-arrival time distribution\n"
+		" -t: total run time\n"
+		" -w: warm up time\n"
+		" -o: latency response output file\n";
+
+	fputs(help_text, stdout);
+}
+
+/*
+ * Worker pthread body for the POSIX AIO backend.
+ *
+ * arg is the worker's struct thread_context. The thread allocates
+ * options.queue_depth request slots, builds its inter-arrival and IO
+ * generators, bumps worker_thread_init_cnt, spins until worker_start is set,
+ * and then loops: reap finished AIOs into ctx->io_records, and issue a new
+ * IO whenever a slot is free and the next scheduled issue time has passed.
+ * Once worker_stop is set it keeps reaping until every slot is back on the
+ * free list, then cleans up and bumps worker_thread_stop_cnt.
+ *
+ * Always returns nullptr.
+ */
+static void *
+worker_thread_main(void * arg)
+{
+	int rc = 0;
+
+	auto *ctx = (struct thread_context *)arg;
+	std::list<struct io_request *> free_ios;
+	std::list<struct io_request *> prog_ios;
+
+	Generator * ia_gen = nullptr;
+	io_generator * io_gen = nullptr;
+
+	struct io_generator_ctx io_ctx;
+	uint64_t next_ts;
+	uint64_t a_offset;
+	uint64_t last_loop_ts = 0;
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: init...\n", ctx->tid);
+
+	// create io request objects
+	for (unsigned int i = 0; i < options.queue_depth; i++) {
+		auto buf = (char *)nm_malloc(ctx->sockid, options.req_size);
+		auto user_buf = (char *)nm_malloc(ctx->sockid, options.req_size);
+
+		if (buf == nullptr || user_buf == nullptr) {
+			ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate buffers!\n", ctx->tid);
+			/* Release whichever of the two allocations did succeed. */
+			if (buf != nullptr) {
+				nm_free(ctx->sockid, buf);
+			}
+			if (user_buf != nullptr) {
+				nm_free(ctx->sockid, user_buf);
+			}
+			rc = ENOMEM;
+			goto cleanup;
+		}
+
+		/*
+		 * Value-initialise the slot so the embedded aiocb starts out zeroed;
+		 * only a handful of its fields are assigned explicitly below.
+		 */
+		auto io_req = new struct io_request();
+		io_req->dma_buf = buf;
+		io_req->user_buf = user_buf;
+		io_req->aio.aio_fildes = ctx->disk_fd;
+		io_req->aio.aio_nbytes = options.req_size;
+		io_req->aio.aio_buf = buf;
+		io_req->aio.aio_sigevent.sigev_notify = SIGEV_NONE;
+		io_req->aio.aio_reqprio = 0;
+
+		free_ios.push_back(io_req);
+	}
+
+	// init thread local states
+	ia_gen = createGenerator(options.ia_spec);
+	if (ia_gen == nullptr) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate ia generator!\n", ctx->tid);
+		rc = EINVAL;
+		goto cleanup;
+	}
+	/* The requested rate is split evenly across the worker threads. */
+	ia_gen->set_lambda((double)options.rps / (double)(options.num_threads));
+
+	io_gen = new io_generator(options.req_size, ctx->start_region_length, options.read_pct, options.addr_mode);
+	if (io_gen == nullptr) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate ia generator!\n", ctx->tid);
+		rc = EINVAL;
+		goto cleanup;
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: init complete.\n", ctx->tid);
+
+	worker_thread_init_cnt.fetch_add(1);
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: waiting for start...\n", ctx->tid);
+
+	/* Busy-wait until the main thread releases all workers. */
+	while (worker_start.load() == 0) {}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: started...\n", ctx->tid);
+
+	/* random delay 0-100 us */
+	usleep(nm_get_uptime_ns() % 100);
+
+	next_ts = get_cur_ts_nano();
+
+	while (true) {
+		uint64_t cur_ts = get_cur_ts_nano();
+		if (last_loop_ts > 0) {
+			/* Time spent in the previous loop iteration. */
+			uint64_t overhead = cur_ts - last_loop_ts;
+			if (ctx->overhead_max < overhead) {
+				ctx->overhead_max = overhead;
+			}
+
+			/*
+			 * ctx starts zero-filled, so the first sample must seed the
+			 * minimum; otherwise it could never move away from 0.
+			 */
+			if (ctx->overhead_cnt == 0 || ctx->overhead_min > overhead) {
+				ctx->overhead_min = overhead;
+			}
+
+			/* Running mean, kept in integer arithmetic. */
+			ctx->overhead_avg = ctx->overhead_avg * ctx->overhead_cnt + overhead;
+			ctx->overhead_cnt++;
+			ctx->overhead_avg /= ctx->overhead_cnt;
+		}
+		last_loop_ts = cur_ts;
+
+		// process io completion
+		auto itr = prog_ios.begin();
+		while (itr != prog_ios.end()) {
+			int err;
+			struct io_request * ioreq = *itr;
+			if ((err = aio_error(&ioreq->aio)) != EINPROGRESS) {
+				if (err == 0) {
+					auto rec = new struct io_record;
+					rec->start_ts = ioreq->start_ts;
+					rec->end_ts = cur_ts;
+
+					ctx->io_records->push_back(rec);
+					if (ioreq->op == IOGEN_READ) {
+						memcpy(ioreq->user_buf, ioreq->dma_buf, options.req_size);
+					}
+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d <worker_io_complete>: completed io request type %d\n", ctx->tid, ioreq->op);
+
+				} else {
+					ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "thread %d: aio failed with %d...\n", ctx->tid, err);
+				}
+
+				/* aio_return() is called for every finished request, failed or not. */
+				if (aio_return(&ioreq->aio) == -1) {
+					ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: aio_return failed with %d...\n", ctx->tid, errno);
+					exit(errno);
+				}
+
+				/* cleanup: erase() hands back the next iterator */
+				itr = prog_ios.erase(itr);
+				free_ios.push_back(ioreq);
+			} else {
+				++itr;
+			}
+		}
+
+		if (worker_stop.load() == 1) {
+			/* Leave only once every slot has been reaped back. */
+			if (free_ios.size() >= options.queue_depth) {
+				break;
+			}
+		} else {
+			if (!free_ios.empty()) {
+				auto io_req = free_ios.front();
+
+				cur_ts = get_cur_ts_nano();
+
+				if (cur_ts >= next_ts) {
+					io_gen->issue(&io_ctx, io_req->dma_buf);
+
+					/* Shift the generated offset into this worker's region. */
+					a_offset = io_ctx.offset + ctx->start_region_offset;
+
+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: issuing IO type %d at offset 0x%lx size 0x%lx...\n", ctx->tid, io_ctx.op, a_offset, io_ctx.size);
+
+					io_req->start_ts = cur_ts;
+					io_req->op = io_ctx.op;
+					io_req->aio.aio_offset = a_offset;
+
+					if(io_ctx.op == IOGEN_READ) {
+						rc = aio_read(&io_req->aio);
+					} else {
+						rc = aio_write(&io_req->aio);
+					}
+
+					if (rc != 0) {
+						/* The slot stays on the free list; the next iteration retries. */
+						ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "thread %d: failed to issue io %d, retrying...\n", ctx->tid, errno);
+					} else {
+						free_ios.pop_front();
+						prog_ios.push_back(io_req);
+						next_ts = next_ts + ia_gen->generate() * S2NS;
+					}
+				}
+			}
+		}
+	}
+
+cleanup:
+	while (!free_ios.empty()) {
+		auto req = free_ios.front();
+		free_ios.pop_front();
+		nm_free(ctx->sockid, req->dma_buf);
+		nm_free(ctx->sockid, req->user_buf);
+		/* The slot itself came from `new` and must be released as well. */
+		delete req;
+	}
+
+	if (ia_gen != nullptr) {
+		delete ia_gen;
+	}
+
+	if (io_gen != nullptr) {
+		delete io_gen;
+	}
+
+	worker_thread_stop_cnt.fetch_add(1);
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: stopped...\n", ctx->tid);
+
+	return nullptr;
+}
+
+
+/*
+ * Parse an IO pattern spec of the form "<mode>,<read_pct>" in place.
+ *
+ * pattern:   mutable NUL-terminated spec; strtok() overwrites the ','
+ *            separators, so the caller's buffer is modified.
+ * read_pct:  receives the second field parsed as a base-10 integer. Left
+ *            untouched when the spec has no second field.
+ * addr_mode: set to IOGEN_ADDR_MONOTONIC_INCREASING when the first field is
+ *            exactly "M", otherwise to IOGEN_ADDR_UNIFORM_RANDOM.
+ */
+static void
+parse_pattern(char * pattern, unsigned int * read_pct, io_generator_address_mode * addr_mode)
+{
+	char * token = strtok(pattern, ",");
+
+	/* strtok() returns NULL for an empty spec; never hand that to strcmp(). */
+	if (token != nullptr && strcmp(token, "M") == 0) {
+		*addr_mode = IOGEN_ADDR_MONOTONIC_INCREASING;
+	} else {
+		*addr_mode = IOGEN_ADDR_UNIFORM_RANDOM;
+	}
+
+	/* Only look for the second field when the first one existed. */
+	if (token != nullptr) {
+		token = strtok(nullptr, ",");
+	}
+
+	if (token == nullptr) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "main: pattern spec has no read percentage, keeping %u\n", *read_pct);
+		return;
+	}
+
+	*read_pct = strtoull(token, nullptr, 10);
+}
+
+/*
+ * Benchmark driver for the POSIX AIO build.
+ *
+ * Opens options.dev_name, queries its size and sector size, spawns one
+ * pinned worker pthread per CPU in options.cpumask, releases them together,
+ * sleeps for options.time ticks, asks them to stop, and finally writes one
+ * latency (end_ts - start_ts) per line to the output file for requests
+ * issued after the warmup cutoff.
+ *
+ * Exits the process directly if the device cannot be opened or queried.
+ */
+static void
+birb_main()
+{
+	int rc = 0;
+	std::list<struct thread_context *> worker_threads;
+	std::ofstream output_file;
+
+	unsigned long record_cutoff_time = 0;
+	unsigned long current_s = 0;
+	/* Declared up here so that no `goto end` jumps over an initialisation. */
+	unsigned long measured_s = 0;
+	unsigned long bytes_per_s = 0;
+	unsigned int total_reqs = 0;
+	unsigned int tid = 0;
+	unsigned long per_thread_cap = 0;
+	int cur_core;
+	int disk_fd;
+	off_t disk_size;
+	u_int disk_sec_size;
+
+	/* initialize driver */
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: initializing device driver for device %s\n", options.dev_name);
+	disk_fd = open(options.dev_name, O_RDWR | O_DIRECT);
+	if (disk_fd == -1) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to open device - %d\n", errno);
+		exit(errno);
+	}
+
+	rc = ioctl(disk_fd, DIOCGMEDIASIZE, &disk_size);
+	if (rc == -1) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to obtain disk size - %d\n", errno);
+		exit(errno);
+	}
+
+	rc = ioctl(disk_fd, DIOCGSECTORSIZE, &disk_sec_size);
+	if (rc == -1) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to obtain disk sector size - %d\n", errno);
+		exit(errno);
+	}
+
+	/* Each worker gets an equal, disjoint slice of the device. */
+	per_thread_cap = disk_size / options.num_threads;
+	/* off_t is signed; cast so the arguments match the %zu conversions. */
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: initialized device with capacity %zu bytes ~= %zu MB, sector %u bytes\n", (size_t)disk_size, (size_t)(disk_size / 1024 / 1024), disk_sec_size);
+
+	parse_pattern(options.pattern_spec, &options.read_pct, &options.addr_mode);
+	dump_options();
+
+	output_file.open(options.output_file, std::ofstream::out);
+	if (!output_file) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to open output file %s\n", options.output_file);
+		rc = EINVAL;
+		goto end;
+	}
+
+	cur_core = cmask_get_next_cpu(&options.cpumask);
+	while(cur_core != NEXT_CPU_NULL) {
+		/* Plain `new` throws on failure, so there is no NULL result to check. */
+		auto * ctx = new struct thread_context;
+		memset(ctx, 0, sizeof(struct thread_context));
+
+		ctx->tid = tid++;
+
+		ctx->sockid = nm_get_node_from_core(cur_core);
+		ctx->coreid = cur_core;
+		ctx->io_records = new std::list<struct io_record *>();
+		ctx->start_region_length = per_thread_cap;
+		ctx->start_region_offset = per_thread_cap * ctx->tid;
+		ctx->disk_fd = disk_fd;
+
+		// create sys thread
+		pthread_attr_t attr;
+		cpuset_t scpuset;
+		CPU_ZERO(&scpuset);
+		CPU_SET(cur_core, &scpuset);
+		pthread_attr_init(&attr);
+		pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &scpuset);
+		rc = pthread_create(&ctx->sys_thread, &attr, worker_thread_main, ctx);
+		/* The attribute object is no longer needed once the thread exists. */
+		pthread_attr_destroy(&attr);
+		if (rc != 0) {
+			ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to create sys thread: %d\n", rc);
+			/* ctx was never published in worker_threads; release it here. */
+			delete ctx->io_records;
+			delete ctx;
+			rc = EINVAL;
+			goto end;
+		}
+		worker_threads.push_back(ctx);
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: created worker thread %d on core %d socket %d offset 0x%lx length %ld\n",
+			ctx->tid, cur_core, ctx->sockid, ctx->start_region_offset, ctx->start_region_length);
+
+		cur_core = cmask_get_next_cpu(&options.cpumask);
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: waiting for worker thread init...\n");
+	/*
+	 * A worker whose initialisation fails never bumps the init counter; it
+	 * goes straight to its cleanup path and bumps the stop counter instead.
+	 * Count both so that such a failure cannot leave this loop spinning.
+	 */
+	while(worker_thread_init_cnt.load() + worker_thread_stop_cnt.load() < options.num_threads) {
+	}
+
+	if (worker_thread_stop_cnt.load() != 0) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: %d worker thread(s) failed to initialize\n", worker_thread_stop_cnt.load());
+		/*
+		 * Raise the stop flag before the start flag so the healthy workers
+		 * leave their start spin, see the stop request and exit at once.
+		 */
+		worker_stop.store(1);
+		worker_start.store(1);
+		while(worker_thread_stop_cnt.load() < options.num_threads) {
+		}
+		rc = EINVAL;
+		goto end;
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: starting worker threads...\n");
+	worker_start.store(1);
+
+	/* main event loop: one tick per usleep(1 * S2US) */
+	while(current_s < options.time) {
+		if (current_s >= options.warmup && record_cutoff_time == 0) {
+			/* Requests issued before this timestamp are treated as warmup. */
+			record_cutoff_time = get_cur_ts_nano();
+		}
+		usleep(1 * S2US);
+		current_s++;
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: stopping worker threads...\n");
+	worker_stop.store(1);
+
+	while(worker_thread_stop_cnt.load() < options.num_threads) {
+	}
+
+	// keep stats
+	for (struct thread_context * tctx : worker_threads) {
+		uint64_t last_ts = 0;
+		uint64_t processed = 0;
+		for (struct io_record * r : *tctx->io_records) {
+			if (r->start_ts >= record_cutoff_time) {
+				if (r->end_ts > last_ts) {
+					last_ts = r->end_ts;
+				}
+
+				processed++;
+				output_file << r->end_ts - r->start_ts << std::endl;
+				total_reqs++;
+			}
+		}
+
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: thread %d processed requests: %lu, last request %lu. Overhead - avg %lu min %lu max %lu\n",
+			tctx->tid, processed, last_ts, tctx->overhead_avg, tctx->overhead_min, tctx->overhead_max);
+	}
+
+	/*
+	 * Guard the throughput division: with warmup >= time the unsigned
+	 * subtraction would yield zero or wrap around.
+	 */
+	if (options.time > options.warmup) {
+		measured_s = options.time - options.warmup;
+	}
+	if (measured_s > 0) {
+		bytes_per_s = total_reqs * options.req_size / measured_s;
+	}
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: total requests: %u, bytes per second: %lu\n",
+		total_reqs, bytes_per_s);
+
+end:
+	if (disk_fd != -1) {
+		close(disk_fd);
+	}
+
+	output_file.close();
+
+	for (struct thread_context * tctx : worker_threads) {
+		for (struct io_record * r : *tctx->io_records) {
+			delete r;
+		}
+		delete tctx->io_records;
+		delete tctx;
+	}
+
+	return;
+}
+
+/*
+ * Process entry point for the POSIX AIO build: parses the command line into
+ * the global options, initialises nm, and runs birb_main().
+ *
+ * Returns 0 after birb_main() returns, or EINVAL when -a selects no CPU.
+ * -h prints the usage text and exits with status 0; any unrecognised flag
+ * prints it and exits with EINVAL.
+ */
+int
+main(int argc, char **argv)
+{
+	ntr_init();
+	ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_INFO);
+
+	int c;
+	/* 'h' must be listed here, otherwise getopt() reports it as unknown. */
+	while (( c = getopt(argc, argv, "hVD:k:a:b:q:Q:P:I:t:w:o:")) != -1)
+	{
+		switch (c) {
+			case 'V':
+				/* Each -V raises the log level by one step. */
+				ntr_set_level(NTR_DEP_USER1,
+					ntr_get_level(NTR_DEP_USER1) + 1);
+				break;
+			case 'D':
+				/*
+				 * snprintf() always NUL-terminates; strncpy() does not
+				 * when the argument fills the whole buffer.
+				 */
+				snprintf(options.dev_name, MAX_DEV_NAME_LEN, "%s", optarg);
+				break;
+			case 'k':
+				snprintf(options.driver_name, MAX_DEV_NAME_LEN, "%s", optarg);
+				break;
+			case 'a':
+				options.cpumask = strtoull(optarg, nullptr, 16);
+				options.num_threads = cmask_get_num_cpus(
+					options.cpumask);
+
+				if (options.num_threads == 0) {
+					fprintf(stderr,
+						"must run at least one thread\n");
+					return EINVAL;
+				}
+				break;
+			case 'b':
+				options.req_size = strtoull(
+					optarg, nullptr, 10);
+				break;
+			case 'q':
+				options.rps = strtoull(
+					optarg, nullptr, 10);
+				break;
+			case 'Q':
+				options.queue_depth = strtoull(
+					optarg, nullptr, 10);
+				break;
+			case 'P':
+				snprintf(options.pattern_spec, MAX_SPEC_LEN, "%s", optarg);
+				break;
+			case 'I':
+				snprintf(options.ia_spec, MAX_SPEC_LEN, "%s", optarg);
+				break;
+			case 't':
+				options.time = strtoull(
+					optarg, nullptr, 10);
+				break;
+			case 'w':
+				options.warmup = strtoull(
+					optarg, nullptr, 10);
+				break;
+			case 'o':
+				snprintf(options.output_file, MAX_OUTPUT_FILE_LEN, "%s", optarg);
+				break;
+			case 'h':
+				usage();
+				exit(0);
+			default:
+				usage();
+				exit(EINVAL);
+		}
+	}
+
+	nm_init(options.verbosity);
+	birb_main();
+
+	return 0;
+}
--- a/storage/drivers/bdev.cc
+++ b/storage/drivers/bdev.cc
@ -0,0 +1,95 @@
+#include <sys/endian.h>
+#include "storage/drivers/bdev.hh"
+#include "ntr.h"
+#include "spdk/bdev.h"
+#include "spdk/thread.h"
+
+/* Capacity in bytes: number of blocks multiplied by the block size. */
+size_t
+birb_bdev_driver::get_capacity()
+{
+    const size_t capacity = this->block_num * this->block_sz;
+    return capacity;
+}
+
+/* Status recorded by the constructor (BIRB_SUCCESS once the bdev is open). */
+birb_driver::birb_driver_status
+birb_bdev_driver::get_status()
+{
+    return status;
+}
+
+/* bdev event callback: no event is handled, the type is only logged. */
+void
+birb_bdev_driver::bdev_event_cb(enum spdk_bdev_event_type type,
+		 struct spdk_bdev * bdev UNUSED,
+		 void * event_ctx UNUSED)
+{
+	ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
+		"bdev_event_cb: unsupported bdev event: type %d\n", type);
+}
+
+/* Log the name of every bdev reachable via spdk_bdev_first()/spdk_bdev_next(). */
+void
+birb_bdev_driver::print_all_bdev()
+{
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "birb_bdev_driver: all registered block devices: ");
+
+	for (struct spdk_bdev * dev = spdk_bdev_first(); dev != NULL; dev = spdk_bdev_next(dev)) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "%s, ", spdk_bdev_get_name(dev));
+	}
+}
+
+/*
+ * Open the bdev called dev_name and cache its block size and block count.
+ * status stays BIRB_FAIL when the open fails and becomes BIRB_SUCCESS
+ * otherwise.
+ */
+birb_bdev_driver::birb_bdev_driver(const char * dev_name)
+	: bdev_desc(nullptr)
+	, bdev(nullptr)
+	, block_sz(0)
+	, block_num(0)
+	, status(BIRB_FAIL)
+{
+	const int open_rc = spdk_bdev_open_ext(dev_name, true, birb_bdev_driver::bdev_event_cb, NULL, &this->bdev_desc);
+	if (open_rc != 0) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_bdev_driver: failed to open bdev: %d\n", open_rc);
+		return;
+	}
+
+	/* A bdev pointer is valid while the bdev is opened. */
+	bdev = spdk_bdev_desc_get_bdev(bdev_desc);
+	block_sz = spdk_bdev_get_block_size(bdev);
+	block_num = spdk_bdev_get_num_blocks(bdev);
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "birb_bdev_driver: bdev block size %zu bytes, blocks count %zu\n", block_sz, block_num);
+
+	status = BIRB_SUCCESS;
+}
+
+/* Close the bdev descriptor, but only if the constructor opened it. */
+birb_bdev_driver::~birb_bdev_driver()
+{
+	if (status != BIRB_SUCCESS) {
+		return;
+	}
+
+	spdk_bdev_close(bdev_desc);
+}
+
+/* Identifies this driver as the bdev implementation. */
+birb_driver::birb_driver_type
+birb_bdev_driver::get_type()
+{
+	const birb_driver::birb_driver_type type = BIRB_DRV_BDEV;
+	return type;
+}
+
+/* Buffer alignment reported by spdk_bdev_get_buf_align() for the opened bdev. */
+size_t
+birb_bdev_driver::get_align()
+{
+	const size_t align = spdk_bdev_get_buf_align(bdev);
+	return align;
+}
+
+
+/* The bdev obtained in the constructor; nullptr if the open failed. */
+struct spdk_bdev *
+birb_bdev_driver::get_bdev()
+{
+	return bdev;
+}
+
+
+/* The descriptor filled in by spdk_bdev_open_ext() in the constructor. */
+struct spdk_bdev_desc *
+birb_bdev_driver::get_bdev_desc()
+{
+	return bdev_desc;
+}
--- a/storage/drivers/bdev_thread.cc
+++ b/storage/drivers/bdev_thread.cc
@ -0,0 +1,72 @@
+#include <sys/endian.h>
+#include "storage/drivers/bdev.hh"
+#include "ntr.h"
+#include "spdk/bdev.h"
+#include "spdk/thread.h"
+
+/*
+ * Per-thread bdev context: obtains an I/O channel for the driver's bdev
+ * descriptor.
+ *
+ * status is BIRB_SUCCESS only when the channel was obtained; on failure it
+ * stays BIRB_FAIL so callers checking get_status() see the error.
+ */
+birb_bdev_thread_context::birb_bdev_thread_context(birb_bdev_driver * driver) : io_channel(nullptr),
+																				status(birb_driver::BIRB_FAIL),
+																				driver(driver)
+{
+	struct spdk_bdev_desc * desc = driver->get_bdev_desc();
+
+	// obtain io channel
+	this->io_channel = spdk_bdev_get_io_channel(desc);
+	if (this->io_channel == nullptr) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_bdev_thread_context: could not create bdev I/O channel!\n");
+		/* Do not fall through: without a channel the context is unusable. */
+		return;
+	}
+
+	this->status = birb_driver::BIRB_SUCCESS;
+}
+
+/* Status recorded by the constructor. */
+birb_driver::birb_driver_status
+birb_bdev_thread_context::get_status()
+{
+    return status;
+}
+
+/* Release the I/O channel if one was obtained. */
+birb_bdev_thread_context::~birb_bdev_thread_context()
+{
+	if (io_channel == nullptr) {
+		return;
+	}
+
+	spdk_put_io_channel(io_channel);
+}
+
+/*
+ * Callback function for io completion.
+ */
+
+/*
+ * Frees the bdev_io, forwards the result to the user's callback, and then
+ * deletes the cb_context allocated by read()/write().
+ */
+void
+birb_bdev_thread_context::io_callback(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+	auto user_cb = reinterpret_cast<struct cb_context *>(cb_arg);
+
+	spdk_bdev_free_io(bdev_io);
+
+	user_cb->cb(success, user_cb->ctx);
+	delete user_cb;
+}
+
+/*
+ * Submit an asynchronous read of `size` bytes at byte `offset` into `buffer`.
+ * `callback` is invoked with `context` from io_callback() on completion.
+ *
+ * Returns the result of spdk_bdev_read(). On a non-zero result no completion
+ * will be delivered, so the callback wrapper is released here instead of in
+ * io_callback().
+ */
+int
+birb_bdev_thread_context::read(size_t offset, size_t size, char * buffer, callback callback, void * context)
+{
+	auto ctx = new struct cb_context;
+	ctx->cb = callback;
+	ctx->ctx = context;
+
+	int rc = spdk_bdev_read(driver->get_bdev_desc(), this->io_channel, buffer, offset, size, io_callback, reinterpret_cast<void*>(ctx));
+	if (rc != 0) {
+		/* Submission failed: io_callback() will never run to free ctx. */
+		delete ctx;
+	}
+	return rc;
+}
+
+/*
+ * Submit an asynchronous write of `size` bytes from `buffer` at byte `offset`.
+ * `callback` is invoked with `context` from io_callback() on completion.
+ *
+ * Returns the result of spdk_bdev_write(). On a non-zero result no completion
+ * will be delivered, so the callback wrapper is released here instead of in
+ * io_callback().
+ */
+int
+birb_bdev_thread_context::write(size_t offset, size_t size, char * buffer, callback callback, void * context)
+{
+	auto ctx = new struct cb_context;
+	ctx->cb = callback;
+	ctx->ctx = context;
+
+	int rc = spdk_bdev_write(driver->get_bdev_desc(), this->io_channel, buffer, offset, size, io_callback, reinterpret_cast<void*>(ctx));
+	if (rc != 0) {
+		/* Submission failed: io_callback() will never run to free ctx. */
+		delete ctx;
+	}
+	return rc;
+}
+
+/* Nothing to do for this backend; the body is intentionally empty. */
+void
+birb_bdev_thread_context::poll()
+{
+}
--- a/storage/drivers/nvme.cc
+++ b/storage/drivers/nvme.cc
@ -0,0 +1,135 @@
+#include <sys/endian.h>
+#include "ntr.h"
+#include "spdk/nvme.h"
+#include "spdk/thread.h"
+#include "storage/drivers/nvme.hh"
+
+/* Namespace size as reported by spdk_nvme_ns_get_size(). */
+size_t
+birb_nvme_driver::get_capacity()
+{
+    const size_t capacity = spdk_nvme_ns_get_size(ns);
+    return capacity;
+}
+
+/* Status recorded by the constructor. */
+birb_driver::birb_driver_status
+birb_nvme_driver::get_status()
+{
+    return status;
+}
+
+/*
+ * Attach callback passed to spdk_nvme_probe(): records the controller and
+ * its first usable active namespace in the attach_context given as cb_ctx.
+ *
+ * The controller is always recorded. ctx->valid is set to 1 only when an
+ * active namespace was found, so the caller never receives an unset
+ * namespace pointer.
+ */
+void
+birb_nvme_driver::attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
+	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts UNUSED)
+{
+	struct spdk_nvme_ns * found = nullptr;
+	auto ctx = reinterpret_cast<struct attach_context *>(cb_ctx);
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "birb_nvme_driver: attached to nvme at %s\n", trid->traddr);
+
+	for (uint32_t nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); nsid != 0;
+	     nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) {
+		struct spdk_nvme_ns * ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
+		if (ns == nullptr || !spdk_nvme_ns_is_active(ns)) {
+			continue;
+		}
+
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "birb_nvme_driver: namespace id: %d size: %zu LBA size: %u\n", spdk_nvme_ns_get_id(ns), spdk_nvme_ns_get_size(ns), spdk_nvme_ns_get_sector_size(ns));
+		/* XXX: use the first namespace */
+		found = ns;
+		break;
+	}
+
+	/* Hand the controller back in every case so the owner can detach it. */
+	*ctx->ctrlr = ctrlr;
+
+	if (found == nullptr) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_nvme_driver: no active namespace on nvme at %s\n", trid->traddr);
+		return;
+	}
+
+	*ctx->ns = found;
+	ctx->valid = 1;
+}
+
+/*
+ * Probe callback passed to spdk_nvme_probe(): returns true only for the
+ * controller whose transport address equals the requested device name.
+ */
+bool
+birb_nvme_driver::probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
+	 struct spdk_nvme_ctrlr_opts *opts UNUSED)
+{
+	auto wanted = reinterpret_cast<struct attach_context *>(cb_ctx);
+
+	printf("birb_nvme_driver: found nvme at %s\n", trid->traddr);
+
+	return strcmp(trid->traddr, wanted->dev_name) == 0;
+}
+
+/*
+ * Probe the PCIe bus for the controller whose transport address equals
+ * dev_name and attach to it (see probe_cb/attach_cb).
+ *
+ * status becomes BIRB_SUCCESS only when a controller with a non-zoned
+ * active namespace was attached; otherwise it stays BIRB_FAIL.
+ */
+birb_nvme_driver::birb_nvme_driver(const char * dev_name) : status(BIRB_FAIL),
+															ctrlr(nullptr),
+															ns(nullptr),
+															opts()
+{
+	int rc;
+	/*
+	 * Zero the transport id first: only the transport and the subnqn are
+	 * filled in below, and the remaining fields must not hold stack garbage
+	 * when the structure is handed to spdk_nvme_probe().
+	 */
+	struct spdk_nvme_transport_id trid = {};
+	struct attach_context ctx;
+	ctx.ctrlr = &this->ctrlr;
+	ctx.ns = &this->ns;
+	ctx.dev_name = dev_name;
+	ctx.valid = 0;
+
+	spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE);
+	snprintf(trid.subnqn, sizeof(trid.subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);
+
+	rc = spdk_nvme_probe(&trid, reinterpret_cast<void *>(&ctx), probe_cb, attach_cb, nullptr);
+	if (rc != 0) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_nvme_driver: failed to probe nvme device: %d\n", rc);
+		goto end;
+	}
+
+	if (ctx.valid != 1) {
+		rc = EINVAL;
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_nvme_driver: could not find device: %s\n", dev_name);
+		goto end;
+	}
+
+	if (spdk_nvme_ns_get_csi(this->ns) == SPDK_NVME_CSI_ZNS) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_nvme_driver: zoned nvme namespace is unsupported\n");
+		spdk_nvme_detach(this->ctrlr);
+		/* Forget the controller so the destructor does not detach it again. */
+		this->ctrlr = nullptr;
+		this->ns = nullptr;
+		goto end;
+	} else {
+		spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &this->opts, sizeof(this->opts));
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "birb_nvme_driver: io queue depth: %d io queue requests: %d\n", opts.io_queue_size, opts.io_queue_requests);
+		this->status = BIRB_SUCCESS;
+	}
+
+end:
+	return;
+}
+
+/* Detach from the controller if one is attached. */
+birb_nvme_driver::~birb_nvme_driver()
+{
+	if (ctrlr == nullptr) {
+		return;
+	}
+
+	spdk_nvme_detach(ctrlr);
+}
+
+/* Identifies this driver as the NVMe implementation. */
+birb_driver::birb_driver_type
+birb_nvme_driver::get_type()
+{
+	const birb_driver::birb_driver_type type = BIRB_DRV_NVME;
+	return type;
+}
+
+/* Fixed buffer alignment of 0x1000 bytes. */
+size_t
+birb_nvme_driver::get_align()
+{
+	const size_t align = 0x1000;
+	return align;
+}
+
+/* Controller recorded by attach_cb(); nullptr if none was attached. */
+spdk_nvme_ctrlr *
+birb_nvme_driver::get_ctrlr()
+{
+	return ctrlr;
+}
+
+/* Namespace recorded by attach_cb(). */
+spdk_nvme_ns *
+birb_nvme_driver::get_ns()
+{
+	return ns;
+}
+
+/* Pointer to the qpair options stored in this driver object. */
+spdk_nvme_io_qpair_opts *
+birb_nvme_driver::get_io_qpair_opts()
+{
+	return &opts;
+}
--- a/storage/drivers/nvme_thread.cc
+++ b/storage/drivers/nvme_thread.cc
@ -0,0 +1,90 @@
+#include <sys/endian.h>
+
+#include "storage/drivers/nvme.hh"
+#include "ntr.h"
+#include "spdk/bdev.h"
+#include "spdk/nvme.h"
+#include "spdk/nvme_spec.h"
+#include "spdk/thread.h"
+
+/*
+ * Allocates an io qpair on the driver's controller using the driver's
+ * stored qpair options. status is BIRB_SUCCESS only if allocation worked.
+ */
+birb_nvme_thread_context::birb_nvme_thread_context(birb_nvme_driver * driver) : status(birb_driver::BIRB_FAIL),
+																				driver(driver),
+                                                                                qpair(nullptr)
+{
+	struct spdk_nvme_ctrlr * controller = driver->get_ctrlr();
+	struct spdk_nvme_qpair * new_qpair = spdk_nvme_ctrlr_alloc_io_qpair(controller, driver->get_io_qpair_opts(), sizeof(struct spdk_nvme_io_qpair_opts));
+
+	if (new_qpair == nullptr) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_nvme_thread_context: could not allocate qpairs.\n");
+		return;
+	}
+
+	this->qpair = new_qpair;
+	this->status = birb_driver::BIRB_SUCCESS;
+}
+
+/* Reports whether the constructor managed to allocate a qpair. */
+birb_driver::birb_driver_status
+birb_nvme_thread_context::get_status()
+{
+	return status;
+}
+
+/* Frees the io qpair if one was allocated. */
+birb_nvme_thread_context::~birb_nvme_thread_context()
+{
+	if (qpair == nullptr) {
+		return;
+	}
+
+	spdk_nvme_ctrlr_free_io_qpair(qpair);
+}
+
+/*
+ * I/O completion callback.
+ *
+ * arg is the cb_context allocated by read()/write(). The user's callback
+ * is invoked with the completion result and its own context, after which
+ * the cb_context is freed.
+ */
+void
+birb_nvme_thread_context::io_callback(void *arg, const struct spdk_nvme_cpl *completion)
+{
+	struct cb_context * req = reinterpret_cast<struct cb_context *>(arg);
+	const bool ok = !spdk_nvme_cpl_is_error(completion);
+
+	req->cb(ok, req->ctx);
+	delete req;
+}
+
+/*
+ * Converts a byte count into the number of logical blocks needed to hold
+ * it, rounding up. A size of 0 yields 0 blocks (the previous
+ * "(size - 1) / lba_size + 1" form wrapped around for size == 0).
+ */
+uint32_t
+birb_nvme_thread_context::size_to_lba(size_t size, int lba_size)
+{
+	const size_t full_blocks = size / lba_size;
+	const size_t partial_block = (size % lba_size != 0) ? 1 : 0;
+
+	return full_blocks + partial_block;
+}
+
+/* Converts a byte offset into a logical block index, rounding down. */
+uint64_t
+birb_nvme_thread_context::addr_to_lba(size_t addr, int lba_size)
+{
+	const size_t block_index = addr / lba_size;
+	return block_index;
+}
+
+/*
+ * Submits an asynchronous read of size bytes at byte offset into buffer.
+ *
+ * callback(success, context) is invoked from io_callback once the command
+ * completes (completions are reaped by poll()).
+ *
+ * Returns the value of spdk_nvme_ns_cmd_read(): 0 if the command was
+ * submitted, non-zero otherwise. On a non-zero return the callback is NOT
+ * invoked.
+ */
+int
+birb_nvme_thread_context::read(size_t offset, size_t size, char * buffer, callback callback, void * context)
+{
+	auto ctx = new struct cb_context;
+	ctx->cb = callback;
+	ctx->ctx = context;
+
+	struct spdk_nvme_ns * ns = this->driver->get_ns();
+	int lba_size = spdk_nvme_ns_get_sector_size(ns);
+	int rc = spdk_nvme_ns_cmd_read(ns, this->qpair, buffer, addr_to_lba(offset, lba_size), size_to_lba(size, lba_size), io_callback, reinterpret_cast<void*>(ctx), 0);
+	if (rc != 0) {
+		// The command was never queued, so io_callback will not run and
+		// nobody else would free the context.
+		delete ctx;
+	}
+
+	return rc;
+}
+
+/*
+ * Submits an asynchronous write of size bytes from buffer at byte offset.
+ *
+ * callback(success, context) is invoked from io_callback once the command
+ * completes (completions are reaped by poll()).
+ *
+ * Returns the value of spdk_nvme_ns_cmd_write(): 0 if the command was
+ * submitted, non-zero otherwise. On a non-zero return the callback is NOT
+ * invoked.
+ */
+int
+birb_nvme_thread_context::write(size_t offset, size_t size, char * buffer, callback callback, void * context)
+{
+	auto ctx = new struct cb_context;
+	ctx->cb = callback;
+	ctx->ctx = context;
+
+	struct spdk_nvme_ns * ns = this->driver->get_ns();
+	int lba_size = spdk_nvme_ns_get_sector_size(ns);
+	int rc = spdk_nvme_ns_cmd_write(ns, this->qpair, buffer, addr_to_lba(offset, lba_size), size_to_lba(size, lba_size), io_callback, reinterpret_cast<void*>(ctx), 0);
+	if (rc != 0) {
+		// The command was never queued, so io_callback will not run and
+		// nobody else would free the context.
+		delete ctx;
+	}
+
+	return rc;
+}
+
+/*
+ * Processes completions on this context's qpair. The second argument (0)
+ * is the max_completions parameter; the return value is ignored.
+ */
+void
+birb_nvme_thread_context::poll()
+{
+	(void)spdk_nvme_qpair_process_completions(qpair, 0);
+}
--- a/storage/io_gen.cc
+++ b/storage/io_gen.cc
@ -0,0 +1,57 @@
+#include <sys/endian.h>
+#include <random>
+
+#include "nm.hh"
+#include "storage/io_gen.hh"
+
+/*
+ * Builds a request generator.
+ *
+ * req_size  - size of every generated request (copied into ctx->size by issue()).
+ * capacity  - upper bound for offset + req_size; also the exclusive upper
+ *             bound of the random address distribution.
+ * read_pct  - compared against a draw from [0, 99] in issue(); draws below
+ *             it become reads, the rest writes.
+ * addr_mode - selects sequential or random offsets in issue().
+ *
+ * NOTE(review): addr_dist is built over [0, capacity - 1], so capacity is
+ * presumably expected to be non-zero - confirm against callers.
+ */
+io_generator::io_generator(
+                    unsigned long req_size,
+                    unsigned long capacity,
+                    unsigned int read_pct,
+                    io_generator_address_mode addr_mode) : cur_offset(0),
+                                                            capacity(capacity),
+                                                            req_size(req_size),
+                                                            read_pct(read_pct),
+                                                            addr_mode(addr_mode),
+                                                            rng(rd()),
+                                                            dist(std::uniform_int_distribution<int>(0, 99)),
+                                                            addr_rng(addr_rd()),
+                                                            addr_dist(std::uniform_int_distribution<uint64_t>(0, capacity - 1))
+{
+    // Both engines are re-seeded from the uptime clock, replacing the
+    // seeds taken from rd()/addr_rd() in the initializer list.
+    rng.seed(nm_get_uptime_ns());
+    addr_rng.seed(nm_get_uptime_ns());
+}
+
+
+/*
+ * Generates the next request.
+ *
+ * Fills ctx->size, ctx->offset and ctx->op. For a write, buf is filled
+ * with req_size copies of a randomly drawn byte; for a read, buf is left
+ * untouched.
+ *
+ * Returns 0 on success, -1 if the generator cannot produce a valid
+ * request (req_size is 0 or larger than capacity); ctx is not modified
+ * in that case.
+ */
+int io_generator::issue(struct io_generator_ctx *ctx, char * buf)
+{
+    // req_size == 0 would divide by zero below; req_size > capacity can
+    // never fit and would make the random-offset adjustment underflow.
+    if (req_size == 0 || req_size > capacity) {
+        return -1;
+    }
+
+    ctx->size = req_size;
+
+    // determine next IO offset
+    if (addr_mode == IOGEN_ADDR_MONOTONIC_INCREASING) {
+        // wrap to the start once the next request would run past capacity
+        if (cur_offset + req_size > capacity) {
+            cur_offset = 0;
+        }
+
+        ctx->offset = cur_offset;
+        cur_offset = cur_offset + req_size;
+    } else {
+        // random offset rounded down to a multiple of req_size
+        ctx->offset = (addr_dist(addr_rng) / req_size) * req_size;
+        // the last, partial slot does not fit a whole request: step back one
+        if (ctx->offset + req_size > capacity) {
+            ctx->offset -= req_size;
+        }
+    }
+
+    // determine next IO operation and data
+    int op_rng = dist(rng);
+    if (op_rng < (int)read_pct) {
+        ctx->op = IOGEN_READ;
+    } else {
+        ctx->op = IOGEN_WRITE;
+        int data = dist(rng);
+        memset(buf, data, req_size);
+    }
+
+    return 0;
+}
--- a/tests/nms_test.c
+++ b/tests/nms_test.c
@ -0,0 +1,32 @@
+#include "nms.h"
+#include <assert.h>
+#include <stdio.h>
+
+/*
+ * Smoke test for the nms allocator: initializes it twice, then requests
+ * 1G, 511M, 511M and 1G from node 0, asserting that every request succeeds
+ * and printing each returned address.
+ */
+int main(void)
+{
+    const int one_gig = 1024 * 1024 * 1024;
+    const int half_gig_minus = 511 * 1024 * 1024;
+    void * ret;
+
+    /* the second call checks that a duplicate init is tolerated */
+    nms_init(1);
+    nms_init(1);
+
+    /* 1G */
+    ret = nms_malloc(0, one_gig);
+    assert(ret != NULL);
+    printf("1G: %p\n", ret);
+
+    /* two 511M allocations back to back */
+    ret = nms_malloc(0, half_gig_minus);
+    assert(ret != NULL);
+    printf("511M: %p\n", ret);
+
+    ret = nms_malloc(0, half_gig_minus);
+    assert(ret != NULL);
+    printf("511M: %p\n", ret);
+
+    /* another 1G */
+    ret = nms_malloc(0, one_gig);
+    assert(ret != NULL);
+    printf("1G: %p\n", ret);
+
+    return 0;
+}
--- a/util/memloadgen.cc
+++ b/util/memloadgen.cc
@ -0,0 +1,239 @@
+#include <sys/endian.h>
+#include <sys/select.h>
+#include <sys/signal.h>
+#include "gen.hh"
+#include <array>
+#include <atomic>
+#include <cstdlib>
+#include <cstring>
+#include <list>
+#include <iostream>
+#include <fstream>
+#include "ntr.h"
+#include "nms.h"
+#include <getopt.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <topo.h>
+
+/* Prints the command line help to stdout and flushes it. */
+static void
+usage()
+{
+	static const char help_text[] =
+	    "Usage:\n"
+		"    -v: verbose mode\n"
+	    "    -b: buffer size\n"
+		"    -q: bytes per second\n"
+	    "    -d: destination domain index\n"
+	    "    -s: worker threads cpu list\n"
+		"    -m: pull mode cpu list\n"
+		"    -S: enable shared buffer\n"
+		"    -t: time to run\n"
+		"    -T: transaction size\n"
+		"    -i: inter arrival time distribution\n"
+		"    -o: output file path\n"
+		"    -H: history size for pct adjustment\n"
+		"    -M: print this string when threads are ready to run\n";
+
+	// the text contains no conversion specifiers, so fputs prints it verbatim
+	fputs(help_text, stdout);
+	fflush(stdout);
+}
+
+static char output_file[256] = "memloadgen_samples.txt";
+
+/*
+ * memloadgen entry point.
+ *
+ * Parses the command line, starts a memload_generator on the requested
+ * cpus, then once per second samples the achieved throughput, logs it and
+ * appends an "s,<second>,<bytes per second>" record to the output file.
+ *
+ * Until the first rate adjustment, stdin is polled (non-blocking) each
+ * second for a percentage. When one arrives the target rate is set to
+ * that percentage of the average of the last history_sz samples, a
+ * "p,<second>,<pct>" record is written, and stdin is not polled again.
+ */
+int main(int argc, char * argv[])
+{
+	ntr_init();
+	ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
+
+	size_t arr_sz = 64 * 1024 * 1024;
+	uint32_t time = -1;
+	uint64_t bps = 0;
+	uint64_t transaction_size = arr_sz;
+	cpuset_t threads, modes;
+	char magic[256] = {0};
+	CPU_ZERO(&threads);
+	CPU_ZERO(&modes);
+	CPU_SET(0, &threads);
+	char ia_dist[32] = "fixed";
+	int history_sz = 5;
+	std::list<uint64_t> history;
+
+	int shared_buffer = 0;
+	int rate_ctrl = 0;
+	cpuset_t domain_mask;
+	CPU_ZERO(&domain_mask);
+	CPU_SET(0, &domain_mask);
+	{
+		int c;
+		// parse arguments
+		while ((c = getopt(argc, argv, "vhb:d:s:m:So:T:t:q:i:H:M:")) != -1) {
+			switch (c) {
+			case 'v':
+				ntr_set_level(NTR_DEP_USER1, ntr_get_level(NTR_DEP_USER1) + 1);
+				break;
+			case 'h':
+				usage();
+				exit(0);
+			case 'b':
+				arr_sz = strtoull(optarg, nullptr, 10);
+				break;
+			case 'd':
+				cpulist_to_cpuset(optarg, &domain_mask);
+				break;
+			case 's':
+				cpulist_to_cpuset(optarg, &threads);
+				break;
+			case 'm':
+				cpulist_to_cpuset(optarg, &modes);
+				break;
+			case 'S':
+				shared_buffer = 1;
+				break;
+			case 'o':
+				// strncpy does not terminate when the source fills the
+				// buffer, so copy one byte less and terminate explicitly
+				strncpy(output_file, optarg, sizeof(output_file) - 1);
+				output_file[sizeof(output_file) - 1] = '\0';
+				break;
+			case 't':
+				time = strtoul(optarg, nullptr, 10);
+				break;
+			case 'T':
+				transaction_size = strtoul(optarg, nullptr, 10);
+				break;
+			case 'q':
+				bps = (uint64_t)strtoull(optarg, nullptr, 10);
+				break;
+			case 'i':
+				strncpy(ia_dist, optarg, sizeof(ia_dist) - 1);
+				ia_dist[sizeof(ia_dist) - 1] = '\0';
+				break;
+			case 'H':
+				history_sz = strtol(optarg, nullptr, 10);
+				break;
+			case 'M':
+				strncpy(magic, optarg, sizeof(magic) - 1);
+				magic[sizeof(magic) - 1] = '\0';
+				break;
+			default:
+				usage();
+				exit(0);
+			}
+		}
+	}
+
+	// transaction_size is used as a divisor below
+	if (transaction_size == 0) {
+		fprintf(stderr, "transaction size must be greater than 0\n");
+		exit(1);
+	}
+
+	// at least one sample must be kept, otherwise the history is always
+	// empty and the average computed on a pct adjustment divides by zero
+	if (history_sz < 1) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "history size %d is invalid, using 1\n", history_sz);
+		history_sz = 1;
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "Configruation:\n"
+									   "    buffer size: %ld\n"
+									   "    num threads: %d\n"
+									   "    target domain: %ld\n"
+									   "    bytes per second: %lu\n"
+									   "    interarrival distribution: %s\n"
+									   "    shared buffer: %d\n"
+									   "    transaction time: %lu\n"
+									   "    runtime: %d\n"
+									   "    history: %d\n"
+									   "    magic: %s\n",
+									    arr_sz, CPU_COUNT(&threads),
+										CPU_FFS(&domain_mask) - 1, bps,
+										ia_dist, shared_buffer,
+										transaction_size,time, history_sz, magic);
+
+	// init topo
+	if (topo_init(ntr_get_level(NTR_DEP_USER1) != NTR_LEVEL_DEFAULT)) {
+		fprintf(stderr, "libtopo init failed!\n");
+		exit(1);
+	}
+
+	// init nms
+	if (nms_init(ntr_get_level(NTR_DEP_USER1) != NTR_LEVEL_DEFAULT)) {
+		fprintf(stderr, "libnms init failed!\n");
+		exit(1);
+	}
+
+	bool success = false;
+	memload_generator::memload_generator_options opts;
+	opts.buffer_size = arr_sz;
+	opts.trans_per_second = bps / transaction_size;
+	opts.shared_buffer = shared_buffer;
+	opts.transaction_size = transaction_size;
+	opts.verbose = ntr_get_level(NTR_DEP_USER1) != NTR_LEVEL_DEFAULT;
+	strncpy(opts.ia_dist, ia_dist, sizeof(opts.ia_dist) - 1);
+	opts.ia_dist[sizeof(opts.ia_dist) - 1] = '\0';
+	std::ofstream ofile;
+	ofile.open(output_file, std::ios::out | std::ios::trunc);
+
+	// NOTE(review): success is handed to the generator but never checked
+	// here - confirm whether the constructor reports failure through it.
+	auto mgen = new memload_generator(&threads, &modes, &domain_mask, &opts, &success);
+	if (strlen(magic) > 0) {
+		fprintf(stdout, "%s\n", magic);
+		fflush(stdout);
+	}
+	if (!mgen->start()) {
+		fprintf(stderr, "failed to start memloadgen!\n");
+		exit(1);
+	}
+
+	// zero timeout: select() only polls stdin and never blocks
+	struct timeval stval;
+	stval.tv_sec = 0;
+	stval.tv_usec = 0;
+	char pct_line[64] = {0};
+
+	uint64_t prev_ts = topo_uptime_ns();
+	uint64_t prev_trans = mgen->get_transactions();
+	uint32_t cur_time = 0;
+	while(cur_time < time) {
+		usleep(S2US);
+		uint64_t cur_ts = topo_uptime_ns();
+		uint64_t trans = mgen->get_transactions();
+		// achieved bytes per second over the last interval
+		uint64_t cur_bps = (uint64_t)((double)((trans - prev_trans) * transaction_size) / ((double)(cur_ts - prev_ts) / (double)S2NS));
+
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "%ldB,%ldM\n", cur_bps, cur_bps / 1024 / 1024);
+		ofile << "s," << cur_time << "," << cur_bps << std::endl;
+		ofile.flush();
+
+		prev_ts = cur_ts;
+		prev_trans = trans;
+		cur_time++;
+
+		if (rate_ctrl == 0) {
+			// keep history
+			history.emplace_back(cur_bps);
+			if ((int)history.size() > history_sz) {
+				history.pop_front();
+			}
+
+			fd_set fdset;
+			FD_ZERO(&fdset);
+			FD_SET(STDIN_FILENO, &fdset);
+			int ret = select(1, &fdset, NULL, NULL, &stval);
+			if (ret < 0) {
+				if (errno != EINTR) {
+					fprintf(stderr, "select() failed with %d\n", errno);
+					exit(1);
+				}
+			} else if (ret > 0) {
+				if (FD_ISSET(STDIN_FILENO, &fdset)) {
+					ret = read(STDIN_FILENO, pct_line, sizeof(pct_line) - 1);
+					if (ret < 0) {
+						fprintf(stderr, "read() failed with %d\n", errno);
+						exit(1);
+					}
+					if (ret == 0) {
+						// EOF: stdin is closed, so no percentage will ever
+						// arrive. Stop polling instead of treating the empty
+						// read as "0%".
+						rate_ctrl = 1;
+						continue;
+					}
+					pct_line[ret] = '\0';
+
+					unsigned int pct = strtoul(pct_line, NULL, 10);
+					uint64_t sum = 0;
+					// sz >= 1: a sample was pushed above and history_sz >= 1
+					size_t sz = history.size();
+					while (history.size() > 0) {
+						sum += history.front();
+						history.pop_front();
+					}
+
+					uint64_t newbps = ((sum / sz) * (double)pct / 100.0);
+					mgen->set_transactions(newbps / transaction_size);
+					ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "adjusted target bps to %u%% = %ldB ~= %ldM\n", pct, newbps, newbps / 1024 / 1024);
+
+					ofile << "p," << cur_time << "," << pct << std::endl;
+					ofile.flush();
+
+					rate_ctrl = 1;
+				}
+			}
+		}
+	}
+	mgen->stop();
+	delete mgen;
+	ofile.close();
+
+	return 0;
+}
+
--- a/util/mornafah.c
+++ b/util/mornafah.c
@ -0,0 +1,237 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "nms.h"
+#include <getopt.h>
+#include <unistd.h>
+#include <topo.h>
+#include <immintrin.h>
+#include <x86intrin.h>
+#include <stdatomic.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <errno.h>
+#include <stdint.h>
+
+#include <sys/cpuset.h>
+#include <sys/sysctl.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#define BUFFER_SIZE (128 * 1024 * 1024)
+#define BUFFER_CNT (BUFFER_SIZE / sizeof(int))
+
+static _Atomic int flush = 0;
+static _Atomic uint64_t offset = 0;
+static int * remote_buffer = NULL;
+static uint64_t * latencies;
+static int times = 100;
+static int local_core = 0;
+static int remote_core = 1;
+static int cache_mode = 0;
+static int verbose = 0;
+static int random_access = 0;
+static uint64_t tsc_freq = 0;
+
+/*
+ * Converts a TSC cycle count to nanoseconds using tsc_freq (cycles per
+ * second). Returns 0 when tsc_freq is 0 (frequency unknown), since the
+ * division would otherwise produce an infinite value whose conversion to
+ * an integer is undefined.
+ */
+static inline uint64_t cyc2ns(uint64_t cyc)
+{
+	if (tsc_freq == 0) {
+		return 0;
+	}
+
+	return (double)cyc / ((double)tsc_freq / 1000000000.0);
+}
+
+/*
+ * Reads the timestamp counter with rdtscp and follows it with an lfence.
+ * The auxiliary value written by rdtscp is discarded.
+ */
+static inline uint64_t read_time(void)
+{
+	unsigned int aux;
+	const uint64_t tsc = __rdtscp(&aux);
+
+	_mm_lfence();
+	return tsc;
+}
+
+/*
+ * Measuring thread.
+ *
+ * For each of the `times` iterations: optionally picks a random offset,
+ * raises `flush` and spins until the remote thread clears it, flushes the
+ * target line locally, then times a single load of it with read_time().
+ * The cycle count is stored in latencies[times - 1], so the array is
+ * filled from the back.
+ *
+ * The argument is unused. The return value is the last value loaded.
+ */
+static void * local_thread(void * arg)
+{
+	/* initialized so the return value is defined even if the loop never runs */
+	int temp = 0;
+	int *addr;
+	uint64_t start, end;
+	(void)arg;
+
+	printf("Local thread running...\n");
+	while(times > 0) {
+		if (random_access) {
+			// change offset
+			offset = (rand() % BUFFER_CNT) * sizeof(int);
+		}
+
+		/* hand over to the remote thread and wait until it is done */
+		flush = 1;
+		while(flush != 0) {
+		}
+
+		addr = (int *)((char *)remote_buffer + offset);
+
+		if (verbose > 1) {
+			printf("Local thread(%d): flushing %p.\n", local_core, addr);
+		}
+
+		_mm_clflush(addr);
+		_mm_mfence();
+
+		atomic_signal_fence(memory_order_seq_cst);
+
+		start = read_time();
+		/* volatile access: the timed load must be emitted on every
+		 * iteration even though its value is overwritten by the next one */
+		temp = *(volatile int *)addr;
+		end = read_time();
+
+		atomic_signal_fence(memory_order_seq_cst);
+
+		if (verbose > 1) {
+			printf("Local thread(%d): read %p.\n", local_core, addr);
+		}
+
+		latencies[times - 1] = end - start;
+		times--;
+	}
+
+	return (void *)(uintptr_t)temp;
+}
+
+/*
+ * Helper thread running on the remote core.
+ *
+ * Spins until the local thread raises `flush`, then either loads the
+ * target line (cache_mode != 0) or flushes it (cache_mode == 0), and
+ * clears `flush` to release the local thread. Loops forever; the thread
+ * is never joined.
+ *
+ * The argument is unused.
+ */
+static void * remote_thread(void * arg)
+{
+	/* initialized: it is only written in cache mode */
+	int temp = 0;
+	int * addr;
+	(void)arg;
+
+	printf("Remote thread running...\n");
+	while(1) {
+		while(flush == 0) {
+		}
+
+		addr = (int *)((char *)remote_buffer + offset);
+
+		if(cache_mode) {
+			/* volatile access: the load exists only for its cache side
+			 * effect, so it must not be optimized away */
+			temp = *(volatile int *)addr;
+			_mm_mfence();
+		} else {
+			_mm_clflush(addr);
+			_mm_mfence();
+		}
+
+		if (verbose > 1) {
+			printf("Remote thread(%d): %p %s.\n", remote_core, addr, cache_mode ? "read into cache" : "flushed");
+		}
+
+		flush = 0;
+	}
+	/* not reached */
+	return (void *)(uintptr_t)temp;
+}
+
+/*
+ * mornafah entry point.
+ *
+ * Options: -l local core, -r remote core, -t iteration count,
+ * -R random offsets, -v verbosity (repeatable).
+ *
+ * Allocates a BUFFER_SIZE-byte buffer on the remote core's NUMA node,
+ * pins a measuring thread to the local core and a helper thread to the
+ * remote core, waits for the measuring thread, and prints the average,
+ * standard deviation, minimum and maximum load latency in cycles and ns.
+ */
+int main(int argc, char * argv[])
+{
+	{
+		int c;
+		// parse arguments
+		while ((c = getopt(argc, argv, "l:r:t:vR")) != -1) {
+			switch (c) {
+			case 'l':
+				local_core = atoi(optarg);
+				break;
+			case 'r':
+				remote_core = atoi(optarg);
+				break;
+			case 't':
+				times = atoi(optarg);
+				break;
+			case 'R':
+				random_access = 1;
+				break;
+			case 'v':
+				verbose++;
+				break;
+			default:
+				exit(1);
+			}
+		}
+	}
+
+	/* times sizes the latency array and is the divisor of the statistics */
+	if (times <= 0) {
+		fprintf(stderr, "number of iterations (-t) must be positive\n");
+		exit(1);
+	}
+
+	srand(time(NULL));
+
+	// init topo
+	if (topo_init(1)) {
+		fprintf(stderr, "libtopo init failed!\n");
+		exit(1);
+	}
+
+	// init nms
+	if (nms_init(1)) {
+		fprintf(stderr, "libnms init failed!\n");
+		exit(1);
+	}
+
+	size_t sz = sizeof(tsc_freq);
+	int rc;
+	/* best effort: without the frequency only the ns columns are affected */
+	if ((rc = sysctlbyname("machdep.tsc_freq", &tsc_freq, &sz, NULL, 0)) < 0) {
+		fprintf(stderr,"failed to query tsc frequency via sysctl (%d)\n", errno);
+	} else {
+		fprintf(stdout,"system tsc frequency = %lu\n", tsc_freq);
+	}
+
+	latencies = malloc(sizeof(uint64_t) * times);
+	if (latencies == NULL) {
+		fprintf(stderr, "failed to allocate latency array\n");
+		exit(1);
+	}
+	const int remote_numa = topo_core_to_numa(remote_core);
+	const int local_numa = topo_core_to_numa(local_core);
+	/* the measuring thread counts `times` down to 0, so remember it here */
+	const int total = times;
+
+	remote_buffer = nms_malloc(remote_numa, BUFFER_SIZE);
+	if (remote_buffer == NULL) {
+		fprintf(stderr, "failed to allocate remote buffer\n");
+		exit(1);
+	}
+	// fill with random values; the buffer holds BUFFER_CNT ints
+	// (BUFFER_SIZE is its size in bytes)
+	for (size_t i = 0; i < BUFFER_CNT; i++) {
+		remote_buffer[i] = rand();
+	}
+
+	pthread_attr_t lattr, rattr;
+	pthread_t lthread, rthread;
+	cpuset_t lcpuset, rcpuset;
+	CPU_ZERO(&lcpuset);
+	CPU_ZERO(&rcpuset);
+
+	CPU_SET(local_core, &lcpuset);
+	CPU_SET(remote_core, &rcpuset);
+
+	pthread_attr_init(&rattr);
+	pthread_attr_setaffinity_np(&rattr, sizeof(cpuset_t), &rcpuset);
+	pthread_attr_init(&lattr);
+	pthread_attr_setaffinity_np(&lattr, sizeof(cpuset_t), &lcpuset);
+
+	printf("local thread: %d numa: %d, remote: %d numa: %d\n", local_core, local_numa, remote_core, remote_numa);
+	if (pthread_create(&lthread, &lattr, local_thread, NULL) != 0) {
+		fprintf(stderr, "failed to create local thread\n");
+		exit(1);
+	}
+	if (pthread_create(&rthread, &rattr, remote_thread, NULL) != 0) {
+		fprintf(stderr, "failed to create remote thread\n");
+		exit(1);
+	}
+
+	/* only the measuring thread terminates; the remote thread ends with the process */
+	pthread_join(lthread, NULL);
+
+	uint64_t min = UINT64_MAX;
+	uint64_t max = 0;
+	uint64_t sum = 0;
+	for (int i = total - 1; i >= 0; i--) {
+		if (verbose) {
+			printf("%lu,\n", latencies[i]);
+		}
+		if (min > latencies[i]) {
+			min = latencies[i];
+		}
+		if (max < latencies[i]) {
+			max = latencies[i];
+		}
+		sum += latencies[i];
+	}
+
+	double var = 0.0;
+	double avg = (double)sum / (double)total;
+	for (int i = total - 1; i >= 0; i--) {
+		var += pow(latencies[i] - avg, 2);
+	}
+	/* standard deviation: sum of squared deviations over the sample count */
+	var = sqrt(var / (double)total);
+
+	printf("Avg: %lu cycles (%lu ns)\n"
+		   "Std: %lu cycles (%lu ns)\n"
+		   "Min: %lu cycles (%lu ns)\n"
+		   "Max: %lu cycles (%lu ns)\n",
+		   (uint64_t)avg, cyc2ns((uint64_t)avg),
+		   (uint64_t)var, cyc2ns((uint64_t)var),
+		   min, cyc2ns(min),
+		   max, cyc2ns(max));
+
+	free(latencies);
+	return 0;
+}
+
Author	SHA1	Message	Date
quackerd	3320852dd5	sandybridge doesn't support clflushopt	2023-12-06 04:22:46 +08:00
quackerd	76a41666a0	fix dpdk	2023-12-06 03:38:32 +08:00
quackerd	b57fe6e5ea	akh morn	2023-12-06 03:23:00 +08:00
oscar	fc687426ae	stuff	2023-05-01 15:28:51 -04:00
quackerd	aba80e8869	stuff	2023-05-01 21:18:34 +02:00
quackerd	1a90104d53	minor fix	2023-03-29 22:00:59 +02:00
quackerd	59b8c36ced	multiarch	2023-03-17 21:13:05 +01:00
quackerd	4effb3f1bd	multiarch	2023-03-16 09:43:34 +01:00
oscar	bb9792cf06	memloadgen allocate memory in thread	2023-03-15 19:44:46 -04:00
oscar	a385866002	memloadgen allocate memory in thread	2023-03-15 19:10:52 -04:00
oscar	7e4fd3d721	memloadgen allocate memory in thread	2023-03-15 19:07:36 -04:00
oscar	05965dbb94	memloadgen allocate memory in thread	2023-03-15 18:43:37 -04:00
quackerd	25c18b4fc5	stdin based pct control	2023-03-05 16:48:54 +01:00
quackerd	28d469e8ff	better printing	2023-03-05 15:59:42 +01:00
quackerd	6cd0e7d12f	add signal control	2023-03-05 15:58:06 +01:00
quackerd	521a49d945	add magic number	2023-03-05 15:15:13 +01:00
quackerd	a9cac61069	cleanup and stuff	2023-01-04 17:25:32 +01:00
quackerd	f20ae16e31	temp commit	2022-12-14 20:52:12 +01:00
quackerd	2a543d7e4d	iperf	2022-11-30 20:37:51 +01:00
quackerd	a3b7b7db5d	iperf	2022-11-26 00:08:26 +01:00
quackerd	5e76edab89	useless but useful check	2022-11-24 10:11:14 +01:00
oscar	d0c7329f9f	iperf	2022-11-23 20:05:48 -05:00
quackerd	4ff2de5d1e	dpdk refactor	2022-11-22 16:27:27 +01:00
quackerd	933e9708f3	refactor iperf conf to human readable	2022-11-22 13:58:33 +01:00
quackerd	e85928e3f5	iperf script change	2022-11-21 22:52:13 +01:00
quackerd	df880a453c	new scripts	2022-11-18 09:27:04 +01:00
oscar	b5be9c38fe	memloadgen	2022-11-16 15:37:39 -05:00
quackerd	18339fb109	memloadgen pct support	2022-11-16 08:44:43 +01:00
quackerd	1836bd89df	memloadgen rate control	2022-11-11 22:11:50 +01:00
quackerd	075902ba1d	add break	2022-11-01 11:27:34 +01:00
quackerd	68b621fd3c	snapshot memloadgen transaction change	2022-11-01 11:01:23 +01:00
quackerd	565dbca278	latest dpdk & refactoring	2022-06-22 23:40:48 +08:00
quackerd	a716583b19	update various components for new machines	2022-05-25 06:55:01 -04:00
quackerd	d217bde46a	bug fix	2022-03-29 00:50:10 +08:00
quackerd	6e7e152915	posix support	2022-03-29 00:47:46 +08:00
quackerd	0d26960686	nvme support	2022-03-21 23:01:24 +08:00
quackerd	186150ca00	fixed hardcoded exit	2022-03-21 19:45:42 +08:00
quackerd	27c6cd188d	device driver abstraction	2022-03-21 19:43:49 +08:00
quackerd	2ecfacff11	spdk	2022-03-20 22:17:26 +08:00
quackerd	0dc463ba35	memload generator	2022-02-21 21:41:40 +08:00
quackerd	997587c519	temp save	2021-03-17 21:45:01 -04:00
quackerd	cd4785f08a	add mem region support for nm malloc	2021-03-04 02:25:34 -05:00
quackerd	4d50e55e1e	+fix workload gen	2021-03-04 01:54:13 -05:00
quackerd	7fd7c7f776	+libnm refactor and numa allocator support. +khat threads now have numa-local memory.	2021-03-03 22:22:06 -05:00
quackerd	b85777e6f0	+stuff?	2021-02-23 13:12:27 -05:00
quackerd	162d41a4cc	+ cat packet loss control and max packet loss tolerance \ + output and parse packet loss for master and slaves	2021-02-22 06:54:53 -05:00
quackerd	1fd9be7f13	+ packet loss control & + packet depth control	2021-02-21 05:16:39 -05:00
quackerd	d1e43dcf2f	+Bench scripts	2021-02-20 04:53:55 -05:00
quackerd	06b93ddf1c	memload gen Summary: Add memload generator Test Plan: by hand Reviewers: ali Differential Revision: https://review.rcs.uwaterloo.ca/D415	2021-02-16 05:15:11 -05:00
quackerd	f655e5f5cb	Initial commit of benchmarks Summary: + UDP and PTP over UDP & hw timestamping + Khat protocol + Rat protocol + Nanosecond timestamping + Load generation + NUMA detection library + Test scripts + Server & Client multi threading & tx/rx queues + RSS on all packets w/ randomized L4 ports Test Plan: by hand Reviewers: ali Reviewed By: ali Differential Revision: https://review.rcs.uwaterloo.ca/D408	2021-02-10 14:12:47 -05:00
				`@ -0,0 +1 @@`
				`Checks: "-,clang-diagnostic-,clang-analyzer-,modernize,performance*,-modernize-use-trailing-return-type,-modernize-avoid-c-arrays"`