latest dpdk & refactoring

2022-06-22 23:40:48 +08:00 · 2022-06-22 23:40:48 +08:00 · 565dbca278
commit 565dbca278
parent a716583b19
27 changed files with 1534 additions and 1765 deletions
--- a/.clang-format
+++ b/.clang-format
@ -52,7 +52,11 @@ ForEachMacros:
  - ARB_FOREACH_REVERSE
  - ARB_FOREACH_REVERSE_FROM
  - ARB_FOREACH_REVERSE_SAFE
+  - BIT_FOREACH_ISCLR
+  - BIT_FOREACH_ISSET
  - CPU_FOREACH
+  - CPU_FOREACH_ISCLR
+  - CPU_FOREACH_ISSET
  - FOREACH_THREAD_IN_PROC
  - FOREACH_PROC_IN_SYSTEM
  - FOREACH_PRISON_CHILD
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -4,68 +4,79 @@ project(khat)

 list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}")
 find_package(PkgConfig REQUIRED)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY lib)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY bin)

-pkg_check_modules(HWLOC hwloc REQUIRED)
 pkg_check_modules(DPDK libdpdk)
 pkg_check_modules(SPDK spdk_event_bdev spdk_env_dpdk)
 pkg_check_modules(SPDK_SYS spdk_syslibs)
 pkg_check_modules(UUID uuid)
-# get_filename_component(ISAL_LIB_PATH ${SPDK_INCLUDE_DIRS} DIRECTORY)
-# get_filename_component(ISAL_LIB_PATH ${ISAL_LIB_PATH} DIRECTORY)
-# set(ISAL_LIB_PATH ${ISAL_LIB_PATH}/isa-l/.libs)
+pkg_check_modules(TOPO bsdtopo)

 set(CC_FLAGS -O2 -g -Wall -Wextra -Werror -std=c++11 
        -Wno-deprecated-declarations 
        -Wno-address-of-packed-member
        -Wno-zero-length-array
        -Wno-gnu-zero-variadic-macro-arguments
-        -msse4
-        -mavx)
+        -march=native)
+
+set(C_FLAGS -O2 -g -Wall -Wextra -Werror -std=c17
+        -Wno-deprecated-declarations 
+        -Wno-address-of-packed-member
+        -Wno-zero-length-array
+        -Wno-gnu-zero-variadic-macro-arguments
+        -march=native)
+

 include_directories(${CMAKE_SOURCE_DIR}/inc)
+include_directories()

-set(LIBNM_CC_FLAGS -O2 -g -Wall -Wextra -Werror -std=c++11 -mavx -msse4)
 set(LIBNTR_C_FLAGS -O2 -g -Wall -Wextra -Werror -std=c11)
 set(LIBGEN_CC_FLAGS -O2 -g -Wall -Wextra -Werror -std=c++11)

-add_library(ntr STATIC libntr/ntr.c)
+add_library(ntr SHARED libntr/ntr.c)
 target_compile_options(ntr PRIVATE ${LIBNTR_C_FLAGS})

-add_library(gen STATIC libgen/generator.cc)
-target_compile_options(gen PRIVATE ${LIBGEN_CC_FLAGS})
+add_library(gen SHARED libgen/generator.cc libgen/loadgen.cc)
+target_link_libraries(gen PRIVATE pthread ntr ${TOPO_LINK_LIBRARIES} nms)
+target_compile_options(gen PRIVATE ${LIBGEN_CC_FLAGS} ${TOPO_CFLAGS})

-add_library(nm STATIC libnm/nm.cc libnm/alloc.cc libnm/loadgen.cc libnm/topo.cc)
-target_include_directories(nm PRIVATE ${HWLOC_INCLUDE_DIRS})
-target_link_libraries(nm PRIVATE gen ${HWLOC_LINK_LIBRARIES})
-target_compile_options(nm PRIVATE ${LIBNM_CC_FLAGS} ${HWLOC_CFLAGS})
+add_library(netsup SHARED net/libnetsup/dpdk.cc net/libnetsup/portconf.cc)
+target_link_libraries(netsup PRIVATE ntr ${DPDK_LINK_LIBRARIES})
+target_compile_options(netsup PRIVATE ${LIBGEN_CC_FLAGS} ${DPDK_CFLAGS})
+
+add_library(nms SHARED libnms/alloc.c)
+target_link_libraries(nms PRIVATE ${TOPO_LINK_LIBRARIES})
+target_compile_options(nms PRIVATE ${TOPO_CFLAGS})

 add_executable(khat EXCLUDE_FROM_ALL net/khat.cc)
-target_link_libraries(khat PRIVATE pthread nm ntr ${DPDK_LINK_LIBRARIES})
-target_compile_options(khat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS})
+target_link_libraries(khat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES})
+target_compile_options(khat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS})

 add_executable(cat EXCLUDE_FROM_ALL net/cat.cc)
-target_link_libraries(cat PRIVATE pthread nm ntr gen ${DPDK_LINK_LIBRARIES})
-target_compile_options(cat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS})
+target_link_libraries(cat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES})
+target_compile_options(cat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS})

 add_executable(rat EXCLUDE_FROM_ALL net/rat.cc)
-target_link_libraries(rat PRIVATE pthread nm ntr gen ${DPDK_LINK_LIBRARIES})
-target_compile_options(rat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS})
+target_link_libraries(rat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES})
+target_compile_options(rat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS})

 add_executable(birb EXCLUDE_FROM_ALL storage/birb.cc storage/io_gen.cc storage/drivers/bdev.cc storage/drivers/bdev_thread.cc storage/drivers/nvme.cc storage/drivers/nvme_thread.cc)
 target_include_directories(birb PRIVATE ${SPDK_INCLUDE_DIRS} ${DPDK_INCLUDE_DIRS} ${UUID_INCLUDE_DIRS})
 target_compile_options(birb PRIVATE ${CC_FLAGS} ${SPDK_CFLAGS} ${UUID_CFLAGS})
 target_link_directories(birb PRIVATE ${SPDK_LIBRARY_DIRS} ${SPDK_SYS_STATIC_LIBRARY_DIRS} ${UUID_LIBRARY_DIRS})
-target_link_libraries(birb PRIVATE pthread nm ntr gen -Wl,--whole-archive ${SPDK_LIBRARIES} -Wl,--no-whole-archive ${SPDK_SYS_STATIC_LIBRARIES})
+target_link_libraries(birb PRIVATE pthread ntr gen -Wl,--whole-archive ${SPDK_LIBRARIES} -Wl,--no-whole-archive ${SPDK_SYS_STATIC_LIBRARIES})

 add_executable(birb_posix EXCLUDE_FROM_ALL storage/birb_posix.cc storage/io_gen.cc)
 target_compile_options(birb_posix PRIVATE ${CC_FLAGS})
-target_link_libraries(birb_posix PRIVATE pthread nm ntr gen)
+target_link_libraries(birb_posix PRIVATE pthread ntr gen)

 add_executable(memloadgen util/memloadgen.cc)
-target_link_libraries(memloadgen PRIVATE pthread nm ntr)
-target_compile_options(memloadgen PRIVATE ${CC_FLAGS})
+target_link_libraries(memloadgen PRIVATE pthread gen ntr ${TOPO_LINK_LIBRARIES})
+target_compile_options(memloadgen PRIVATE ${CC_FLAGS} ${TOPO_CFLAGS})

-add_executable(test_ts test/ts.cc)
-set_target_properties(test_ts PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test)
-target_link_libraries(test_ts PRIVATE nm)
-target_compile_options(test_ts PRIVATE ${CC_FLAGS})
+add_executable(nms_test tests/nms_test.c)
+set_target_properties(nms_test PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tests)
+target_link_libraries(nms_test PRIVATE nms)
+target_compile_options(nms_test PRIVATE ${C_FLAGS})
--- a/inc/defs.hh
+++ b/inc/defs.hh
@ -1,13 +1,12 @@
 #pragma once

-#include <sys/cpuset.h>
 #include <cstdint>
 #include <cstring>
 #include <immintrin.h>
 #include <ctime>
 #include <cstdio>
+#include <sys/types.h>
 #include <sys/cpuset.h>
-#include <sys/_cpuset.h>

 #define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \
 	TypeName(const TypeName &) = delete; \
--- a/inc/gen.hh
+++ b/inc/gen.hh
@ -7,10 +7,6 @@

 #pragma once

-#include <sys/param.h>
-
-#include <netinet/in.h>
-
 #include <assert.h>
 #include <inttypes.h>
 #include <limits.h>
@ -23,6 +19,10 @@
 #include <utility>
 #include <vector>

+#include <sys/param.h>
+
+#include "defs.hh"
+
 #define D(fmt, ...)
 #define DIE(fmt, ...) (void)0;

@ -292,3 +292,44 @@ Generator *createGenerator(std::string str);
 Generator *createFacebookKey();
 Generator *createFacebookValue();
 Generator *createFacebookIA();
+
+// memload generator
+class memload_generator {
+	public:
+	struct memload_generator_options {
+		constexpr static size_t ITERATION_MAX = -1;
+		size_t chunk_size {512 * 1024 * 1024};
+		size_t iteration {ITERATION_MAX};
+		int verbose {0};
+		bool shared_buffer {true};
+	};
+
+    private:
+	DISALLOW_EVIL_CONSTRUCTORS(memload_generator);
+	struct thread_info {
+		pthread_t pthr;
+		void *from_buffer;
+		void *to_buffer;
+		std::atomic<int> *stop;
+
+		int tid;
+
+		struct memload_generator_options * opts;
+	
+		// stat keeping
+		std::atomic<uint32_t> num_trans;
+		uint64_t begin_ts;
+		uint64_t end_ts;
+	};
+
+	std::vector<struct thread_info *> thr_infos;
+	std::atomic<int> end;
+	struct memload_generator_options opts;
+	static void *worker_thrd(void *_tinfo);
+
+    public:
+	memload_generator(cpuset_t * threads, cpuset_t * target_domain, struct memload_generator_options * opt, bool *success);
+	uint64_t get_bps();
+	bool check_done();
+	~memload_generator();
+};
--- a/inc/net/netsup.hh
+++ b/inc/net/netsup.hh
@ -1,22 +1,63 @@
 #pragma once
-#include <rte_ethdev.h>
-#include <rte_ip.h>
+#include <cstdint>
+
+#include "rte_ethdev.h"
+#include "rte_ether.h"
+
+#define MAX_NUMA_NODES (64)
+
+struct device_conf {
+	int portid;
+	uint16_t tx_ring_sz;
+	uint16_t rx_ring_sz;
+	int num_threads;
+	int mtu;
+	uint64_t rx_offloads;
+	uint64_t tx_offloads;
+	uint64_t rss_hf;
+	
+	rte_tx_callback_fn tx_fn;
+	void * tx_user;
+
+	rte_rx_callback_fn rx_fn;
+	void * rx_user;
+
+	bool timesync;
+};
+
+struct mem_conf {
+	int num_elements;
+	int cache_size;
+	int data_room_size;
+	int priv_size;
+	unsigned int max_pools;
+};

 constexpr static uint16_t MIN_RANDOM_PORT = 1000;
 constexpr static uint16_t DEFAULT_RAT_PORT = 1234;
-constexpr static unsigned int INIT_DELAY = 1;
+constexpr static unsigned int INIT_DELAY = 3;
 constexpr static unsigned int MAX_NODES = 64;

-static inline void 
-tx_burst_all(int portid, int txqid, struct rte_mbuf ** tx_bufs, int sz)
-{
-	int remaining = sz;
-	while(remaining > 0) {
-		remaining -= rte_eth_tx_burst(
-			    portid, txqid, &tx_bufs[sz - remaining],
-			    remaining);
-	}
-}
+void
+dpdk_init(struct device_conf *dconf, struct mem_conf *mconf);
+
+void
+dpdk_cleanup(struct device_conf *dconf);
+
+struct rte_mempool *
+mempool_get(int nodeid);
+
+struct port_conf {
+	const char * driver_name;
+	uint64_t rxoffload;
+	uint64_t txoffload;
+	uint64_t rss_hf;
+	bool timesync;
+};
+
+int
+portconf_get(int portid, struct port_conf * out);
+

 // constexpr static int LATENCY_MEASURE_TIMES = 10000;

--- a/inc/net/pkt.hh
+++ b/inc/net/pkt.hh
@ -10,8 +10,7 @@
 #include <rte_udp.h>
 #include <unistd.h>

-#include "nm.hh"
-#include "util.hh"
+#include "defs.hh"

 #include <random>

@ -24,11 +23,23 @@
 constexpr static uint32_t MAX_JUMBO_MTU = 9000;
 constexpr static uint32_t MAX_STANDARD_MTU = 1500;

-static inline int mtu_to_pkt_size(int mtu)
+static inline int
+mtu_to_pkt_size(int mtu)
 {
 	return mtu + RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN;
 }

+static inline void 
+tx_burst_all(int portid, int txqid, struct rte_mbuf ** tx_bufs, int sz)
+{
+	int remaining = sz;
+	while(remaining > 0) {
+		remaining -= rte_eth_tx_burst(
+			    portid, txqid, &tx_bufs[sz - remaining],
+			    remaining);
+	}
+}
+
 constexpr static uint32_t ETHER_FRAME_MAGIC = 0xDCDCE5E5;
 const static struct rte_ether_addr POU_MAC {
 	0x01, 0x00, 0x5e, 0x00, 0x01, 0x81
@ -87,7 +98,7 @@ pkt_hdr_to_netspec(struct pkt_hdr *pkt, struct net_spec *src,
    uint16_t *src_port, struct net_spec *dst, uint16_t *dst_port)
 {
 	if (src != nullptr) {
-		rte_ether_addr_copy(&pkt->eth_hdr.s_addr, &src->mac_addr);
+		rte_ether_addr_copy(&pkt->eth_hdr.src_addr, &src->mac_addr);
 		src->ip = rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr);
 	}

@ -96,7 +107,7 @@ pkt_hdr_to_netspec(struct pkt_hdr *pkt, struct net_spec *src,
 	}

 	if (dst != nullptr) {
-		rte_ether_addr_copy(&pkt->eth_hdr.d_addr, &dst->mac_addr);
+		rte_ether_addr_copy(&pkt->eth_hdr.dst_addr, &dst->mac_addr);
 		dst->ip = rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr);
 	}

@ -203,7 +214,7 @@ class rdport_generator {
 	    , cur(0)
 	    , dist(0, MAX_PORT - min_port)
 	{
-		gen.seed(nm_get_uptime_ns());
+		gen.seed(get_uptime());
 		cur = dist(gen);
 	}
 	uint16_t next()
@ -224,23 +235,23 @@ class rdport_generator {
 	    (rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 8) & 0xff,                                                           \
 	    (rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 0) & 0xff,                                                           \
 	    rte_be_to_cpu_16(pkt->udp_hdr.src_port),                                                                          \
-	    pkt->eth_hdr.s_addr.addr_bytes[0],                                                                                \
-	    pkt->eth_hdr.s_addr.addr_bytes[1],                                                                                \
-	    pkt->eth_hdr.s_addr.addr_bytes[2],                                                                                \
-	    pkt->eth_hdr.s_addr.addr_bytes[3],                                                                                \
-	    pkt->eth_hdr.s_addr.addr_bytes[4],                                                                                \
-	    pkt->eth_hdr.s_addr.addr_bytes[5],                                                                                \
+	    pkt->eth_hdr.src_addr.addr_bytes[0],                                                                                \
+	    pkt->eth_hdr.src_addr.addr_bytes[1],                                                                                \
+	    pkt->eth_hdr.src_addr.addr_bytes[2],                                                                                \
+	    pkt->eth_hdr.src_addr.addr_bytes[3],                                                                                \
+	    pkt->eth_hdr.src_addr.addr_bytes[4],                                                                                \
+	    pkt->eth_hdr.src_addr.addr_bytes[5],                                                                                \
 	    (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 24) & 0xff,                                                          \
 	    (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 16) & 0xff,                                                          \
 	    (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 8) & 0xff,                                                           \
 	    (rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 0) & 0xff,                                                           \
 	    rte_be_to_cpu_16(pkt->udp_hdr.dst_port),                                                                          \
-	    pkt->eth_hdr.d_addr.addr_bytes[0],                                                                                \
-	    pkt->eth_hdr.d_addr.addr_bytes[1],                                                                                \
-	    pkt->eth_hdr.d_addr.addr_bytes[2],                                                                                \
-	    pkt->eth_hdr.d_addr.addr_bytes[3],                                                                                \
-	    pkt->eth_hdr.d_addr.addr_bytes[4],                                                                                \
-	    pkt->eth_hdr.d_addr.addr_bytes[5], rte_be_to_cpu_16(pkt->type))
+	    pkt->eth_hdr.dst_addr.addr_bytes[0],                                                                                \
+	    pkt->eth_hdr.dst_addr.addr_bytes[1],                                                                                \
+	    pkt->eth_hdr.dst_addr.addr_bytes[2],                                                                                \
+	    pkt->eth_hdr.dst_addr.addr_bytes[3],                                                                                \
+	    pkt->eth_hdr.dst_addr.addr_bytes[4],                                                                                \
+	    pkt->eth_hdr.dst_addr.addr_bytes[5], rte_be_to_cpu_16(pkt->type))

 static inline void
 print_mac(struct rte_ether_addr *mac)
@ -253,7 +264,7 @@ print_mac(struct rte_ether_addr *mac)
 static inline void
 print_ipv4(uint32_t ip)
 {
-	printf("%d-%d-%d-%d", (ip >> 24) & 0xff, (ip >> 16) & 0xff,
+	printf("%d.%d.%d.%d", (ip >> 24) & 0xff, (ip >> 16) & 0xff,
 	    (ip >> 8) & 0xff, (ip >> 0) & 0xff);
 }

@ -276,10 +287,10 @@ dump_pkt(struct rte_mbuf *pkt)
 	    "Packet %p: Length 0x%x\n", (void *)pkt, rte_pktmbuf_data_len(pkt));
 	printf("    Ethernet header:\n");
 	printf("        Src:");
-	print_mac(&eth_hdr->s_addr);
+	print_mac(&eth_hdr->src_addr);
 	printf("\n");
 	printf("        Dst:");
-	print_mac(&eth_hdr->d_addr);
+	print_mac(&eth_hdr->dst_addr);
 	printf("\n");
 	printf("        Type: 0x%x\n", rte_be_to_cpu_16(eth_hdr->ether_type));

@ -339,11 +350,11 @@ construct_pkt_hdr(

 	// construct l2 header
 	eth_hdr = &pkt_data->eth_hdr;
-	rte_ether_addr_copy(&conn->src->mac_addr, &eth_hdr->s_addr);
+	rte_ether_addr_copy(&conn->src->mac_addr, &eth_hdr->src_addr);
 	if (is_ts_pkt) {
-		rte_ether_addr_copy(&POU_MAC, &eth_hdr->d_addr);
+		rte_ether_addr_copy(&POU_MAC, &eth_hdr->dst_addr);
 	} else {
-		rte_ether_addr_copy(&conn->dst->mac_addr, &eth_hdr->d_addr);
+		rte_ether_addr_copy(&conn->dst->mac_addr, &eth_hdr->dst_addr);
 	}
 	eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
 	buf->l2_len = sizeof(struct rte_ether_hdr);
@ -378,10 +389,13 @@ construct_pkt_hdr(
 	udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
 	udp_hdr->dgram_len = total_sz - sizeof(struct rte_ether_hdr) - sizeof(struct rte_ipv4_hdr) - sizeof(struct rte_udp_hdr);
 	buf->l4_len = sizeof(struct rte_udp_hdr);
+	buf->ol_flags |= RTE_MBUF_F_TX_IPV4;
+	buf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
+	buf->ol_flags |= RTE_MBUF_F_TX_UDP_CKSUM;

 	if (is_ts_pkt) {
 		// set misc flags
-		buf->ol_flags |= PKT_TX_IEEE1588_TMST;
+		buf->ol_flags |= RTE_MBUF_F_TX_IEEE1588_TMST;
 		pkt_data->ptp_hdr.ptp_ver = 0x2;      // VER 2
 		pkt_data->ptp_hdr.ptp_msg_type = 0x0; // SYNC
 	} else {
@ -463,7 +477,7 @@ check_valid_packet(struct rte_mbuf *pkt, const struct rte_ether_addr *host_mac)
 		}

 		if (!rte_is_same_ether_addr(
-			expected_mac, &pkt_data->eth_hdr.d_addr))
+			expected_mac, &pkt_data->eth_hdr.dst_addr))
 			return nullptr;
 	}

--- a/inc/nm.hh
+++ b/inc/nm.hh
@ -1,103 +0,0 @@
-#pragma once
-
-#include <sys/endian.h>
-#include "gen.hh"
-#include "defs.hh"
-
-#include <atomic>
-#include <cstdint>
-#include <chrono>
-#include <ctime>
-
-constexpr static unsigned int NM_LEVEL_NUMA = 0;
-constexpr static unsigned int NM_LEVEL_CPU = 1;
-constexpr static unsigned int NM_LEVEL_CORE = 2;
-constexpr static unsigned int NM_MAX_LEVEL = NM_LEVEL_CORE + 1; 
-
-constexpr static int NM_MAX_OBJS_PER_LVL = 256;
-
-// misc functions
-
-// 0 on success -1 on error
-int nm_init(int verbosity);
-
-uint64_t nm_tsc2ns(uint64_t tsc);
-
-uint64_t nm_get_uptime_ns();
-
-// topology stuff
-struct nm_obj;
-
-struct nm_obj *
-nm_find_parent_obj(struct nm_obj * start, int parent_level);
-
-int
-nm_obj_count(int level);
-
-struct nm_obj *
-nm_obj_from_id(int level, int id);
-
-struct nm_obj *
-nm_obj_find_parent(struct nm_obj * start, int parent_level);
-
-int
-nm_obj_get_id(struct nm_obj * obj);
-
-int
-nm_obj_get_level(struct nm_obj * obj);
-
-static inline int
-nm_get_node_from_core(int coreid)
-{
-	return nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, coreid), NM_LEVEL_NUMA));
-}
-
-// memload generator
-class memload_generator {
-    private:
-	DISALLOW_EVIL_CONSTRUCTORS(memload_generator);
-	struct thread_info {
-		pthread_t pthr;
-		uint32_t iter;
-		uint32_t array_size;
-		std::atomic<int> init_status;
-		std::atomic<int> *state;
-		std::atomic<uint32_t> num_trans;
-		constexpr static int INIT_START = 0;
-		constexpr static int INIT_SUCCESS = 1;
-		constexpr static int INIT_FAILED = 2;
-		void *from_region;
-		void *to_region;
-		uint32_t to_domainid;
-		uint32_t from_domainid;
-		uint64_t begin_ts;
-		uint64_t stop_ts;
-	};
-	std::vector<struct thread_info *> thr_infos;
-
-	std::atomic<int> state;
-	constexpr static uint32_t STATE_READY = 0;
-	constexpr static uint32_t STATE_START = 1;
-	constexpr static uint32_t STATE_STOP = 2;
-
-	uint32_t array_size;
-	uint32_t iteration;
-
-	static void *worker_thrd(void *_tinfo);
-
-    public:
-	memload_generator(uint64_t from_cmask, uint64_t to_cmask, uint32_t array_sz, uint32_t iteration,
-	    bool *success);
-	void start();
-	void stop();
-	uint64_t get_bps();
-	bool check_done();
-	~memload_generator();
-};
-
-// allocators
-void *
-nm_malloc(unsigned int node, size_t size);
-
-void
-nm_free(unsigned int node, void * addr);
--- a/inc/nms.h
+++ b/inc/nms.h
@ -0,0 +1,20 @@
+#pragma once
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int
+nms_init(int verbose);
+
+void *
+nms_malloc(int nodeid, size_t sz);
+
+void
+nms_free(int nodeid, void * addr);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
--- a/libgen/loadgen.cc
+++ b/libgen/loadgen.cc
@ -0,0 +1,136 @@
+#include <atomic>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <unistd.h>
+
+#include <sys/cpuset.h>
+#include <sys/domainset.h>
+#include <sys/endian.h>
+#include <sys/thr.h>
+#include <topo.h>
+
+
+#include "gen.hh"
+#include "nms.h"
+
+void *
+memload_generator::worker_thrd(void *_tinfo)
+{
+	auto *tinfo = (struct thread_info *)_tinfo;
+	long tid;
+	thr_self(&tid);
+
+	if (tinfo->opts->verbose) {
+		fprintf(
+		    stdout, "memload_generator <thread %ld>: running...\n", tid);
+	}
+	tinfo->begin_ts = topo_uptime_ns();
+	while (tinfo->num_trans.load() < tinfo->opts->iteration) {
+		memcpy((char *)tinfo->from_buffer, (char *)tinfo->to_buffer, tinfo->opts->chunk_size);
+		tinfo->end_ts = topo_uptime_ns();
+		tinfo->num_trans.fetch_add(1);
+	}
+
+	if (tinfo->opts->verbose) {
+		fprintf(
+		    stdout, "memload_generator <thread %ld>: exiting...\n", tid);
+	}
+	return nullptr;
+}
+
+memload_generator::memload_generator(cpuset_t * threads, cpuset_t * target_domain, struct memload_generator_options * opt, bool *success)
+{
+	*success = false;
+	end.store(0);
+	std::memcpy(&this->opts, opt, sizeof(memload_generator_options));
+
+	int nextcore = CPU_FFS(threads) - 1;
+	int target_domain_id = CPU_FFS(target_domain) - 1;
+	int num_cores = CPU_COUNT(threads);
+	if (target_domain_id < 0 || num_cores == 0) {
+		return;
+	}
+
+	void * local_buffer;
+	void * target_buffer;
+	if (opts.shared_buffer) {
+		local_buffer = nms_malloc(topo_core_to_numa(nextcore), opt->chunk_size);
+		target_buffer = nms_malloc(target_domain_id, opt->chunk_size);
+	}
+
+	int tid = 0;
+	while (nextcore != -1) {
+		auto *info = new struct thread_info;
+		cpuset_t cpuset;
+		pthread_attr_t attr;
+
+		info->stop = &this->end;
+		info->num_trans.store(0);
+		info->begin_ts = 0;
+		info->end_ts = 0;
+		info->opts = &this->opts;
+		if (opt->shared_buffer) {
+			info->from_buffer = local_buffer;
+			info->to_buffer = target_buffer;
+		} else {
+			info->from_buffer = nms_malloc(topo_core_to_numa(nextcore), opt->chunk_size);
+			info->to_buffer = nms_malloc(target_domain_id, opt->chunk_size);
+		}
+
+		CPU_ZERO(&cpuset);
+		CPU_SET(nextcore, &cpuset);
+		pthread_attr_init(&attr);
+		pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &cpuset);
+		pthread_create(&info->pthr, &attr, worker_thrd, info);
+
+		if (opts.verbose) {
+			fprintf(stdout,
+			    "memload_generator: created thread %d on core %d target domain %d\n", tid,
+			    nextcore, target_domain_id);
+		}
+
+		thr_infos.push_back(info);
+
+		CPU_CLR(nextcore, threads);
+		nextcore = CPU_FFS(threads) - 1;
+		tid++;
+	}
+
+	*success = true;
+	if (opts.verbose) {
+		fprintf(stdout,
+		    "memload_generator: exiting constructor. Success: %d...\n",
+		    success ? 1 : 0);
+	}
+}
+
+bool
+memload_generator::check_done()
+{
+	bool done = true;
+	for (auto i : thr_infos) {
+		done = done && (i->num_trans.load() >= this->opts.iteration);
+	}
+	return done;
+}
+
+uint64_t
+memload_generator::get_bps()
+{
+	uint64_t total_transactions = 0;
+	uint64_t total_time = 0;
+	for (auto i : thr_infos) {
+		total_transactions += i->num_trans.load();
+		total_time = (i->end_ts - i->begin_ts) + total_time;
+	}
+	return (double)(total_transactions * this->opts.chunk_size) / ((double)total_time / thr_infos.size() / S2NS);
+}
+
+memload_generator::~memload_generator()
+{
+	for (auto i : thr_infos) {
+		// XXX: free
+		pthread_join(i->pthr, NULL);
+		delete i;
+	}
+}
--- a/libnm/alloc.cc
+++ b/libnm/alloc.cc
@ -1,113 +0,0 @@
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/cpuset.h>
-#include <sys/domainset.h>
-#include <sys/thr.h>
-#include <sys/mman.h>
-
-#include <cerrno>
-
-#include "nmp.hh"
-
-static pthread_mutex_t alloc_lock;
-static constexpr unsigned int MEM_OBJ_SIZE = 4096; // 4k
-static constexpr unsigned int MEM_OBJ_NUM = 1024 * 256; // 4k * 1024 * 256 = 1GB per region
-static constexpr unsigned int MEM_REGION_NUM = 4; // 4 x 1GB = 4GB total
-static int nm_mem_idx[NM_MAX_OBJS_PER_LVL];
-static int nm_mem_region_idx[NM_MAX_OBJS_PER_LVL];
-static void* nm_mem_regions[NM_MAX_OBJS_PER_LVL][MEM_REGION_NUM];
-
-int
-nm_alloc_init()
-{
-    long tid;
-    thr_self(&tid);
-	domainset_t orig_dom;
-	int orig_policy;
-
-	pthread_mutex_init(&alloc_lock, nullptr);
-
-	DOMAINSET_ZERO(&orig_dom);
-
-	// save existing thread's allocation strategy
-	int ret = cpuset_getdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, &orig_policy);
-	if (ret != 0) {
-		return ret;
-	}
-
-	domainset_t tmp_domain;
-	for (int i = 0; i < nm_obj_count(NM_LEVEL_NUMA); i++) {
-		DOMAINSET_ZERO(&tmp_domain);
-		DOMAINSET_SET(i, &tmp_domain);
-		
-		ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(tmp_domain), &tmp_domain, DOMAINSET_POLICY_PREFER);
-		if (ret != 0) {
-			if (nm_get_verbose() > 0) {
-				fprintf(stdout, "libnm: cpuset_setdomain failed with %d\n", errno);
-			}
-			return ret;
-		}
-
-		for (unsigned int j = 0; j < MEM_REGION_NUM; j++) {
-			if ((nm_mem_regions[i][j] = mmap(nullptr, MEM_OBJ_NUM * MEM_OBJ_SIZE, PROT_READ | PROT_WRITE,
-			MAP_ANON | MAP_ALIGNED_SUPER | MAP_NOCORE | MAP_PRIVATE | MAP_NOSYNC,
-			-1, 0)) == MAP_FAILED) {
-				if (nm_get_verbose() > 0) {
-					fprintf(stdout, "libnm: mmap failed with %d\n", errno);
-				}
-				return -1;
-			}
-
-			// touch the pages to prefault the pages
-			for (unsigned int k = 0; k < MEM_OBJ_NUM; k++) {
-				*(uint32_t*)((char*)nm_mem_regions[i][j] + k * MEM_OBJ_SIZE) = 0;
-			}
-
-			if (nm_get_verbose() > 0) {
-				fprintf(stdout, "libnm: reserved %u bytes (%u MB) on node %d. vaddr: 0x%p\n", MEM_OBJ_NUM * MEM_OBJ_SIZE, MEM_OBJ_SIZE * MEM_OBJ_NUM / 1024 / 1024, i, nm_mem_regions[i][j]);
-			}
-		}
-
-		nm_mem_idx[i] = 0;
-		nm_mem_region_idx[i] = 0;
-	}
-
-	// restore existing thread's allocation strategy
-	ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, orig_policy);
-
-	return ret;
-}
-
-void *
-nm_malloc(unsigned int node, size_t size)
-{
-	void * ret = nullptr;
-	int num_objs = (size + MEM_OBJ_SIZE - 1) / MEM_OBJ_SIZE;
-	bool retry = false;
-
-	pthread_mutex_lock(&alloc_lock);
-	int cur_region = nm_mem_region_idx[node];
-	int cur_idx = nm_mem_idx[node];
-
-retry:
-	if ((int)MEM_OBJ_NUM - cur_idx >= num_objs) {
-		ret = (char*)nm_mem_regions[node][cur_region] + MEM_OBJ_SIZE * cur_idx;
-		nm_mem_region_idx[node] = cur_region;
-		nm_mem_idx[node] = cur_idx + num_objs;
-	} else if (!retry && (cur_region < (int)MEM_REGION_NUM)) {
-		// check next region
-		cur_region++;
-		cur_idx = 0;
-		retry = true;
-		goto retry;
-	}
-	pthread_mutex_unlock(&alloc_lock);
-
-	return ret;
-}
-
-void
-nm_free(unsigned int node ATTR_UNUSED, void * addr ATTR_UNUSED)
-{
-	// dummy function
-}
--- a/libnm/loadgen.cc
+++ b/libnm/loadgen.cc
@ -1,217 +0,0 @@
-#include "nmp.hh"
-#include <atomic>
-
-#include <sys/cpuset.h>
-#include <sys/domainset.h>
-#include <sys/endian.h>
-#include <sys/thr.h>
-#include <pthread.h>
-#include <pthread_np.h>
-#include <unistd.h>
-
-void *
-memload_generator::worker_thrd(void *_tinfo)
-{
-	auto *tinfo = (struct thread_info *)_tinfo;
-	long tid;
-	thr_self(&tid);
-
-	if (nm_get_verbose() > 0) {
-		fprintf(stdout,
-		    "memload_generator <thread %ld>: from domain %d to %d\n",
-		    tid, tinfo->from_domainid, tinfo->to_domainid);
-	}
-
-	if (tinfo->from_region == nullptr) {
-    	tinfo->from_region = nm_malloc(tinfo->from_domainid, tinfo->array_size);
-	}
-
-	if (tinfo->to_region == nullptr) {
-    	tinfo->to_region = nm_malloc(tinfo->to_domainid, tinfo->array_size);
-	}
-
-    if (tinfo->from_region == nullptr || tinfo->to_region == nullptr) {
-		tinfo->init_status.store(thread_info::INIT_FAILED);
-		if (nm_get_verbose() > 0) {
-			fprintf(stderr,
-			    "memload_generator <thread %ld>: failed to allocate memory\n", tid);
-		}
-        return nullptr;
-    }
-
-	// populate the region with 1/2/3s
-	memset((char*)tinfo->from_region, 1, tinfo->array_size);
-	memset((char*)tinfo->to_region, 2, tinfo->array_size);
-
-	tinfo->init_status.store(thread_info::INIT_SUCCESS);
-
-	if (nm_get_verbose() > 0) {
-		fprintf(stdout,
-		    "memload_generator <thread %ld>: init finished, from: %p, to: %p, waiting for start...\n",
-		    tid, tinfo->from_region, tinfo->to_region);
-	}
-
-	while (tinfo->state->load() == STATE_READY) {
-	};
-
-	if (nm_get_verbose() > 0) {
-		fprintf(
-		    stdout, "memload_generator <thread %ld>: running...\n", tid);
-	}
-	tinfo->begin_ts = nm_get_uptime_ns();
-	while (tinfo->state->load() == STATE_START) {
-		if (tinfo->num_trans.load() < tinfo->iter) {
-			// generate traffic
-			memcpy((char *)tinfo->to_region, (char *)tinfo->from_region, tinfo->array_size);
-			tinfo->num_trans.fetch_add(1);
-			if (tinfo->num_trans.load() >= tinfo->iter) {
-				tinfo->stop_ts = nm_get_uptime_ns();
-			}
-		} else {
-			usleep(1000 * 100);
-		}
-	}
-
-	//nm_free(tinfo->from_domainid, tinfo->from_region);
-	//nm_free(tinfo->to_domainid, tinfo->to_region);
-
-	if (nm_get_verbose() > 0) {
-		fprintf(
-		    stdout, "memload_generator <thread %ld>: exiting...\n", tid);
-	}
-	return nullptr;
-}
-
-memload_generator::memload_generator(
-    uint64_t from_cmask, uint64_t to_cmask, uint32_t array_sz, uint32_t iteration, bool *success)
-{
-	*success = false;
-	state.store(STATE_READY);
-	this->array_size = array_sz;
-	this->iteration = iteration;
-	int nextcore;
-	int to_coreid = cmask_get_next_cpu(&to_cmask);
-	int num_cores = cmask_get_num_cpus(from_cmask);
-	cpuset_t cpuset;
-
-	if (to_coreid == NEXT_CPU_NULL || num_cores == 0) {
-		return;
-	}
-
-	void * from = nm_malloc(nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, to_coreid), NM_LEVEL_NUMA)), array_sz);
-	nextcore = cmask_get_next_cpu(&from_cmask);
-	void * to = nm_malloc(nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, nextcore), NM_LEVEL_NUMA)), array_sz);
-	while (nextcore != NEXT_CPU_NULL) {
-		auto *info = new struct thread_info;
-		pthread_attr_t attr;
-		info->num_trans.store(0);
-		info->to_domainid = nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, to_coreid), NM_LEVEL_NUMA));
-		info->from_domainid = nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, nextcore), NM_LEVEL_NUMA));
-		info->init_status.store(thread_info::INIT_START);
-		info->state = &state;
-		info->iter = this->iteration;
-		info->array_size = this->array_size;
-
-    	info->from_region = from;
-    	info->to_region = to;
-
-		CPU_ZERO(&cpuset);
-		CPU_SET(nextcore, &cpuset);
-		pthread_attr_init(&attr);
-		pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &cpuset);
-		pthread_create(&info->pthr, &attr, worker_thrd, info);
-
-		if (nm_get_verbose() > 0) {
-			fprintf(stdout,
-			    "memload_generator: created thread on core %d\n",
-			    nextcore);
-		}
-
-		thr_infos.push_back(info);
-		nextcore = cmask_get_next_cpu(&from_cmask);
-	}
-
-	if (nm_get_verbose() > 0) {
-		fprintf(
-		    stdout, "memload_generator: waiting for thread init...\n");
-	}
-
-	bool failed = false;
-	uint num_success = 0;
-	while (num_success < thr_infos.size() && !failed) {
-		num_success = 0;
-		for (auto i : thr_infos) {
-			if (i->init_status.load() ==
-			    thread_info::INIT_SUCCESS) {
-				num_success++;
-			}
-			if (i->init_status.load() == thread_info::INIT_FAILED) {
-				failed = true;
-			}
-		}
-	}
-
-	*success = num_success == thr_infos.size();
-	if (nm_get_verbose() > 0) {
-		fprintf(stdout,
-		    "memload_generator: exiting constructor. Success: %d...\n",
-		    success ? 1 : 0);
-	}
-}
-
-void
-memload_generator::start()
-{
-	if (this->state.load() == STATE_READY) {
-		state.store(STATE_START);
-	}
-}
-
-void
-memload_generator::stop()
-{
-	if (this->state.load() == STATE_START) {
-		state.store(STATE_STOP);
-	}
-}
-
-bool
-memload_generator::check_done()
-{
-	bool done = true;
-	for (auto i : thr_infos) {
-		done = done && (i->num_trans.load() >= this->iteration);
-	}
-	if (done) {
-		stop();
-	}
-	return done;
-}
-
-uint64_t
-memload_generator::get_bps()
-{
-	if (this->state.load() == STATE_STOP) {
-		uint64_t total_transactions = 0;
-		uint64_t total_time = 0;
-		for (auto i : thr_infos) {
-			total_transactions += i->num_trans.load();
-			total_time = (i->stop_ts - i->begin_ts) + total_time;
-		}
-		return (double)(total_transactions * this->array_size) / ((double)total_time / thr_infos.size() / S2NS);
-	} else {
-		return 0;
-	}
-}
-
-memload_generator::~memload_generator()
-{
-	if (this->state.load() != STATE_STOP) {
-		stop();
-	}
-
-	for (auto i : thr_infos) {
-		pthread_join(i->pthr, nullptr);
-		delete i;
-	}
-}
--- a/libnm/nm.cc
+++ b/libnm/nm.cc
@ -1,72 +0,0 @@
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#include <sys/thr.h>
-#include <x86intrin.h>
-
-#include <algorithm>
-#include <atomic>
-#include <vector>
-#include <cerrno>
-
-#include "nmp.hh"
-#include "defs.hh"
-
-static const char *SYSCTL_TSC = "machdep.tsc_freq";
-static uint64_t sysctl_tsc_freq = 0;
-
-static int verbose = 0;
-
-int nm_get_verbose()
-{
-	return verbose;
-}
-
-uint64_t
-nm_get_uptime_ns()
-{
-	unsigned int dummy;
-	_mm_lfence();
-	uint64_t tsc = __rdtscp(&dummy);
-	_mm_lfence();
-	return nm_tsc2ns(tsc);
-}
-
-uint64_t
-nm_tsc2ns(uint64_t tsc)
-{
-	return (uint64_t)(
-	    (double)tsc / (double)sysctl_tsc_freq * S2NS);
-}
-
-// 0 on success
-// -1 on error
-int
-nm_init(int verbosity)
-{
-	int ret;
-	size_t sz = sizeof(sysctl_tsc_freq);
-	verbose = verbosity;
-
-	// init nm_tsc2ns
-	if ((ret = sysctlbyname(
-		 SYSCTL_TSC, &sysctl_tsc_freq, &sz, nullptr, 0)) < 0) {
-		if (nm_get_verbose()) {
-			fprintf(stderr,
-			    "libnm: failed to query tsc frequency via sysctl (%d)\n", errno);
-		}
-		return ret;
-	}
-
-	if (nm_get_verbose()) {
-		fprintf(stdout, "libnm: tsc frequency: %lu\n", sysctl_tsc_freq);
-	}
-
-	ret = nm_topo_init();
-	if (ret != 0) {
-		return ret;
-	}
-
-	ret = nm_alloc_init();
-
-	return ret;
-}
--- a/libnm/nmp.hh
+++ b/libnm/nmp.hh
@ -1,9 +0,0 @@
-#pragma once
-
-#include "nm.hh"
-
-int nm_topo_init();
-
-int nm_alloc_init();
-
-int nm_get_verbose();
--- a/libnm/topo.cc
+++ b/libnm/topo.cc
@ -1,204 +0,0 @@
-#include "nmp.hh"
-
-#include <cstdio>
-#include <cstdlib>
-#include <hwloc.h>
-
-constexpr static int NM_MAX_CHILDREN = 128;
-
-struct nm_obj {
-	int level;
-	int id;
-	struct nm_obj *parent;
-	
-	int num_children;
-	struct nm_obj * children[NM_MAX_CHILDREN];
-};
-
-static int size_tbl[NM_MAX_LEVEL] = { 0 };
-static struct nm_obj * obj_tbl[NM_MAX_LEVEL][NM_MAX_OBJS_PER_LVL];
-
-static hwloc_obj_t
-get_parent_type(hwloc_obj_t obj, hwloc_obj_type_t type)
-{
-	obj = obj->parent;
-	while (obj != nullptr) {
-		if (obj->type == type) {
-			return obj;
-		}
-		obj = obj->parent;
-	}
-	return nullptr;
-}
-
-// static int
-// obj_comparator(const void * a, const void * b)
-// {
-// 	return ((struct nm_obj *)a)->id - ((struct nm_obj *)b)->id;
-// }
-
-static inline int 
-is_level_valid(int level) 
-{
-	return level < (int)NM_MAX_LEVEL;
-}
-
-int
-nm_obj_count(int level)
-{
-	if (is_level_valid(level)) {
-		return size_tbl[level];
-	}
-	return -1;
-}
-
-struct nm_obj *
-nm_obj_from_id(int level, int id)
-{
-	if (is_level_valid(level) && id <= size_tbl[level]) {
-		return obj_tbl[level][id];
-	}
-	return nullptr;
-}
-
-struct nm_obj *
-nm_obj_find_parent(struct nm_obj * start, int parent_level)
-{
-    struct nm_obj * ret = nullptr;
-    while (start != nullptr) {
-        if (parent_level == start->level) {
-            ret = start;
-            break;
-        }
-        start = start->parent;
-    }
-    return ret;
-}
-
-int
-nm_obj_get_id(struct nm_obj * obj)
-{
-    return obj->id;
-}
-
-int
-nm_obj_get_level(struct nm_obj * obj)
-{
-    return obj->level;
-}
-
-static int
-validate_objs(int level)
-{
-	for (int i = 0; i < size_tbl[level]; i++) {
-		struct nm_obj * each = obj_tbl[level][i];
-		if (each->id != i) {
-			return 0;
-		}	
-	}
-	return 1;
-}
-
-static int
-add_obj(int id, int level, struct nm_obj * parent)
-{
-	if (size_tbl[level] >= NM_MAX_OBJS_PER_LVL) {
-
-		return -1;
-	}
-
-	auto each = (struct nm_obj *)malloc(sizeof(struct nm_obj));
-
-	each->id = id;
-	each->level = level;
-	each->num_children = 0;
-	each->parent = parent;
-
-	if (parent != nullptr) {
-		// add children
-		if (parent->num_children >= NM_MAX_CHILDREN) {
-			return -1;
-		} else {
-			parent->children[parent->num_children] = each;
-			parent->num_children++;
-		}
-	}
-
-	obj_tbl[level][size_tbl[level]] = each;
-	size_tbl[level]++;
-
-	return 0;
-}
-
-static int
-add_level(hwloc_topology * topo, hwloc_obj_type_t hwloc_level, hwloc_obj_type_t hwloc_plevel, int level, int plevel)
-{
-	// populate numa nodes
-	hwloc_obj_t obj = nullptr;
-	hwloc_obj_t pobj = nullptr;
-	struct nm_obj *parent = nullptr;
-	while (true) {
-
-		obj = hwloc_get_next_obj_by_type(topo, hwloc_level, obj);
-		if (obj == nullptr) {
-			break;
-		}
-
-		pobj = get_parent_type(obj, hwloc_plevel);
-
-		if (pobj != nullptr) {
-			parent = obj_tbl[plevel][pobj->logical_index];
-		}
-
-		if (add_obj(obj->logical_index, level, parent) != 0) {
-			if (nm_get_verbose() > 0) {
-				fprintf(stderr, "libnm: failed to add object %d.\n", obj->logical_index);
-			}
-			return -1;
-		}
-
-		if (nm_get_verbose() > 0) {
-			fprintf(stdout, "libnm: identified id %d type %d parent %d type %d\n", obj->logical_index, level, parent == nullptr ? -1 : parent->id, plevel);
-		}
-	}
-
-	// sort
-	// std::qsort(obj_tbl[level], size_tbl[level], sizeof(struct nm_obj), obj_comparator);
-	if (!validate_objs(level)) {
-		if (nm_get_verbose() > 0) {
-			fprintf(stdout, "libnm: objects are shuffled at level %d.\n", level);
-		}
-		return -1;
-	}
-
-	return 0;
-}
-
-int
-nm_topo_init()
-{
-	int ret;
-
-	// init numa stuff
-	hwloc_topology *topo;
-	if ((ret = hwloc_topology_init(&topo)) != 0) {
-		return ret;
-	}
-
-	if ((ret = hwloc_topology_load(topo)) != 0)
-		return ret;
-
-	if ((ret = add_level(topo, HWLOC_OBJ_PACKAGE, HWLOC_OBJ_PACKAGE, NM_LEVEL_NUMA, NM_LEVEL_NUMA)) != 0) {
-		return ret;
-	}
-
-	if ((ret = add_level(topo, HWLOC_OBJ_CORE, HWLOC_OBJ_PACKAGE, NM_LEVEL_CPU, NM_LEVEL_NUMA)) != 0) {
-		return ret;
-	}
-
-	if ((ret = add_level(topo, HWLOC_OBJ_PU, HWLOC_OBJ_CORE, NM_LEVEL_CORE, NM_LEVEL_CPU)) != 0) {
-		return ret;
-	}
-
-	return ret;
-}
--- a/libnms/alloc.c
+++ b/libnms/alloc.c
@ -0,0 +1,196 @@
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/cpuset.h>
+#include <sys/domainset.h>
+#include <sys/thr.h>
+#include <sys/mman.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdatomic.h>
+#include <string.h>
+#include <assert.h>
+
+#include <nms.h>
+
+#define MAX_NUMA_DOMAINS (64)
+#define MAX_REGIONS (64)
+#define REGION_SIZE (1024 * 1024 * 1024)
+#define MALLOC_UNIT (4096)
+
+struct nms_region {
+	uintptr_t start_addr;
+	size_t size;
+	size_t occupied;
+};
+
+struct nms_desc {
+    // alloc
+    pthread_mutex_t alloc_lock;
+
+	struct nms_region regions[MAX_NUMA_DOMAINS][MAX_REGIONS];
+	int region_sz[MAX_NUMA_DOMAINS];
+};
+
+static _Atomic(int) initialized = 0;
+static struct nms_desc g_desc;
+
+static void *
+nms_alloc_region(int nodeid, size_t sz)
+{
+	long tid;
+	domainset_t orig_dom;
+	int orig_policy;
+	void * region;
+
+    thr_self(&tid);
+	DOMAINSET_ZERO(&orig_dom);
+
+	// save existing thread's allocation strategy
+	int ret = cpuset_getdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, &orig_policy);
+	if (ret != 0) {
+		fprintf(stderr, "libnms: cpuset_getdomain failed with %d\n", errno);
+		return NULL;
+	}
+
+	domainset_t tmp_domain;
+	DOMAINSET_ZERO(&tmp_domain);
+	DOMAINSET_SET(nodeid, &tmp_domain);
+
+	ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(tmp_domain), &tmp_domain, DOMAINSET_POLICY_ROUNDROBIN);
+	if (ret != 0) {
+		fprintf(stderr, "libnms: cpuset_setdomain failed with %d\n", errno);
+		return NULL;
+	}
+
+	if ((region = mmap(NULL, REGION_SIZE, PROT_READ | PROT_WRITE, MAP_ANON | MAP_ALIGNED_SUPER | MAP_NOCORE | MAP_PRIVATE | MAP_NOSYNC, -1, 0)) == MAP_FAILED) {
+		fprintf(stderr, "libnms: mmap failed with %d\n", errno);
+		return NULL;
+	}
+
+	// touch the pages to prefault the pages
+	for (size_t i = 0; i < REGION_SIZE; i += MALLOC_UNIT) {
+		*(uint8_t*)((uintptr_t)region + i) = 0;
+	}
+
+	// restore existing thread's allocation strategy
+	ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, orig_policy);
+	if (ret != 0) {
+		fprintf(stderr, "libnms: cpuset_setdomain failed with %d\n", errno);
+		munmap(region, REGION_SIZE);
+		return NULL;
+	}
+
+	return region;
+}
+
+static int
+nms_desc_init(struct nms_desc * desc, int verbose)
+{
+	memset(desc, 0, sizeof(struct nms_desc));
+	pthread_mutex_init(&desc->alloc_lock, NULL);
+	return 0;
+}
+
+static void *
+nms_region_malloc(struct nms_region * region, size_t size)
+{
+	void * ret = NULL;
+	if (region->size >= region->occupied + size) {
+		ret = (void *)(region->start_addr + region->occupied);
+		region->occupied += size;
+		region->occupied = (region->occupied + MALLOC_UNIT - 1) & ~(MALLOC_UNIT - 1);
+	}
+	return ret;
+}
+
+static int
+nms_desc_add_region(struct nms_desc * desc, int nodeid, size_t size)
+{
+	void * ret;
+	int idx;
+
+	ret = nms_alloc_region(nodeid, REGION_SIZE);
+	if (ret == NULL) {
+		fprintf(stderr, "libnms: failed to allocate region on node %d\n", nodeid);
+		return ENOMEM;
+	}
+
+	desc->region_sz[nodeid]++;
+	idx = desc->region_sz[nodeid] - 1;
+	desc->regions[nodeid][idx].start_addr = (uintptr_t)ret;
+	desc->regions[nodeid][idx].occupied = 0;
+	desc->regions[nodeid][idx].size = REGION_SIZE;
+
+	return 0;
+}
+
+static void *
+nms_desc_malloc(struct nms_desc * desc, unsigned int nodeid, size_t size)
+{
+	void * ret = NULL;
+	int idx;
+	int new_region = 0;
+
+	if (size > REGION_SIZE) {
+		return NULL;
+	}
+
+	pthread_mutex_lock(&desc->alloc_lock);
+
+retry:
+	if (desc->region_sz[nodeid] > 0) {
+		idx = desc->region_sz[nodeid] - 1;
+		ret = nms_region_malloc(&desc->regions[nodeid][idx], size);
+	}
+	
+	if (ret == NULL) {
+		// we need a new region
+		if (nms_desc_add_region(desc, nodeid, REGION_SIZE) != 0) {
+			pthread_mutex_unlock(&desc->alloc_lock);
+			return NULL;
+		}
+		fprintf(stdout, "libnms: request of size %zu -> allocated new region on node %d\n", size, nodeid);
+		goto retry;
+	}
+
+	pthread_mutex_unlock(&desc->alloc_lock);
+	return ret;
+}
+
+static void
+nms_desc_free(struct nms_desc * desc __attribute__((unused)), unsigned int node __attribute__((unused)), void * addr __attribute__((unused)))
+{
+	// dummy function
+}
+
+int
+nms_init(int verbose)
+{	
+	int expected = 0;
+	if (atomic_compare_exchange_strong(&initialized, &expected, 2)) {
+		nms_desc_init(&g_desc, verbose);
+		atomic_store(&initialized, 1);
+	} else {
+		while(atomic_load(&initialized) != 1) {
+		}
+		fprintf(stdout,"libnms: already initialized.\n");
+	}
+
+	return 0;
+}
+
+void *
+nms_malloc(int nodeid, size_t sz)
+{
+	assert(atomic_load(&initialized) == 1);
+	return nms_desc_malloc(&g_desc, nodeid, sz);
+}
+
+void
+nms_free(int nodeid, void * addr)
+{
+	assert(atomic_load(&initialized) == 1);
+	nms_desc_free(&g_desc, nodeid, addr);
+}
+
--- a/net/cat.cc
+++ b/net/cat.cc
@ -1,3 +1,12 @@
+#include <sys/_timespec.h>
+#include <atomic>
+#include <cstdlib>
+#include <ctime>
+#include <fstream>
+#include <random>
+#include <vector>
+
+#include <topo.h>
 #include <rte_byteorder.h>
 #include <rte_common.h>
 #include <rte_config.h>
@ -10,29 +19,16 @@
 #include <rte_mbuf.h>
 #include <unistd.h>

-#include "gen.hh"
-#include "nm.hh"
 #include "ntr.h"
+#include "gen.hh"
+#include "net/netsup.hh"
 #include "net/pkt.hh"
-#include "net/util.hh"
+#include "nms.h"

-#include <atomic>
-#include <ctime>
-#include <fstream>
-#include <random>
-#include <vector>
-
-#define MBUF_MAX_COUNT (rte_lcore_count() * 4096)
-constexpr static unsigned int MBUF_CACHE_SIZE = 512;
-constexpr static unsigned int RX_RING_SIZE = 1024;
-constexpr static unsigned int TX_RING_SIZE = 1024;
 constexpr static unsigned int BURST_SIZE = 32;
 constexpr static unsigned int MAX_SLAVES = 32;
 constexpr static unsigned int SLAVES_MAX_WAIT_MS = 1000;

-static const struct rte_eth_conf port_conf_default {
-};
-
 struct datapt {
 	uint32_t epoch;
 	uint32_t valid;
@ -59,23 +55,21 @@ struct options_t {
 	char ia_gen_str[256] = "fixed";
 	unsigned int target_qps { 0 };
 	unsigned int master_mode { 0 };
-	struct net_spec server_spec {
-	};
+	struct net_spec server_spec { };
 	cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 2nd  core
 	std::vector<struct net_spec *> slaves;
 	uint32_t pkt_loss_failure_threshold { 0 };
 	uint32_t pkt_loss_time_ms { UINT32_MAX };
+	int portid { 0 };

 	// states
-	struct rte_mempool *mbuf_pool { nullptr };
-	struct net_spec s_host_spec {
-	};
+	struct net_spec s_host_spec { };
 	struct conn_spec s_host_conn {
 		.src = &s_host_spec, .dst = &server_spec, .dst_port = POU_PORT
 	};
-	uint16_t s_portid { 0 };
 	unsigned int s_rxqid { 0 };
 	unsigned int s_txqid { 0 };
+	unsigned int s_socketid { 0 };
 	// for qps calculation
 	std::atomic<uint32_t> s_recved_pkts { 0 };
 	std::atomic<uint32_t> s_pkt_loss { 0 };
@ -97,14 +91,13 @@ struct options_t {
 static struct options_t options;

 static uint16_t
-rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
+rx_add_timestamp(uint16_t port, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused,
    void *_ __rte_unused)
 {
-	uint64_t now = nm_tsc2ns(rte_rdtsc());
+	uint64_t now = topo_uptime_ns();
 	struct pkt_hdr *pkt_data;
-	struct timespec ts {
-	};
+	struct timespec ts { };
 	int ret;

 	if (options.s_state != STATE_SENT) {
@ -112,8 +105,8 @@ rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
 	}

 	for (int i = 0; i < nb_pkts; i++) {
-		pkt_data = check_valid_packet(
-		    pkts[i], &options.s_host_spec.mac_addr);
+		pkt_data = check_valid_packet(pkts[i],
+		    &options.s_host_spec.mac_addr);

 		if (pkt_data == nullptr) {
 			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
@ -165,16 +158,16 @@ static uint16_t
 tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
 {
-	uint64_t now = nm_tsc2ns(rte_rdtsc());
+	uint64_t now = topo_uptime_ns();
 	struct pkt_hdr *pkt_data;

-	if (options.s_state != STATE_SENT) {
-		return nb_pkts;
-	}
+	// if (options.s_state != STATE_SENT) {
+	// 	return nb_pkts;
+	// }

 	for (int i = 0; i < nb_pkts; i++) {
-		pkt_data = check_valid_packet(
-		    pkts[i], &options.s_host_spec.mac_addr);
+		pkt_data = check_valid_packet(pkts[i],
+		    &options.s_host_spec.mac_addr);

 		if (pkt_data == nullptr) {
 			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
@ -214,6 +207,7 @@ static void
 send_all_slaves(uint16_t type)
 {
 	struct rte_mbuf *tx_bufs[MAX_SLAVES];
+	//struct rte_eth_stats stats;

 	struct conn_spec cspec;
 	cspec.src = &options.s_host_spec;
@ -224,13 +218,18 @@ send_all_slaves(uint16_t type)
 	for (unsigned int i = 0; i < options.slaves.size(); i++) {
 		struct pkt_hdr *hdr;
 		cspec.dst = options.slaves.at(i);
-		if (alloc_pkt_hdr(options.mbuf_pool, type, &cspec, 0, &tx_bufs[i],
-			&hdr) != 0) {
+		if (alloc_pkt_hdr(mempool_get(options.s_socketid), type, &cspec, 0,
+			&tx_bufs[i], &hdr) != 0) {
 			rte_exit(EXIT_FAILURE, "failed to alloc packet\n");
 		}
 	}
 	
-	if (rte_eth_tx_burst(options.s_portid, options.s_txqid, tx_bufs,
+	// if (rte_eth_stats_get(options.portid, &stats) != 0 ) {
+	// 	rte_exit(EXIT_FAILURE, "failed!");
+	// }
+	// printf("send_all_slaves: ipackets %lu, opackets %lu, ierrors %lu, oerrors %lu\n", stats.ipackets, stats.opackets, stats.ierrors, stats.oerrors);
+
+	if (rte_eth_tx_burst(options.portid, options.s_txqid, tx_bufs,
 		options.slaves.size()) != options.slaves.size()) {
 		rte_exit(EXIT_FAILURE, "failed to send some packets\n");
 	}
@ -243,14 +242,14 @@ wait_for_slaves(uint16_t etype, struct rte_mbuf **out)
 {
 	struct rte_mbuf *tx_bufs[MAX_SLAVES];
 	bool stop = false;
-	const uint64_t start = nm_get_uptime_ns();
+	const uint64_t start = topo_uptime_ns();
 	std::vector<struct rte_ether_addr *> recved;
 	uint32_t tot = 0;

 	while (!stop) {
-		uint64_t now = nm_get_uptime_ns();
-		const uint16_t nb_rx = rte_eth_rx_burst(
-		    options.s_portid, options.s_rxqid, tx_bufs, MAX_SLAVES);
+		uint64_t now = topo_uptime_ns();
+		const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
+		    options.s_rxqid, tx_bufs, MAX_SLAVES);

 		if (nb_rx > 0) {
 			for (unsigned int i = 0; i < nb_rx; i++) {
@ -275,7 +274,7 @@ wait_for_slaves(uint16_t etype, struct rte_mbuf **out)
 						if (rte_is_same_ether_addr(
 							&eaddr->mac_addr,
 							&each->eth_hdr
-							     .s_addr)) {
+							     .src_addr)) {
 							invalid = false;
 							break;
 						}
@ -298,7 +297,7 @@ wait_for_slaves(uint16_t etype, struct rte_mbuf **out)
 						if (rte_is_same_ether_addr(
 							eaddr,
 							&each->eth_hdr
-							     .s_addr)) {
+							     .src_addr)) {
 							invalid = true;
 							break;
 						}
@ -314,7 +313,8 @@ wait_for_slaves(uint16_t etype, struct rte_mbuf **out)
 						goto end_loop;
 					}

-					recved.push_back(&each->eth_hdr.s_addr);
+					recved.push_back(
+					    &each->eth_hdr.src_addr);

 					if (recved.size() ==
 					    options.slaves.size()) {
@ -337,9 +337,15 @@ wait_for_slaves(uint16_t etype, struct rte_mbuf **out)
 			}
 		}

+		// struct rte_eth_stats stats;
+		// if (rte_eth_stats_get(options.portid, &stats) != 0 ) {
+		// 	rte_exit(EXIT_FAILURE, "failed!");
+		// }
+		//printf("wait_slaves <AFTER>: ipackets %lu, opackets %lu, ierrors %lu, oerrors %lu\n", stats.ipackets, stats.opackets, stats.ierrors, stats.oerrors);
+
 		if (now - start > SLAVES_MAX_WAIT_MS * MS2NS) {
-			rte_exit(
-			    EXIT_FAILURE, "cat: waiting for too long %d. I QUIT!!", etype);
+			rte_exit(EXIT_FAILURE,
+			    "cat: waiting for too long %d. I QUIT!!", etype);
 		}
 	}
 }
@ -356,25 +362,25 @@ pkt_loop()
 	bool recv_stat = true;
 	bool recv_resp = true;

-	if (rte_eth_dev_socket_id(options.s_portid) > 0 &&
-	    rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) {
+	if (rte_eth_dev_socket_id(options.portid) > 0 &&
+	    rte_eth_dev_socket_id(options.portid) != (int)rte_socket_id()) {
 		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
 		    "locore_main: WARNING, port %d is on remote NUMA node to "
 		    "polling thread.\n\tPerformance will "
 		    "not be optimal.\n",
-		    options.s_portid);
+		    options.portid);
 	}

-	uint64_t next_ts = nm_get_uptime_ns();
+	uint64_t next_ts = topo_uptime_ns();
 	uint64_t last_send_ts = next_ts;
 	bool is_last_pkt_lost = false;
 	uint32_t num_cts_pkt_lost = 0;

 	while (!options.s_stop.load()) {
-		uint64_t now = nm_get_uptime_ns();
+		uint64_t now = topo_uptime_ns();
 		// always pop incoming packets
-		const uint16_t nb_rx = rte_eth_rx_burst(
-		    options.s_portid, options.s_rxqid, rx_bufs, BURST_SIZE);
+		const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
+		    options.s_rxqid, rx_bufs, BURST_SIZE);

 		if (nb_rx > 0) {
 			for (int i = 0; i < nb_rx; i++) {
@ -398,7 +404,7 @@ pkt_loop()

 				uint16_t type = rte_be_to_cpu_16(each->type);
 				NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, each,
-				    "locore_main: ");
+				    "locore_main: received packet %p ", each);
 				struct pkt_payload_epoch *pld_epoch;
 				struct pkt_payload_stat *pld_stat;
 				uint32_t epoch;
@ -409,6 +415,8 @@ pkt_loop()
 					epoch = rte_be_to_cpu_32(
 					    pld_epoch->epoch);

+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "lcore_main: PROBE_RESP received packet %p epoch %d\n", each, epoch);
+
 					if (options.s_last_datapt == nullptr ||
 					    epoch !=
 						options.s_last_datapt->epoch) {
@ -466,9 +474,10 @@ pkt_loop()
 		if (options.s_state == STATE_SENT) {
 			// check if hw ts is read
 			if (!read_tx) {
+				int ret;
 				struct timespec ts;
-				if (rte_eth_timesync_read_tx_timestamp(
-					options.s_portid, &ts) == 0) {
+				if ((ret = rte_eth_timesync_read_tx_timestamp(
+					options.portid, &ts)) == 0) {
 					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
 					    "locore_main: read hw tx timestamp %lu.\n",
 					    (ts.tv_nsec + ts.tv_sec * S2NS));
@ -553,7 +562,7 @@ pkt_loop()
 				    S2NS);

 				options.s_host_conn.src_port = port_gen.next();
-				if (alloc_pkt_hdr(options.mbuf_pool,
+				if (alloc_pkt_hdr(mempool_get(options.s_socketid),
 					PKT_TYPE_PROBE, &options.s_host_conn, 0,
 					&tx_buf, &pkt_data) != 0) {
 					rte_exit(EXIT_FAILURE,
@ -570,24 +579,25 @@ pkt_loop()
 				options.s_last_datapt->valid =
 				    options.s_record.load();

-				read_tx = false;
-				recv_resp = false;
-				recv_stat = false;
 				last_send_ts = now;
-				options.s_state = STATE_SENT;
 				
 				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
-				    "locore_main: sending packet %p with epoch %d\n",
+				    "locore_main: sending packet 0x%p with epoch %d\n",
 				    (void *)tx_buf, epoch);
-				const uint16_t nb_tx = rte_eth_tx_burst(
-				    options.s_portid, options.s_txqid, &tx_buf,
-				    1);
+				const uint16_t nb_tx =
+				    rte_eth_tx_burst(options.portid,
+					options.s_txqid, &tx_buf, 1);

 				if (nb_tx != 1) {
 					rte_exit(EXIT_FAILURE,
 					    "failed to send packet 0x%p, epoch %d\n",
 					    (void *)tx_buf, epoch);
 				}
+
+				read_tx = false;
+				recv_resp = false;
+				recv_stat = false;
+				options.s_state = STATE_SENT;
 			}
 		}
 	}
@ -611,9 +621,9 @@ locore_main(void *tif __rte_unused)
 		wait_for_slaves(PKT_TYPE_SYNC_ACK, nullptr);
 	}

-	options.s_start_time.store(nm_get_uptime_ns());
+	options.s_start_time.store(topo_uptime_ns());
 	pkt_loop();
-	options.s_end_time.store(nm_get_uptime_ns());
+	options.s_end_time.store(topo_uptime_ns());

 	if (options.master_mode == 1) {
 		ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
@ -627,8 +637,8 @@ locore_main(void *tif __rte_unused)
 		for (unsigned int i = 0; i < options.slaves.size(); i++) {
 			// these packets already underwent validity check in
 			// wait_for_slaves
-			auto pkt_hdr = rte_pktmbuf_mtod(
-			    mbufs[i], struct pkt_hdr *);
+			auto pkt_hdr = rte_pktmbuf_mtod(mbufs[i],
+			    struct pkt_hdr *);
 			auto pld_qps = (struct pkt_payload_qps *)
 					   pkt_hdr->payload;
 			uint32_t qps = rte_be_to_cpu_32(pld_qps->qps);
@ -650,95 +660,6 @@ locore_main(void *tif __rte_unused)
 	return 0;
 }

-static int
-port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
-{
-	struct rte_eth_dev_info dev_info {
-	};
-	struct rte_eth_conf port_conf = port_conf_default;
-	struct rte_eth_txconf txconf {
-	};
-	struct rte_eth_rxconf rxconf {
-	};
-
-	uint16_t nb_rxd = RX_RING_SIZE;
-	uint16_t nb_txd = TX_RING_SIZE;
-
-	if (!rte_eth_dev_is_valid_port(portid)) {
-		return -1;
-	}
-
-	int ret = rte_eth_dev_info_get(portid, &dev_info);
-	if (ret != 0) {
-		return ret;
-	}
-
-	port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN;
-	port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
-	port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH;
-	port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM;
-	port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM;
-	port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
-	port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
-	port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;
-
-	/* Configure the Ethernet device. */
-	ret = rte_eth_dev_configure(portid, 1, 1, &port_conf);
-	if (ret != 0)
-		return ret;
-
-	ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
-	if (ret != 0)
-		return ret;
-
-	/* Allocate and set up 1 RX queue per thread . */
-	rxconf = dev_info.default_rxconf;
-	rxconf.offloads = port_conf.rxmode.offloads;
-	for (uint32_t i = 0; i < 1; i++) {
-		ret = rte_eth_rx_queue_setup(portid, options.s_rxqid, nb_rxd,
-		    rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool);
-		if (ret < 0)
-			return ret;
-	}
-
-	txconf = dev_info.default_txconf;
-	txconf.offloads = port_conf.txmode.offloads;
-	/* Allocate and set up 1 TX queue per Ethernet port. */
-	for (uint32_t i = 0; i < 1; i++) {
-		ret = rte_eth_tx_queue_setup(portid, options.s_txqid, nb_txd,
-		    rte_eth_dev_socket_id(portid), &txconf);
-		if (ret < 0)
-			return ret;
-	}
-
-	ret = rte_eth_dev_start(portid);
-	if (ret < 0)
-		return ret;
-
-	/* Display the port MAC address. */
-	struct rte_ether_addr addr {
-	};
-	ret = rte_eth_macaddr_get(portid, &addr);
-	if (ret != 0)
-		return ret;
-
-	ret = rte_eth_timesync_enable(portid);
-	if (ret != 0)
-		return ret;
-
-	/* Enable RX in promiscuous mode for the Ethernet device. */
-	ret = rte_eth_promiscuous_enable(portid);
-	if (ret != 0)
-		return ret;
-
-	rte_eth_add_tx_callback(
-	    portid, options.s_rxqid, tx_add_timestamp, nullptr);
-	rte_eth_add_rx_callback(
-	    portid, options.s_txqid, rx_add_timestamp, nullptr);
-
-	return 0;
-}
-
 static void
 dump_options()
 {
@ -753,11 +674,13 @@ dump_options()
 	    "    target qps = %d\n"
 	    "    host IP = 0x%x\n"
 	    "    pkt loss time = %u\n"
-	    "    pkt loss failure threshold = %u\n",
+	    "    pkt loss failure threshold = %u\n"
+	    "    portid = %d\n",
 	    ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.run_time,
 	    options.warmup_time, options.output, CPU_COUNT(&options.cpu_set),
 	    options.ia_gen_str, options.target_qps, options.s_host_spec.ip,
-	    options.pkt_loss_time_ms, options.pkt_loss_failure_threshold);
+	    options.pkt_loss_time_ms, options.pkt_loss_failure_threshold,
+	    options.portid);

 	for (auto slave : options.slaves) {
 		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
@ -794,8 +717,6 @@ usage()
 int
 main(int argc, char *argv[])
 {
-	unsigned int nb_ports;
-	struct rte_mempool *mbuf_pool;
 	std::ofstream log_file;
 	bool has_host_spec = false;

@ -816,7 +737,7 @@ main(int argc, char *argv[])
 		int c;
 		// parse arguments
 		struct net_spec *ns;
-		while ((c = getopt(argc, argv, "vs:S:t:T:ho:A:i:q:H:L:l:")) !=
+		while ((c = getopt(argc, argv, "vs:S:t:T:ho:A:i:q:H:L:l:p:")) !=
 		    -1) {
 			switch (c) {
 			case 'v':
@ -824,8 +745,8 @@ main(int argc, char *argv[])
 				    ntr_get_level(NTR_DEP_USER1) + 1);
 				break;
 			case 's':
-				if (str_to_netspec(
-					optarg, &options.server_spec) != 0) {
+				if (str_to_netspec(optarg,
+					&options.server_spec) != 0) {
 					rte_exit(EXIT_FAILURE,
 					    "invalid server net spec.\n");
 				}
@ -839,16 +760,16 @@ main(int argc, char *argv[])
 				options.slaves.push_back(ns);
 				options.master_mode = 1;
 				if (options.slaves.size() > MAX_SLAVES) {
-					rte_exit(
-					    EXIT_FAILURE, "too many rats.\n");
+					rte_exit(EXIT_FAILURE,
+					    "too many rats.\n");
 				}
 				break;
 			case 't':
 				options.run_time = strtol(optarg, nullptr, 10);
 				break;
 			case 'T':
-				options.warmup_time = strtol(
-				    optarg, nullptr, 10);
+				options.warmup_time = strtol(optarg, nullptr,
+				    10);
 				break;
 			case 'h':
 				usage();
@ -865,32 +786,35 @@ main(int argc, char *argv[])
 				    sizeof(options.ia_gen_str) - 1);
 				break;
 			case 'q':
-				options.target_qps = strtoul(
-				    optarg, nullptr, 10);
+				options.target_qps = strtoul(optarg, nullptr,
+				    10);
 				break;
 			case 'H':
 				has_host_spec = true;
-				if (str_to_netspec(
-					optarg, &options.s_host_spec) != 0) {
+				if (str_to_netspec(optarg,
+					&options.s_host_spec) != 0) {
 					rte_exit(EXIT_FAILURE,
 					    "invalid host net spec.\n");
 				}
 				break;
 			case 'L':
-				options.pkt_loss_failure_threshold = strtoul(
-				    optarg, nullptr, 10);
+				options.pkt_loss_failure_threshold =
+				    strtoul(optarg, nullptr, 10);
 				break;
 			case 'l':
-				options.pkt_loss_time_ms = strtoul(
-				    optarg, nullptr, 10);
+				options.pkt_loss_time_ms = strtoul(optarg,
+				    nullptr, 10);
 				if (options.pkt_loss_time_ms == 0) {
 					options.pkt_loss_time_ms = UINT32_MAX;
 				}
 				break;
+			case 'p':
+				options.portid = strtol(optarg, nullptr, 10);
+				break;
 			default:
 				usage();
-				rte_exit(
-				    EXIT_FAILURE, "unknown argument: %c\n", c);
+				rte_exit(EXIT_FAILURE, "unknown argument: %c\n",
+				    c);
 			}
 		}
 	}
@ -899,13 +823,68 @@ main(int argc, char *argv[])
 		rte_exit(EXIT_FAILURE, "must specify host IP\n");
 	}

-	// init nm
-	if (nm_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
-		rte_exit(EXIT_FAILURE, "nm init failed!\n");
+	// init libtopo
+	if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
+	    0) {
+		rte_exit(EXIT_FAILURE, "libtopo init failed!\n");
 	}

+	// init nms
+	if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
+		rte_exit(EXIT_FAILURE, "failed to init libnms!\n");
+	}
+
+	if (CPU_COUNT(&options.cpu_set) != 1) {
+		rte_exit(EXIT_FAILURE, "must specify exactly one core\n");
+	}
+	int core_id = CPU_FFS(&options.cpu_set) - 1;
+	
 	dump_options();

+	// configure memory and port
+	struct port_conf pconf;
+	struct device_conf dconf;
+	struct mem_conf mconf;
+	portconf_get(options.portid, &pconf);
+
+	dconf.mtu = MAX_STANDARD_MTU;
+	dconf.num_threads = 1;
+	dconf.portid = options.portid;
+	dconf.rss_hf = pconf.rss_hf;
+	dconf.rx_offloads = pconf.rxoffload;
+	dconf.tx_offloads = pconf.txoffload;
+	dconf.timesync = pconf.timesync;
+
+	dconf.rx_fn = rx_add_timestamp;
+	dconf.rx_user = nullptr;
+	dconf.rx_ring_sz = 2048;
+	dconf.tx_fn = tx_add_timestamp;
+	dconf.tx_user = nullptr;
+	dconf.tx_ring_sz = 2048;
+
+	mconf.cache_size = 64;
+	mconf.priv_size = 0;
+	mconf.num_elements = 4096;
+	mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_STANDARD_MTU;
+	mconf.max_pools = -1;
+
+	dpdk_init(&dconf, &mconf);
+
+	if (rte_eth_macaddr_get(options.portid,
+		&options.s_host_spec.mac_addr) != 0) {
+		rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
+		    options.portid);
+	}
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+	    "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n",
+	    options.portid, options.s_host_spec.mac_addr.addr_bytes[0],
+	    options.s_host_spec.mac_addr.addr_bytes[1],
+	    options.s_host_spec.mac_addr.addr_bytes[2],
+	    options.s_host_spec.mac_addr.addr_bytes[3],
+	    options.s_host_spec.mac_addr.addr_bytes[4],
+	    options.s_host_spec.mac_addr.addr_bytes[5]);
+
 	// create default generator
 	options.s_iagen = createGenerator(options.ia_gen_str);
 	if (options.s_iagen == nullptr) {
@ -921,50 +900,6 @@ main(int argc, char *argv[])
 		    options.output);
 	}

-	nb_ports = rte_eth_dev_count_avail();
-	if (nb_ports == 0) {
-		rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
-	}
-
-	uint16_t portid = rte_eth_find_next(0);
-	if (portid == RTE_MAX_ETHPORTS) {
-		rte_exit(EXIT_FAILURE, "cannot find an available port\n");
-	}
-	options.s_portid = portid;
-
-	if (rte_eth_macaddr_get(portid, &options.s_host_spec.mac_addr) != 0) {
-		rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
-		    portid);
-	}
-
-	// create a mbuf memory pool on the socket
-	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT,
-	    MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
-	    rte_eth_dev_socket_id(options.s_portid));
-	if (mbuf_pool == nullptr) {
-		rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
-	}
-	options.mbuf_pool = mbuf_pool;
-
-	if (port_init(portid, mbuf_pool) != 0) {
-		rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid);
-	}
-
-	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
-	    "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid,
-	    options.s_host_spec.mac_addr.addr_bytes[0],
-	    options.s_host_spec.mac_addr.addr_bytes[1],
-	    options.s_host_spec.mac_addr.addr_bytes[2],
-	    options.s_host_spec.mac_addr.addr_bytes[3],
-	    options.s_host_spec.mac_addr.addr_bytes[4],
-	    options.s_host_spec.mac_addr.addr_bytes[5]);
-
-	int core_id = CPU_FFS(&options.cpu_set);
-	if (core_id == 0) {
-		rte_exit(EXIT_FAILURE, "invalid cpu list\n");
-	}
-	core_id--;
-
 	sleep(INIT_DELAY);
 	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
 	    "main: launching thread on core %d\n", core_id);
@ -1014,11 +949,13 @@ main(int argc, char *argv[])
 	}
 	log_file.close();

-	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "qps = %d, recved = %d, loss = %d, slave recved = %d, slave loss = %d\n",
-	    qps, options.s_recved_pkts.load(), options.s_pkt_loss.load(), options.s_slave_recved.load(), options.s_slave_loss.load());
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+	    "qps = %d, recved = %d, loss = %d, slave recved = %d, slave loss = %d\n",
+	    qps, options.s_recved_pkts.load(), options.s_pkt_loss.load(),
+	    options.s_slave_recved.load(), options.s_slave_loss.load());

 	// clean up
-	rte_eth_dev_stop(portid);
+	dpdk_cleanup(&dconf);

 	return 0;
 }
--- a/net/khat.cc
+++ b/net/khat.cc
@ -1,3 +1,16 @@
+#include <atomic>
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <vector>
+#include <unistd.h>
+
+#include <sys/cpuset.h>
+#include <sys/endian.h>
+
+#include <topo.h>
+
 #include <rte_common.h>
 #include <rte_config.h>
 #include <rte_cycles.h>
@ -7,40 +20,27 @@
 #include <rte_launch.h>
 #include <rte_lcore.h>
 #include <rte_mbuf.h>
-#include <unistd.h>

-#include "nm.hh"
 #include "ntr.h"
+
+#include "gen.hh"
+#include "net/netsup.hh"
 #include "net/pkt.hh"
-#include "net/util.hh"
+#include "nms.h"

-#include <atomic>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <vector>
-#include <sys/_cpuset.h>
-#include <sys/cpuset.h>
-#include <sys/endian.h>
-
-#define MBUF_MAX_COUNT (rte_lcore_count() * 4096)
-constexpr static unsigned int MBUF_CACHE_SIZE = 512;
-constexpr static unsigned int RX_RING_SIZE = 2048;
-constexpr static unsigned int TX_RING_SIZE = 2048;
 constexpr static unsigned int BURST_SIZE = 32;
 constexpr static unsigned int CACHELINE_SIZE = 64;
 constexpr static uint16_t THREAD_LOAD_BUFFER_SZ = 16384;
-constexpr static size_t MEMPOOL_NAME_BUF_LEN = 64;

-static const struct rte_mbuf_dynfield rte_mbuf_dynfield_probe_flag = {
-	.name = "rte_mbuf_dynfield_probe_flag",
-	.size = sizeof(uint32_t),
-	.align = __alignof__(uint32_t),
-	.flags = 0
-};
-
-static int PROBE_FLAG_OFFSET { 0 };
-static const struct rte_eth_conf port_conf_default {
+struct probe_state_t {
+	struct net_spec dst;
+	struct conn_spec cspec {
+		.dst = &dst
+	};
+	uint64_t last_sw_rx;
+	uint64_t last_sw_tx;
+	uint64_t last_hw_rx;
+	uint32_t epoch;
 };

 // keep track of the probe state
@ -59,63 +59,63 @@ struct thread_info {
 	int txqid;
 	int lcore_id;
 	int node_id;
-	void * cache_lines;
-	void * load_buffer;
-};
-
-// state machine:
-constexpr static int SERVER_STATE_WAIT = 0;
-constexpr static int SERVER_STATE_PROBE = 1;
-
-struct probe_state_t {
-	struct net_spec dst;
-	struct conn_spec cspec {
-		.dst = &dst
-	};
-	uint32_t epoch;
-	uint64_t last_sw_rx;
-	uint64_t last_sw_tx;
-	uint64_t last_hw_rx;
+	void *cache_lines;
+	void *load_buffer;
 };

 struct options_t {
 	// config
 	int num_threads { 1 };
-	cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2);  // 2nd core
-	char mempool_name_buf[MEMPOOL_NAME_BUF_LEN];
-	bool jumbo_frame_enabled { false }; // setting this to true changes mbuf size and mtu
+	cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 2nd core
+	bool jumbo_frame_enabled {
+		false
+	}; // setting this to true changes mbuf size and mtu
 	int port_mtu { MAX_STANDARD_MTU };
 	int thread_cacheline_cnt = { 128 };
 	bool mlg_enabled { false };
-	uint64_t mlg_bps { 0 };
+	uint64_t mlg_arr_sz { 0 };
 	cpuset_t mlg_cset = CPUSET_T_INITIALIZER(0x2);
 	cpuset_t mlg_dset = CPUSET_T_INITIALIZER(0x1);
+	int mlg_shared_buffer { 0 };
 	memload_generator *mlg { nullptr };
-	// states
-	uint16_t s_portid { 0 };
-	struct net_spec s_host_spec {
-	};
-	std::atomic<int> s_state { SERVER_STATE_WAIT };
-	struct probe_state_t s_probe_info;
-	std::vector<struct thread_info *> s_thr_info;
+	uint16_t portid { 0 };

-	struct rte_mempool *s_mempools[MAX_NODES];
+	// states
+	struct net_spec s_host_spec { };
+	std::vector<struct thread_info *> s_thr_info;
+	int probe_state_offset { 0 };
+	bool s_hwtimestamp { true };
+
+	struct probe_state_t s_probe_info;
+	std::atomic<bool> is_probing { false };
 };

 struct options_t options;

+static bool
+mbuf_is_probe_valid(struct rte_mbuf *pkt)
+{
+	return *RTE_MBUF_DYNFIELD(pkt, options.probe_state_offset, bool *);
+}
+
+static void
+mbuf_set_probe_valid(struct rte_mbuf *pkt, bool b)
+{
+	*RTE_MBUF_DYNFIELD(pkt, options.probe_state_offset, bool *) = b;
+}
+
 static uint16_t
 rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused,
    void *_ __rte_unused)
 {
-	uint64_t now = nm_get_uptime_ns();
-	struct timespec ts {
-	};
+	int rc = 0;
+	uint64_t now = topo_uptime_ns();
+	struct timespec ts { };
 	struct pkt_hdr *pkt_data;
 	for (int i = 0; i < nb_pkts; i++) {
-		pkt_data = check_valid_packet(
-		    pkts[i], &options.s_host_spec.mac_addr);
+		pkt_data = check_valid_packet(pkts[i],
+		    &options.s_host_spec.mac_addr);

 		if (pkt_data == nullptr) {
 			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
@ -125,40 +125,52 @@ rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
 		}

 		if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE) {
-			int state_wait = SERVER_STATE_WAIT;
-			*RTE_MBUF_DYNFIELD(
-			    pkts[i], PROBE_FLAG_OFFSET, uint32_t *) = 0;
-			if (rte_eth_timesync_read_rx_timestamp(
-				port, &ts, pkts[i]->timesync & 0x3) == 0) {
-				if (options.s_state.compare_exchange_strong(
-					state_wait, SERVER_STATE_PROBE)) {
-					// mark the mbuf as probe packet being
-					// processed only the locore that
-					// receives the pkt w/ userdata !=
-					// nullptr processes that packet
-					*RTE_MBUF_DYNFIELD(pkts[i],
-					    PROBE_FLAG_OFFSET, uint32_t *) = 1;
-					// tag with timestamps
-					options.s_probe_info.last_hw_rx =
-					    ts.tv_nsec + ts.tv_sec * S2NS;
-					options.s_probe_info.last_sw_rx = now;
+			bool cmp = false;
+			mbuf_set_probe_valid(pkts[i], false);
+			if (options.is_probing.compare_exchange_strong(cmp,
+				true)) {
+				options.s_probe_info.last_sw_rx = now;
+				if (options.s_hwtimestamp) {
+					if ((rc = rte_eth_timesync_read_rx_timestamp(
+						 port, &ts,
+						 pkts[i]->timesync & 0x3)) ==
+					    0) {
+						options.s_probe_info
+						    .last_hw_rx = ts.tv_nsec +
+						    ts.tv_sec * S2NS;
+						ntr(NTR_DEP_USER1,
+						    NTR_LEVEL_DEBUG,
+						    "rx_add_timestamp: tagged packet %p with sw rx: %lu hw rx:%lu.\n",
+						    (void *)pkts[i],
+						    options.s_probe_info
+							.last_sw_rx,
+						    options.s_probe_info
+							.last_hw_rx);
+						mbuf_set_probe_valid(pkts[i],
+						    true);
+					} else {
+						options.is_probing.store(false);
+						ntr(NTR_DEP_USER1,
+						    NTR_LEVEL_WARNING,
+						    "rx_add_timestamp: packet %p not tagged - failed to read hw rx timestamp: %d.\n",
+						    (void *)pkts[i], rc);
+					}
+				} else {
+					mbuf_set_probe_valid(pkts[i], true);
 					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
-					    "rx_add_timestamp: tagged packet %p epoch %d with sw: %lu hw:%lu.\n",
-					    (void *)pkts[i],
-					    options.s_probe_info.epoch, now,
-					    options.s_probe_info.last_hw_rx);
-				} else
-					ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
-					    "rx_add_timestamp: packet %p not tagged - server is processing a probe.\n",
-					    (void *)pkts[i]);
-			} else
-				ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
-				    "rx_add_timestamp: packet %p not tagged - hw rx timestamp not available.\n",
+					    "rx_add_timestamp: tagged packet %p with sw rx only: %lu.\n",
+					    (void *)pkts[i], now);
+				}
+			} else {
+				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+				    "rx_add_timestamp: packet %p not tagged - server is probing.\n",
 				    (void *)pkts[i]);
-		} else
+			}
+		} else {
 			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
-			    "rx_add_timestamp: packet %p not tagged - type %d.\n",
+			    "rx_add_timestamp: packet %p not tagged - not PROBE packet: type %d.\n",
 			    (void *)pkts[i], rte_be_to_cpu_16(pkt_data->type));
+		}
 	}

 	return nb_pkts;
@ -168,13 +180,13 @@ static uint16_t
 tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
 {
-	uint64_t now = nm_get_uptime_ns();
+	uint64_t now = topo_uptime_ns();
 	struct pkt_hdr *pkt_data;

 	for (int i = 0; i < nb_pkts; i++) {

-		pkt_data = check_valid_packet(
-		    pkts[i], &options.s_host_spec.mac_addr);
+		pkt_data = check_valid_packet(pkts[i],
+		    &options.s_host_spec.mac_addr);

 		if (pkt_data == nullptr) {
 			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
@ -188,14 +200,8 @@ tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,

 			// at this time the packet is not sent to the NIC yet so
 			// the state must be waiting stats
-			// XXX: this should be an assert
-			if (options.s_state.load() != SERVER_STATE_PROBE ||
-			    *RTE_MBUF_DYNFIELD(
-				pkts[i], PROBE_FLAG_OFFSET, uint32_t *) != 1) {
-				rte_exit(EXIT_FAILURE,
-				    "packet %p sent to NIC before sw callback\n",
-				    (void *)pkts[i]);
-			}
+			assert(options.is_probing.load() &&
+			    mbuf_is_probe_valid(pkts[i]));

 			options.s_probe_info.last_sw_tx = now;

@ -225,23 +231,23 @@ locore_main(void *ti)

 	bool pending_probe = false;

-	if (rte_eth_dev_socket_id(options.s_portid) > 0 &&
-	    rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) {
+	if (rte_eth_dev_socket_id(options.portid) > 0 &&
+	    rte_eth_dev_socket_id(options.portid) != (int)rte_socket_id()) {
 		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
 		    "locore_main <thread %d>: WARNING, port %d is on remote NUMA node to "
 		    "polling thread.\n\tPerformance will "
 		    "not be optimal.\n",
-		    tinfo->tid, options.s_portid);
+		    tinfo->tid, options.portid);
 	}

 	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
-	    "locore_main <thread %d>: running on locore %d with txidx %d and rxidx %d.\n",
+	    "locore_main <thread %d>: running on locore %d with txqid %d and rxqid %d.\n",
 	    tinfo->tid, rte_lcore_id(), tinfo->txqid, tinfo->rxqid);

 	while (true) {
 		uint16_t nb_tx = 0;
-		const uint16_t nb_rx = rte_eth_rx_burst(
-		    options.s_portid, tinfo->rxqid, bufs, BURST_SIZE);
+		const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
+		    tinfo->rxqid, bufs, BURST_SIZE);
 		struct rte_mbuf *pkt_buf;
 		struct pkt_hdr *tx_data;

@ -249,8 +255,8 @@ locore_main(void *ti)
 			// XXX: optimization: in rx_add_timestamp every packet
 			// is already validated once can just mark valid packet
 			// with a value so we can avoid this redundant check
-			pkt_data = check_valid_packet(
-			    bufs[i], &options.s_host_spec.mac_addr);
+			pkt_data = check_valid_packet(bufs[i],
+			    &options.s_host_spec.mac_addr);

 			if (pkt_data == nullptr) {
 				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
@ -262,13 +268,10 @@ locore_main(void *ti)
 			}

 			NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, pkt_data,
-			    "locore_main <thread %d>: ", tinfo->tid);
+			    "locore_main <thread %d>: received packet ", tinfo->tid);
 			switch (rte_be_to_cpu_16(pkt_data->type)) {
 			case PKT_TYPE_PROBE: {
-				if (options.s_state.load() ==
-					SERVER_STATE_PROBE &&
-				    *RTE_MBUF_DYNFIELD(bufs[i],
-					PROBE_FLAG_OFFSET, uint32_t *) == 1) {
+				if (mbuf_is_probe_valid(bufs[i])) {
 					// send back probe_resp pkt to probe for
 					// return latency
 					pending_probe = true;
@ -279,6 +282,7 @@ locore_main(void *ti)
 						((struct pkt_payload_epoch *)
 							pkt_data->payload)
 						    ->epoch);
+
 					pkt_hdr_to_netspec(pkt_data,
 					    &options.s_probe_info.dst,
 					    &options.s_probe_info.cspec
@ -286,12 +290,12 @@ locore_main(void *ti)
 					    nullptr,
 					    &options.s_probe_info.cspec
 						 .src_port);
+
 					options.s_probe_info.cspec.src =
 					    &options.s_host_spec;

-					if (alloc_pkt_hdr(
-						options
-						    .s_mempools[tinfo->node_id],
+					if (alloc_pkt_hdr(mempool_get(
+							      tinfo->node_id),
 						PKT_TYPE_PROBE_RESP,
 						&options.s_probe_info.cspec, 0,
 						&pkt_buf, &tx_data) != 0) {
@ -303,10 +307,11 @@ locore_main(void *ti)
 					    pkt_data->payload,
 					    sizeof(struct pkt_payload_epoch));

-					*RTE_MBUF_DYNFIELD(pkt_buf,
-					    PROBE_FLAG_OFFSET, uint32_t *) = 1;
+					mbuf_set_probe_valid(pkt_buf, true);

 					// queue for burst send
+					NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, tx_data,
+			    		"locore_main <thread %d>: sending packet ", tinfo->tid);
 					tx_bufs[nb_tx++] = pkt_buf;
 				}
 				break;
@ -316,20 +321,33 @@ locore_main(void *ti)
 				struct net_spec src;
 				struct net_spec dst;

-				// touch the unused data to pretend that we read those dummy fields
-				memcpy(tinfo->load_buffer, pkt_data->payload, MIN(bufs[i]->data_len - sizeof(struct pkt_hdr), THREAD_LOAD_BUFFER_SZ));
+				// touch the unused data to pretend that we read
+				// those dummy fields
+				memcpy(tinfo->load_buffer, pkt_data->payload,
+				    MIN(bufs[i]->data_len -
+					    sizeof(struct pkt_hdr),
+					THREAD_LOAD_BUFFER_SZ));

 				// perform the load
-				auto pld = (struct pkt_payload_load *)pkt_data->payload;
+				auto pld = (struct pkt_payload_load *)
+					       pkt_data->payload;
 				uint32_t which = rte_be_to_cpu_32(pld->which);
 				uint32_t load = rte_be_to_cpu_32(pld->load);
-				uint32_t start_cacheline = which % (options.thread_cacheline_cnt * options.s_thr_info.size());
-				uint32_t thrd = start_cacheline / options.thread_cacheline_cnt;
-				uint32_t start = start_cacheline % options.thread_cacheline_cnt;
+				uint32_t start_cacheline = which %
+				    (options.thread_cacheline_cnt *
+					options.s_thr_info.size());
+				uint32_t thrd = start_cacheline /
+				    options.thread_cacheline_cnt;
+				uint32_t start = start_cacheline %
+				    options.thread_cacheline_cnt;
 				for (uint j = 0; j < load; j++) {
-					*(uint32_t *)tinfo->load_buffer = (start + j) % options.thread_cacheline_cnt;
+					*(uint32_t *)tinfo->load_buffer =
+					    (start + j) %
+					    options.thread_cacheline_cnt;
 				}
-				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main <thread %d>: LOAD @ thread %d, start %d, load %d\n", tinfo->tid, thrd, start, load);
+				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+				    "locore_main <thread %d>: LOAD @ thread %d, start %d, load %d\n",
+				    tinfo->tid, thrd, start, load);

 				// reply
 				pkt_hdr_to_netspec(pkt_data, &src,
@ -337,11 +355,10 @@ locore_main(void *ti)
 				cspec.dst = &src;
 				cspec.src = &dst;

-				// printf("LOAD PKT SIZE: %d\n", bufs[i]->data_len);
-				// we reply to load packet regardless of the
-				// server state
-				if (alloc_pkt_hdr(
-					options.s_mempools[tinfo->node_id],
+				// printf("LOAD PKT SIZE: %d\n",
+				// bufs[i]->data_len); we reply to load packet
+				// regardless of the server state
+				if (alloc_pkt_hdr(mempool_get(tinfo->node_id),
 					PKT_TYPE_LOAD_RESP, &cspec, 0, &pkt_buf,
 					&tx_data) != 0) {
 					rte_exit(EXIT_FAILURE,
@ -352,6 +369,8 @@ locore_main(void *ti)
 				    sizeof(struct pkt_payload_load));

 				// queue for burst send
+				NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, tx_data,
+			    		"locore_main <thread %d>: sending packet ", tinfo->tid);
 				tx_bufs[nb_tx++] = pkt_buf;
 				break;
 			}
@ -366,175 +385,67 @@ locore_main(void *ti)
 		}

 		// send all packets
-		tx_burst_all(options.s_portid, tinfo->txqid, tx_bufs, nb_tx);
+		tx_burst_all(options.portid, tinfo->txqid, tx_bufs, nb_tx);

 		// we wanna check every loop not only when there are packets
 		if (pending_probe) {
-			struct timespec ts {
-			};
+			assert(options.is_probing.load());
+			struct timespec ts { };
 			struct pkt_payload_stat *stat;
-			if (rte_eth_timesync_read_tx_timestamp(
-				options.s_portid, &ts) == 0) {
-				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
-				    "locore_main <thread %d>: obtained hw tx timestamp %lu.\n",
-				    tinfo->tid,
-				    (ts.tv_sec * S2NS + ts.tv_nsec));
-				// now we have everything we need

-				if (alloc_pkt_hdr(
-					options.s_mempools[tinfo->node_id],
-					PKT_TYPE_STAT,
-					&options.s_probe_info.cspec, 0, 
-					&pkt_buf, &tx_data) != 0) {
-					rte_exit(EXIT_FAILURE,
-					    "failed to alloc pkt_buf\n");
+			if (options.s_hwtimestamp) {
+				if (rte_eth_timesync_read_tx_timestamp(
+					options.portid, &ts) == 0) {
+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+					    "locore_main <thread %d>: obtained hw tx timestamp %lu.\n",
+					    tinfo->tid,
+					    (ts.tv_sec * S2NS + ts.tv_nsec));
+				} else {
+					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
+					    "locore_main <thread %d>: failed to obtain hw tx timestamp.\n",
+					    tinfo->tid);
+					pending_probe = false;
+					goto end_stat;
 				}
+			}
+			// now we have everything we need

-				// populate stats
-				stat = (struct pkt_payload_stat *)
-					   tx_data->payload;
-				stat->epoch = rte_cpu_to_be_32(
-				    options.s_probe_info.epoch);
+			if (alloc_pkt_hdr(mempool_get(tinfo->node_id),
+				PKT_TYPE_STAT, &options.s_probe_info.cspec, 0,
+				&pkt_buf, &tx_data) != 0) {
+				rte_exit(EXIT_FAILURE,
+				    "failed to alloc pkt_buf\n");
+			}
+
+			// populate stats
+			stat = (struct pkt_payload_stat *)tx_data->payload;
+			stat->epoch = rte_cpu_to_be_32(
+			    options.s_probe_info.epoch);
+			if (options.s_hwtimestamp) {
 				stat->hw_rx = rte_cpu_to_be_64(
 				    options.s_probe_info.last_hw_rx);
 				stat->hw_tx = rte_cpu_to_be_64(
 				    ts.tv_nsec + ts.tv_sec * S2NS);
-				stat->sw_rx = rte_cpu_to_be_64(
-				    options.s_probe_info.last_sw_rx);
-				stat->sw_tx = rte_cpu_to_be_64(
-				    options.s_probe_info.last_sw_tx);
-
-				// send the packet
-				tx_burst_all(options.s_portid, tinfo->txqid, &pkt_buf, 1);
-
-				// release flux
-				pending_probe = false;
-
-				int expected = SERVER_STATE_PROBE;
-				if (!options.s_state.compare_exchange_strong(
-					expected, SERVER_STATE_WAIT)) {
-					rte_exit(EXIT_FAILURE,
-					    "s_state changed unexpectedly!");
-				}
+			} else {
+				stat->hw_rx = 0;
+				stat->hw_tx = 0;
 			}
+			stat->sw_rx = rte_cpu_to_be_64(
+			    options.s_probe_info.last_sw_rx);
+			stat->sw_tx = rte_cpu_to_be_64(
+			    options.s_probe_info.last_sw_tx);
+
+			// send the packet
+			tx_burst_all(options.portid, tinfo->txqid, &pkt_buf, 1);
+
+		end_stat:
+			// release flux
+			pending_probe = false;
+			options.is_probing.store(false);
 		}
 	}
 }

-static int
-port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
-{
-	struct rte_eth_dev_info dev_info {
-	};
-	struct rte_eth_conf port_conf = port_conf_default;
-	struct rte_eth_txconf txconf {
-	};
-	struct rte_eth_rxconf rxconf {
-	};
-
-	uint16_t nb_rxd = RX_RING_SIZE;
-	uint16_t nb_txd = TX_RING_SIZE;
-
-	if (!rte_eth_dev_is_valid_port(portid)) {
-		return -1;
-	}
-
-	int ret = rte_eth_dev_info_get(portid, &dev_info);
-	if (ret != 0) {
-		return ret;
-	}
-
-	port_conf.rxmode.max_rx_pkt_len = mtu_to_pkt_size(options.port_mtu);
-	port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
-	port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_NONFRAG_IPV4_UDP |
-	    ETH_RSS_L2_PAYLOAD | ETH_RSS_NONFRAG_IPV4_TCP;
-	port_conf.rx_adv_conf.rss_conf.rss_key = nullptr;
-	port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH;
-	port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM;
-	port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM;
-	port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
-	port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
-	port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;
-	if (options.jumbo_frame_enabled) {
-		port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME;
-	}
-
-	/* Configure the Ethernet device. */
-	ret = rte_eth_dev_configure(
-	    portid, options.num_threads, options.num_threads, &port_conf);
-	if (ret != 0)
-		return ret;
-
-	ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
-	if (ret != 0)
-		return ret;
-
-	/* Allocate and set up 1 RX queue per thread per Ethernet port. */
-	rxconf = dev_info.default_rxconf;
-
-	if (options.jumbo_frame_enabled) {
-		rxconf.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME;
-	}
-	for (int i = 0; i < options.num_threads; i++) {
-		ret = rte_eth_rx_queue_setup(portid, i, nb_rxd,
-		    rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool);
-		if (ret < 0)
-			return ret;
-		options.s_thr_info.at(i)->rxqid = i;
-	}
-
-	txconf = dev_info.default_txconf;
-	txconf.offloads = port_conf.txmode.offloads;
-	/* Allocate and set up 1 TX queue per thread per Ethernet port. */
-	for (int i = 0; i < options.num_threads; i++) {
-		ret = rte_eth_tx_queue_setup(
-		    portid, i, nb_txd, rte_eth_dev_socket_id(portid), &txconf);
-		if (ret < 0)
-			return ret;
-		options.s_thr_info.at(i)->txqid = i;
-	}
-
-	// set mtu
-	ret = rte_eth_dev_set_mtu(portid, options.port_mtu);
-	if (ret != 0)
-		return ret;
-
-	ret = rte_eth_dev_start(portid);
-	if (ret < 0)
-		return ret;
-
-	/* Display the port MAC address. */
-	struct rte_ether_addr addr {
-	};
-	ret = rte_eth_macaddr_get(portid, &addr);
-	if (ret != 0)
-		return ret;
-
-	ret = rte_eth_timesync_enable(portid);
-	if (ret != 0)
-		return ret;
-
-	/* Enable RX in promiscuous mode for the Ethernet device. */
-	ret = rte_eth_promiscuous_enable(portid);
-	if (ret != 0)
-		return ret;
-
-	for (int i = 0; i < options.num_threads; i++) {
-		if (rte_eth_add_tx_callback(portid,
-			options.s_thr_info.at(i)->txqid, tx_add_timestamp,
-			nullptr) == nullptr ||
-		    rte_eth_add_rx_callback(portid,
-			options.s_thr_info.at(i)->rxqid, rx_add_timestamp,
-			nullptr) == nullptr) {
-			return -1;
-		}
-	}
-
-	// sync_port_clock(portid);
-
-	return 0;
-}
-
 static void
 usage()
 {
@ -542,13 +453,15 @@ usage()
 	    "Usage:\n"
 	    "    -v(vv): verbose mode\n"
 	    "    -h: seek help\n"
-	    "    -A: cpu mask for worker threads\n"
+	    "    -A: cpu list for worker threads\n"
 	    "    -m: enable memory load generator(MLG)\n"
-	    "    -b: MLG bytes per second\n"
+	    "    -b: MLG trunk size\n"
 	    "    -x: MLG thread affinity mask\n"
 	    "    -X: MLG target domain affinity mask\n"
+	    "    -S: MLG shared buffer\n"
 	    "    -H: host spec\n"
-		"    -J: enable jumbo frames\n");
+	    "    -J: enable jumbo frames\n"
+	    "    -p: port id\n");
 	fflush(stdout);
 }

@ -560,19 +473,22 @@ dump_options()
 	    "          verbosity: +%d\n"
 	    "          thread count: %d\n"
 	    "          ip: 0x%x\n"
-	    "          MLG: %s [bps: %ld, thread cnt: %d, domain: %ld]\n"
-		"          jumbo frame: %d\n",
-	    ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.num_threads,
-	    options.s_host_spec.ip, options.mlg_enabled ? "on" : "off",
-	    options.mlg_bps, CPU_COUNT(&options.mlg_cset), CPU_FFS(&options.mlg_dset) - 1, options.jumbo_frame_enabled);
+	    "          MLG: %s [arr_sz: %ld, thread cnt: %d, domain: %ld]\n"
+	    "          jumbo frame: %d\n"
+	    "          port id: %d\n",
+	    ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING,
+	    options.num_threads, options.s_host_spec.ip,
+	    options.mlg_enabled ? "on" : "off", options.mlg_arr_sz,
+	    CPU_COUNT(&options.mlg_cset), CPU_FFS(&options.mlg_dset) - 1,
+	    options.jumbo_frame_enabled, options.portid);
 }

 int
 main(int argc, char *argv[])
 {
-	unsigned int nb_ports;
-	struct rte_mempool *mbuf_pool;
 	bool has_host_spec { false };
+	struct mem_conf mconf;
+	struct device_conf dconf;

 	ntr_init();

@ -590,7 +506,7 @@ main(int argc, char *argv[])
 	{
 		int c;
 		// parse arguments
-		while ((c = getopt(argc, argv, "hvA:H:mb:X:x:J")) != -1) {
+		while ((c = getopt(argc, argv, "hvA:H:mb:X:x:JSp:")) != -1) {
 			switch (c) {
 			case 'v':
 				ntr_set_level(NTR_DEP_USER1,
@ -601,15 +517,16 @@ main(int argc, char *argv[])
 				rte_exit(EXIT_SUCCESS, "\n");
 			case 'A':
 				cpulist_to_cpuset(optarg, &options.cpu_set);
-				options.num_threads = CPU_COUNT(&options.cpu_set);
+				options.num_threads = CPU_COUNT(
+				    &options.cpu_set);
 				if (options.num_threads == 0) {
 					rte_exit(EXIT_FAILURE,
 					    "must run at least one thread\n");
 				}
 				break;
 			case 'H':
-				if (str_to_netspec(
-					optarg, &options.s_host_spec) != 0) {
+				if (str_to_netspec(optarg,
+					&options.s_host_spec) != 0) {
 					rte_exit(EXIT_FAILURE,
 					    "invalid host spec\n");
 				}
@ -619,7 +536,8 @@ main(int argc, char *argv[])
 				options.mlg_enabled = true;
 				break;
 			case 'b':
-				options.mlg_bps = strtoull(optarg, nullptr, 10);
+				options.mlg_arr_sz = strtoull(optarg, nullptr,
+				    10);
 				break;
 			case 'X':
 				cpulist_to_cpuset(optarg, &options.mlg_dset);
@ -631,10 +549,16 @@ main(int argc, char *argv[])
 				options.jumbo_frame_enabled = true;
 				options.port_mtu = MAX_JUMBO_MTU;
 				break;
+			case 'S':
+				options.mlg_shared_buffer = 1;
+				break;
+			case 'p':
+				options.portid = atoi(optarg);
+				break;
 			default:
 				usage();
-				rte_exit(
-				    EXIT_SUCCESS, "unknown argument: %c", c);
+				rte_exit(EXIT_SUCCESS, "unknown argument: %c",
+				    c);
 			}
 		}
 	}
@ -643,109 +567,98 @@ main(int argc, char *argv[])
 		rte_exit(EXIT_FAILURE, "Must specify host spec\n");
 	}

-	// init nm
-	if (nm_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
-		rte_exit(EXIT_FAILURE, "nm init failed!\n");
+	// init libtopo
+	if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
+	    0) {
+		rte_exit(EXIT_FAILURE, "libtopo init failed!\n");
+	}
+
+	// init libnms
+	if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
+		rte_exit(EXIT_FAILURE, "libnms init failed!\n");
 	}

 	dump_options();

-	// init mlg
-	// if (options.mlg_enabled) {
-	// 	// bool success = false;
-	// 	// options.mlg = new memload_generator(options.mlg_cmask,
-	// 	//     options.mlg_dmask, options.mlg_bps, &success);
-	// 	// if (!success) {
-	// 	// 	rte_exit(EXIT_FAILURE, "failed to init mlg\n");
-	// 	// }
-	// }
-
 	// register dynamic field
-	PROBE_FLAG_OFFSET = rte_mbuf_dynfield_register(
+	struct rte_mbuf_dynfield rte_mbuf_dynfield_probe_flag = {
+		.name = "rte_mbuf_dynfield_probe_valid",
+		.size = sizeof(bool),
+		.align = __alignof__(uint32_t),
+		.flags = 0
+	};
+	options.probe_state_offset = rte_mbuf_dynfield_register(
 	    &rte_mbuf_dynfield_probe_flag);
-	if (PROBE_FLAG_OFFSET < 0) {
-		rte_exit(EXIT_FAILURE, "failed to register dynamic field\n");
+	if (options.probe_state_offset == -1) {
+		rte_exit(EXIT_FAILURE, "failed to register dynamic field: %d\n",
+		    rte_errno);
 	}

-	nb_ports = rte_eth_dev_count_avail();
-	if (nb_ports == 0) {
-		rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
+	// configure memory and port
+	struct port_conf pconf;
+	portconf_get(options.portid, &pconf);
+	if (!pconf.timesync) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
+		    "main: timesync disabled. hw timestamp unavailable.\n ");
 	}
+	dconf.mtu = options.port_mtu;
+	dconf.num_threads = options.num_threads;
+	dconf.portid = options.portid;
+	dconf.rss_hf = pconf.rss_hf;
+	dconf.rx_offloads = pconf.rxoffload;
+	dconf.tx_offloads = pconf.txoffload;
+	dconf.timesync = pconf.timesync;

-	uint16_t portid = rte_eth_find_next(0);
-	if (portid == RTE_MAX_ETHPORTS) {
-		rte_exit(EXIT_FAILURE, "cannot find an available port\n");
-	}
-	options.s_portid = portid;
+	dconf.rx_fn = rx_add_timestamp;
+	dconf.rx_user = nullptr;
+	dconf.rx_ring_sz = 2048;
+	dconf.tx_fn = tx_add_timestamp;
+	dconf.tx_user = nullptr;
+	dconf.tx_ring_sz = 2048;

-	if (rte_eth_macaddr_get(portid, &options.s_host_spec.mac_addr) != 0) {
+	mconf.cache_size = 512;
+	mconf.priv_size = 0;
+	mconf.num_elements = (dconf.rx_ring_sz + dconf.tx_ring_sz) *
+	    rte_lcore_count() / rte_socket_count();
+	mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_JUMBO_MTU -
+	    MAX_STANDARD_MTU;
+	mconf.max_pools = -1;
+
+	dpdk_init(&dconf, &mconf);
+
+	if (rte_eth_macaddr_get(options.portid,
+		&options.s_host_spec.mac_addr) != 0) {
 		rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
-		    portid);
-	}
-
-	if (rte_socket_count() > (int)MAX_NODES) {
-		rte_exit(EXIT_FAILURE, "Too many numa nodes\n");
-	}
-
-	for (int i = 0; i < (int)rte_socket_count(); i++) {
-		uint32_t nodeid = i;
-		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
-		    "main: creating mempool for node %d\n", nodeid);
-
-		// create one mbuf pool per socket
-		snprintf(options.mempool_name_buf, MEMPOOL_NAME_BUF_LEN,
-		    "khat_mempool_%d", nodeid);
-		mbuf_pool = rte_pktmbuf_pool_create(options.mempool_name_buf,
-		    MBUF_MAX_COUNT, MBUF_CACHE_SIZE, 0,
-		    	options.jumbo_frame_enabled ?
-		      RTE_MBUF_DEFAULT_BUF_SIZE + (MAX_JUMBO_MTU - MAX_STANDARD_MTU) :
-		      RTE_MBUF_DEFAULT_BUF_SIZE, nodeid);
-		if (mbuf_pool == nullptr) {
-			rte_exit(EXIT_FAILURE, "cannot create mbuf pool: %d\n",
-			    rte_errno);
-		}
-
-		options.s_mempools[nodeid] = mbuf_pool;
-		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
-		    "main: created mempool for node %d\n", nodeid);
+		    options.portid);
 	}

 	// init threads
 	uint32_t cpu_idx = CPU_FFS(&options.cpu_set);
 	uint32_t tid = 0;
-	while(cpu_idx != 0) {
+	while (cpu_idx != 0) {
 		uint32_t lcore_id = cpu_idx - 1;
 		uint32_t node_id = rte_lcore_to_socket_id(lcore_id);
-		auto *tinfo = (struct thread_info *)nm_malloc(node_id, sizeof(struct thread_info));
-		tinfo->cache_lines = nm_malloc(node_id, CACHELINE_SIZE * options.thread_cacheline_cnt);
-		tinfo->load_buffer = nm_malloc(node_id, THREAD_LOAD_BUFFER_SZ);
+		auto *tinfo = (struct thread_info *)nms_malloc(node_id,
+		    sizeof(struct thread_info));
+		tinfo->cache_lines = nms_malloc(node_id,
+		    CACHELINE_SIZE * options.thread_cacheline_cnt);
+		tinfo->load_buffer = nms_malloc(node_id,
+		    THREAD_LOAD_BUFFER_SZ);
 		tinfo->tid = tid;
 		tinfo->lcore_id = lcore_id;
 		tinfo->node_id = node_id;
+		tinfo->rxqid = tid;
+		tinfo->txqid = tid;
 		options.s_thr_info.push_back(tinfo);
 		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
-			"main: thread %d assigned to cpu %d, node %d\n", tinfo->tid,
-			tinfo->lcore_id, nm_get_node_from_core(lcore_id));
+		    "main: thread %d assigned to cpu %d, node %d\n", tinfo->tid,
+		    tinfo->lcore_id, topo_core_to_numa(lcore_id));

 		tid++;
 		CPU_CLR(cpu_idx - 1, &options.cpu_set);
 		cpu_idx = CPU_FFS(&options.cpu_set);
 	}

-	if (port_init(portid, mbuf_pool) != 0) {
-		rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid);
-	}
-
-	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
-	    "Configured port %d on socket %d with mac addr %x:%x:%x:%x:%x:%x\n",
-	    portid, rte_eth_dev_socket_id(portid),
-	    options.s_host_spec.mac_addr.addr_bytes[0],
-	    options.s_host_spec.mac_addr.addr_bytes[1],
-	    options.s_host_spec.mac_addr.addr_bytes[2],
-	    options.s_host_spec.mac_addr.addr_bytes[3],
-	    options.s_host_spec.mac_addr.addr_bytes[4],
-	    options.s_host_spec.mac_addr.addr_bytes[5]);
-
 	sleep(INIT_DELAY);

 	for (int i = 0; i < options.num_threads; i++) {
@ -762,19 +675,35 @@ main(int argc, char *argv[])
 		}
 	}

-	if (options.mlg_enabled)
-		options.mlg->start();
+	// init mlg
+	if (options.mlg_enabled) {
+		bool success = false;
+		memload_generator::memload_generator_options opts;
+		opts.chunk_size = options.mlg_arr_sz;
+		opts.iteration =
+		    memload_generator::memload_generator_options::ITERATION_MAX;
+		opts.shared_buffer = options.mlg_shared_buffer;
+		opts.verbose = (ntr_get_level(NTR_DEP_USER1) -
+				   NTR_LEVEL_WARNING) != 0;
+		options.mlg = new memload_generator(&options.mlg_cset,
+		    &options.mlg_dset, &opts, &success);
+		if (!success) {
+			rte_exit(EXIT_FAILURE, "failed to init mlg\n");
+		}
+	}
+
 	while (true) {
 		usleep(S2US);
 		if (options.mlg_enabled) {
 			uint64_t bps = options.mlg->get_bps();
 			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
-				"main: MLG bps = %ld ~= %ldM\n", bps, bps / 1024 / 1024);
+			    "main: MLG bps = %ld ~= %ldM\n", bps,
+			    bps / 1024 / 1024);
 		}
 	}
-	if (options.mlg_enabled)
-		options.mlg->stop();

+	// shouldn't get here
+	// clean up
 	for (int i = 0; i < options.num_threads; i++) {
 		struct thread_info *tinfo = options.s_thr_info.at(i);
 		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
@ -785,10 +714,9 @@ main(int argc, char *argv[])
 		}
 	}

-	// shouldn't get here
-	// clean up
-	rte_eth_dev_stop(portid);
 	delete options.mlg;

+	dpdk_cleanup(&dconf);
+
 	return 0;
 }
--- a/net/libnetsup/dpdk.cc
+++ b/net/libnetsup/dpdk.cc
@ -0,0 +1,207 @@
+#include "net/netsup.hh"
+#include <cstdlib>
+
+#include "rte_build_config.h"
+#include "rte_common.h"
+#include "rte_config.h"
+#include "rte_ether.h"
+#include "rte_lcore.h"
+#include "rte_mempool.h"
+#include "rte_mbuf.h"
+#include "rte_errno.h"
+#include "rte_ethdev.h"
+
+#include "ntr.h"
+
+static struct rte_mempool *g_mempools[MAX_NUMA_NODES] = {nullptr};
+static unsigned int g_mempool_sz = 0;
+
+static void
+mempool_init(struct mem_conf *mconf)
+{
+	struct rte_mempool * mbuf_pool;
+	char mempool_name[64];
+
+	for (int i = 0; i < (int)rte_socket_count(); i++) {
+		uint32_t nodeid = i;
+		// ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+		//     "mempool_init: creating mempool for node %d\n", nodeid);
+
+		// create one mbuf pool per socket
+		snprintf(mempool_name, sizeof(mempool_name), "net_mempool_%d", nodeid);
+
+		mbuf_pool = rte_pktmbuf_pool_create(mempool_name, mconf->num_elements, 
+					mconf->cache_size, mconf->priv_size, 
+				mconf->data_room_size, nodeid);
+
+		if (mbuf_pool == nullptr) {
+			rte_exit(EXIT_FAILURE, "cannot create mbuf pool: %d\n", rte_errno);
+		}
+
+		g_mempools[nodeid] = mbuf_pool;
+		g_mempool_sz++;
+		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "mempool_init: created mempool for node %d\n", nodeid);
+	}
+}
+
+struct rte_mempool *
+mempool_get(int nodeid)
+{
+	if ((unsigned int)nodeid < g_mempool_sz) {
+		return g_mempools[nodeid];
+	}
+	return nullptr;
+}
+
+static void
+port_init(struct device_conf *dconf)
+{
+	struct rte_ether_addr addr;
+	struct rte_eth_dev_info dev_info {
+	};
+	struct rte_eth_conf port_conf;
+	struct rte_eth_txconf txconf {
+	};
+	struct rte_eth_rxconf rxconf {
+	};
+	int ret;
+
+	if (rte_eth_dev_count_avail() == 0) {
+		rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
+	}
+
+	if (!rte_eth_dev_is_valid_port(dconf->portid)) {
+		rte_exit(EXIT_FAILURE, "cannot find port %d\n", dconf->portid);
+	}
+
+	if ((ret = rte_eth_macaddr_get(dconf->portid, &addr)) != 0) {
+		rte_exit(EXIT_FAILURE, "cannot get mac address of port: %d\n", ret);
+	}
+
+	ret = rte_eth_dev_info_get(dconf->portid, &dev_info);
+	if (ret != 0) {
+		rte_exit(EXIT_FAILURE, "failed to get dev info: %d", ret);
+	}
+
+	memset(&port_conf, 0, sizeof(struct rte_eth_conf));
+	port_conf.rxmode.mtu = dconf->mtu;
+	port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_RSS;
+	port_conf.rx_adv_conf.rss_conf.rss_key = nullptr;
+	port_conf.rx_adv_conf.rss_conf.rss_hf = dconf->rss_hf;
+
+	port_conf.rxmode.offloads = dconf->rx_offloads;
+	port_conf.txmode.offloads = dconf->tx_offloads;
+
+	/* Configure the Ethernet device. */
+	ret = rte_eth_dev_configure(
+	    dconf->portid, dconf->num_threads, dconf->num_threads, &port_conf);
+	if (ret != 0)
+		rte_exit(EXIT_FAILURE, "failed to configure port: %d\n", ret);
+
+	ret = rte_eth_dev_adjust_nb_rx_tx_desc(dconf->portid, &dconf->rx_ring_sz, &dconf->tx_ring_sz);
+	if (ret != 0)
+		rte_exit(EXIT_FAILURE, "failed to set rx tx queue size: %d\n", ret);
+
+	/* Allocate and set up 1 RX queue per thread per Ethernet port. */
+	rxconf = dev_info.default_rxconf;
+	rxconf.offloads = port_conf.rxmode.offloads;
+	for (int i = 0; i < dconf->num_threads; i++) {
+		ret = rte_eth_rx_queue_setup(dconf->portid, i, dconf->rx_ring_sz,
+		    rte_eth_dev_socket_id(dconf->portid), &rxconf, 
+			mempool_get(rte_eth_dev_socket_id(dconf->portid)));
+
+		if (ret < 0)
+			rte_exit(EXIT_FAILURE, "failed to setup rx queue %d: %d\n", i, ret);
+	}
+
+	/* Allocate and set up 1 TX queue per thread per Ethernet port. */
+	txconf = dev_info.default_txconf;
+	txconf.offloads = port_conf.txmode.offloads;
+	for (int i = 0; i < dconf->num_threads; i++) {
+		ret = rte_eth_tx_queue_setup(
+		    dconf->portid, i, dconf->tx_ring_sz, rte_eth_dev_socket_id(dconf->portid), 
+			&txconf);
+		if (ret < 0)
+			rte_exit(EXIT_FAILURE, "failed to setup tx queue %d: %d", i, ret);
+	}
+
+	// set mtu
+	ret = rte_eth_dev_set_mtu(dconf->portid, dconf->mtu);
+	if (ret != 0)
+		rte_exit(EXIT_FAILURE, "failed to set mtu: %d\n", ret);
+
+	ret = rte_eth_dev_start(dconf->portid);
+	if (ret < 0)
+		rte_exit(EXIT_FAILURE, "failed to start port: %d\n", ret);
+
+	if (dconf->timesync) {
+		ret = rte_eth_timesync_enable(dconf->portid);
+		if (ret != 0)
+			rte_exit(EXIT_FAILURE, "failed to enable timesync: %d\n", ret);
+	}
+
+	/* Enable RX in promiscuous mode for the Ethernet device. */
+	ret = rte_eth_promiscuous_enable(dconf->portid);
+	if (ret != 0)
+		rte_exit(EXIT_FAILURE, "failed to enable promiscuous mode: %d\n", ret);
+
+	for (int i = 0; i < dconf->num_threads; i++) {
+		if (dconf->tx_fn != nullptr) {
+			if (rte_eth_add_tx_callback(dconf->portid,
+				i, dconf->tx_fn,
+				dconf->tx_user) == nullptr) {
+				rte_exit(EXIT_FAILURE, "failed to attach callback to tx queue %d\n", i);
+			}
+		}
+		
+		if (dconf->rx_fn != nullptr) {
+			if (rte_eth_add_rx_callback(dconf->portid,
+				i, dconf->rx_fn,
+				dconf->rx_user) == nullptr) {
+				rte_exit(EXIT_FAILURE, "failed to attach callback to rx queue %d\n", i);
+			}
+		}
+	}
+
+	// sync_port_clock(portid);
+
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, 
+	"port_init: configured port %d on socket %d with mac addr %x:%x:%x:%x:%x:%x\n",
+		dconf->portid, rte_eth_dev_socket_id(dconf->portid),
+		addr.addr_bytes[0],
+		addr.addr_bytes[1],
+		addr.addr_bytes[2],
+		addr.addr_bytes[3],
+		addr.addr_bytes[4],
+		addr.addr_bytes[5]);
+}
+
+void
+dpdk_init(struct device_conf *dconf, struct mem_conf *mconf)
+{
+	if (rte_socket_count() > (int)MAX_NUMA_NODES) {
+		rte_exit(EXIT_FAILURE, "too many numa nodes\n");
+	}
+
+	// ensure 1-1 mapping
+	for (int i = 0; i < (int)rte_socket_count(); i++) {
+		if (rte_socket_id_by_idx(i) != i) {
+			rte_exit(EXIT_FAILURE, "socket %d has id %d instead.\n", i, rte_socket_id_by_idx(i));
+		}
+	}
+
+	mempool_init(mconf);
+
+	port_init(dconf);
+}
+
+void
+dpdk_cleanup(struct device_conf * dconf)
+{
+	rte_eth_dev_stop(dconf->portid);
+	rte_eth_dev_close(dconf->portid);
+
+	for (int i = 0; i < (int)rte_socket_count(); i++) {
+		rte_mempool_free(g_mempools[i]);
+	}
+}
--- a/net/libnetsup/portconf.cc
+++ b/net/libnetsup/portconf.cc
@ -0,0 +1,59 @@
+#include "rte_ethdev.h"
+#include "net/netsup.hh"
+#include <cstdlib>
+
+static struct port_conf port_confs[] = {
+	{
+		.driver_name = "net_cxgbe",
+		.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM,
+		.txoffload = RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
+		.rss_hf = RTE_ETH_RSS_UDP | RTE_ETH_RSS_FRAG_IPV4,
+		.timesync = false
+	},
+	{
+		.driver_name = "net_i40e",
+		.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM,
+		.txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
+		.rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD,
+		.timesync = true
+	},
+	{
+		.driver_name = "net_ice",
+		.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM | RTE_ETH_RX_OFFLOAD_TIMESTAMP,
+		.txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
+		.rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD,
+		.timesync = true
+	}
+};
+
+static struct port_conf default_conf = {
+	.driver_name = "default",
+	.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM | RTE_ETH_RX_OFFLOAD_TIMESTAMP,
+	.txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
+	.rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD,
+	.timesync = true
+};
+
+static const int port_size = sizeof(port_confs) / sizeof(port_confs[0]);
+
+int
+portconf_get(int portid, struct port_conf * out)
+{
+	struct rte_eth_dev_info dev_info {};
+
+	if (rte_eth_dev_info_get(portid, &dev_info) != 0) {
+		rte_exit(EXIT_FAILURE, "failed to obtain device info for port %d\n", portid);
+	}
+
+	for(int i = 0; i < port_size; i++) {
+		struct port_conf * conf = &port_confs[i];
+		if (strcmp(conf->driver_name, dev_info.driver_name) == 0) {
+			memcpy(out, conf, sizeof(struct port_conf));
+			return 0;
+		}
+	}
+
+	fprintf(stdout, "portconf_get: unable to find matching conf for port %d:%s, returning default conf.\n", portid, dev_info.driver_name);
+	memcpy(out, &default_conf, sizeof(struct port_conf));
+	return -1;
+}
--- a/net/rat.cc
+++ b/net/rat.cc
@ -1,4 +1,14 @@
+#include <atomic>
+#include <cstddef>
+#include <list>
+#include <map>
+#include <mutex>
+#include <random>
+#include <vector>
+
 #include <sys/endian.h>
+
+#include <topo.h>
 #include <rte_byteorder.h>
 #include <rte_common.h>
 #include <rte_config.h>
@ -10,28 +20,15 @@
 #include <rte_mbuf.h>
 #include <unistd.h>

-#include "gen.hh"
-#include "nm.hh"
 #include "ntr.h"
+
+#include "gen.hh"
+#include "net/netsup.hh"
 #include "net/pkt.hh"
-#include "net/util.hh"
+#include "nms.h"

-#include <atomic>
-#include <list>
-#include <map>
-#include <mutex>
-#include <random>
-#include <vector>
-
-#define MBUF_MAX_COUNT (rte_lcore_count() * 4096)
-constexpr static unsigned int MBUF_CACHE_SIZE = 512;
-constexpr static unsigned int RX_RING_SIZE = 2048;
-constexpr static unsigned int TX_RING_SIZE = 2048;
 constexpr static unsigned int BURST_SIZE = 32;

-static const struct rte_eth_conf port_conf_default {
-};
-
 static unsigned int
 epoch_mk(unsigned int id, unsigned int epoch)
 {
@ -60,6 +57,7 @@ struct thread_info {
 	unsigned int lcore_id { 0 };
 	unsigned int rxqid { 0 };
 	unsigned int txqid { 0 };
+	int socket_id;
 	// this field is read by the stat collecting thread
 	std::atomic<int> recved_pkts { 0 };
 	std::atomic<int> lost_pkts { 0 };
@ -74,9 +72,12 @@ struct thread_info {
 	    mtx; // this lock protects data shared between worker threads, i.e.:
 	std::list<struct epoch_info *> recved_epochs;

-	thread_info() : which_rd(), which_rng(which_rd()), which_dice(std::uniform_int_distribution<uint32_t>(0, UINT32_MAX))
+	thread_info()
+	    : which_rd()
+	    , which_rng(which_rd())
+	    , which_dice(std::uniform_int_distribution<uint32_t>(0, UINT32_MAX))
 	{
-		which_rng.seed(nm_get_uptime_ns());
+		which_rng.seed(topo_uptime_ns());
 	}
 };

@ -94,26 +95,22 @@ struct options_t {
 	char ld_gen[256] { "fixed:0" };
 	uint32_t target_qps { 0 };
 	uint32_t depth { 1 };
-	struct net_spec server_spec {
-	};
+	struct net_spec server_spec { };
 	cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 1 thread @ core 2
 	uint32_t pkt_loss_delay_ms { UINT32_MAX };
 	bool jumbo_frame_enabled { false };
 	int pkt_pad_sz { 0 };
 	int port_mtu { MAX_STANDARD_MTU };
+	int portid { 0 };

 	// states
 	unsigned int s_num_threads { 1 }; // 1 thread
-	struct rte_mempool *mbuf_pool { nullptr };
-	struct net_spec s_host_spec {
-	};
-	struct net_spec s_master_spec {
-	};
+	struct net_spec s_host_spec { };
+	struct net_spec s_master_spec { };
 	struct conn_spec s_master_cspec {
 		.src = &s_host_spec, .src_port = DEFAULT_RAT_PORT,
 		.dst = &s_master_spec, .dst_port = DEFAULT_RAT_PORT,
 	};
-	uint16_t s_portid { 0 };
 	std::vector<struct thread_info *> s_thr_info;
 	std::atomic<int> s_state { STATE_RUNNING }; // default non master mode

@ -124,8 +121,8 @@ struct options_t {
 static struct options_t options;

 static inline void
-calc_stats(
-    uint64_t now, uint32_t *qps, uint32_t *recved_pkt, uint32_t *total_loss)
+calc_stats(uint64_t now, uint32_t *qps, uint32_t *recved_pkt,
+    uint32_t *total_loss)
 {
 	uint32_t recv = 0;
 	uint32_t loss = 0;
@ -159,8 +156,8 @@ proto_loop(struct thread_info *tinfo)
 	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
 	    "proto_loop <thread %d>: waiting for SYNC from cat\n", tinfo->id);
 	while (options.s_state.load() == STATE_SYNC) {
-		const uint16_t nb_rx = rte_eth_rx_burst(
-		    options.s_portid, tinfo->rxqid, rx_bufs, BURST_SIZE);
+		const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
+		    tinfo->rxqid, rx_bufs, BURST_SIZE);
 		if (nb_rx > 0) {
 			for (int i = 0; i < nb_rx; i++) {
 				struct pkt_hdr *each = check_valid_packet(
@ -195,13 +192,13 @@ proto_loop(struct thread_info *tinfo)
 							    nullptr);

 							if (alloc_pkt_hdr(
-								options
-								    .mbuf_pool,
+								mempool_get(
+								    tinfo
+									->socket_id),
 								PKT_TYPE_SYNC_ACK,
 								&options
 								     .s_master_cspec,
-									 0,
-								&tx_buf,
+								0, &tx_buf,
 								&pkt_data) !=
 							    0) {
 								rte_exit(
@ -209,7 +206,10 @@ proto_loop(struct thread_info *tinfo)
 								    "failed to alloc pkt hdr\n");
 							}

-							tx_burst_all(options.s_portid, tinfo->txqid, &tx_buf, 1);
+							tx_burst_all(
+							    options.portid,
+							    tinfo->txqid,
+							    &tx_buf, 1);

 							expected =
 							    STATE_SYNC_ACK;
@ -240,6 +240,7 @@ proto_loop(struct thread_info *tinfo)
 					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
 					    "proto_loop <thread %d>: ignoring invalid packet %p.\n",
 					    tinfo->id, (void *)rx_bufs[i]);
+					//dump_pkt(rx_bufs[i]);
 				}

 				rte_pktmbuf_free(rx_bufs[i]);
@ -268,16 +269,16 @@ pkt_loop(struct thread_info *tinfo)
 	srv_cspec.src = &options.s_host_spec;
 	srv_cspec.dst = &options.server_spec;

-	next_ts = nm_get_uptime_ns();
+	next_ts = topo_uptime_ns();

 	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "pkt_loop <thread %d>: entering\n",
 	    tinfo->id);

 	while (options.s_state.load() == STATE_RUNNING) {
-		uint64_t now = nm_get_uptime_ns();
+		uint64_t now = topo_uptime_ns();
 		// always pop incoming packets
-		const uint16_t nb_rx = rte_eth_rx_burst(
-		    options.s_portid, tinfo->rxqid, rx_bufs, BURST_SIZE);
+		const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
+		    tinfo->rxqid, rx_bufs, BURST_SIZE);

 		if (nb_rx > 0) {
 			for (int i = 0; i < nb_rx; i++) {
@ -309,7 +310,8 @@ pkt_loop(struct thread_info *tinfo)
 					    pld_epoch->epoch);
 					id = epoch_get_id(epoch);

-					// printf("Load resp size : %d\n", rx_bufs[i]->data_len);
+					// printf("Load resp size : %d\n",
+					// rx_bufs[i]->data_len);

 					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
 					    "pkt_loop <thread %d>: packet %p epoch 0x%x id %d.\n",
@ -337,7 +339,7 @@ pkt_loop(struct thread_info *tinfo)
 					break;
 				case PKT_TYPE_FIN:
 					if (rte_is_same_ether_addr(
-						&each->eth_hdr.s_addr,
+						&each->eth_hdr.src_addr,
 						&options.s_master_spec
 						     .mac_addr)) {
 						ntr(NTR_DEP_USER1,
@ -364,11 +366,11 @@ pkt_loop(struct thread_info *tinfo)

 						struct pkt_hdr *pkt_hdr;
 						if (alloc_pkt_hdr(
-							options.mbuf_pool,
+							mempool_get(
+							    tinfo->socket_id),
 							PKT_TYPE_FIN_ACK,
 							&options.s_master_cspec,
-							0,
-							&tx_bufs[0],
+							0, &tx_bufs[0],
 							&pkt_hdr) != 0) {
 							rte_exit(EXIT_FAILURE,
 							    "failed to allocate pkt hdr\n");
@ -386,7 +388,9 @@ pkt_loop(struct thread_info *tinfo)
 						    rte_cpu_to_be_32(
 							total_loss);

-						tx_burst_all(options.s_portid, tinfo->txqid, &tx_bufs[0], 1);
+						tx_burst_all(options.portid,
+						    tinfo->txqid, &tx_bufs[0],
+						    1);

 						options.s_state.store(
 						    STATE_FIN);
@ -487,9 +491,9 @@ pkt_loop(struct thread_info *tinfo)
 			// change dst port for every packet for RSS
 			srv_cspec.dst_port = dst_port_gen.next();
 			srv_cspec.src_port = src_port_gen.next();
-			if (alloc_pkt_hdr(options.mbuf_pool, PKT_TYPE_LOAD,
-				&srv_cspec, options.pkt_pad_sz, &tx_bufs[total_send],
-				&pkt_data) != 0) {
+			if (alloc_pkt_hdr(mempool_get(tinfo->socket_id),
+				PKT_TYPE_LOAD, &srv_cspec, options.pkt_pad_sz,
+				&tx_bufs[total_send], &pkt_data) != 0) {
 				rte_exit(EXIT_FAILURE,
 				    "failed to allocate pkt hdr\n");
 			}
@ -497,7 +501,8 @@ pkt_loop(struct thread_info *tinfo)
 			pld_load = (struct pkt_payload_load *)pkt_data->payload;
 			pld_load->load = rte_cpu_to_be_32(
 			    tinfo->load_gen->generate());
-			pld_load->which = rte_cpu_to_be_32(tinfo->which_dice(tinfo->which_rng));
+			pld_load->which = rte_cpu_to_be_32(
+			    tinfo->which_dice(tinfo->which_rng));
 			unsigned int epoch = epoch_mk(tinfo->id, cur_epoch);
 			pld_load->epoch = rte_cpu_to_be_32(epoch);
 			cur_epoch++;
@ -514,26 +519,16 @@ pkt_loop(struct thread_info *tinfo)
 			total_send++;
 		}

-		tx_burst_all(options.s_portid, tinfo->txqid, tx_bufs, total_send);
-		// if (total_send > 0) {
-		// 	const uint16_t nb_tx = rte_eth_tx_burst(
-		// 	    options.s_portid, tinfo->txqid, tx_bufs,
-		// 	    total_send);
-
-		// 	if (nb_tx != total_send) {
-		// 		rte_exit(
-		// 		    EXIT_FAILURE, "failed to send packet\n");
-		// 	}
-		// }
+		tx_burst_all(options.portid, tinfo->txqid, tx_bufs, total_send);

 		// check rage quit only when we have sent a packet
 		if (last_recv_ts == 0) {
-			last_recv_ts = nm_get_uptime_ns();
+			last_recv_ts = topo_uptime_ns();
 		}
-		if (nm_get_uptime_ns() - last_recv_ts >
-		    options.rage_quit_time * MS2NS) {
+		if (topo_uptime_ns() >
+		    options.rage_quit_time * MS2NS + last_recv_ts) {
 			rte_exit(EXIT_FAILURE,
-			    "rat: thread %d waiting too long for resp. I QUIT!!\n",
+			    "rat: thread %d waiting too long for resp. I F QUIT!\n",
 			    tinfo->id);
 		}
 	}
@ -554,16 +549,16 @@ locore_main(void *tif)
 	uint32_t core_id = rte_lcore_id();

 	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
-	    "locore_main <thread %d>: running on core %d...\n", tinfo->id,
-	    core_id);
+	    "locore_main <thread %d>: running on core %d rxqid %d txqid %d...\n", tinfo->id,
+	    core_id, tinfo->rxqid, tinfo->txqid);

-	if (rte_eth_dev_socket_id(options.s_portid) > 0 &&
-	    rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) {
+	if (rte_eth_dev_socket_id(options.portid) > 0 &&
+	    rte_eth_dev_socket_id(options.portid) != (int)rte_socket_id()) {
 		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
 		    "locore_main <thread %d>: WARNING, port %d is on remote NUMA node to "
 		    "polling thread.\n\tPerformance will "
 		    "not be optimal.\n",
-		    tinfo->id, options.s_portid);
+		    tinfo->id, options.portid);
 	}

 	if (options.slave_mode == 1) {
@ -575,7 +570,7 @@ locore_main(void *tif)
 	while (options.s_state.load() != STATE_RUNNING) {
 	}
 	// store the current timestamp
-	options.s_ts_begin.store(nm_get_uptime_ns());
+	options.s_ts_begin.store(topo_uptime_ns());
 	pkt_loop(tinfo);

 	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main <thread %d>: exited\n",
@ -584,99 +579,6 @@ locore_main(void *tif)
 	return 0;
 }

-static int
-port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
-{
-	struct rte_eth_dev_info dev_info {
-	};
-	struct rte_eth_conf port_conf = port_conf_default;
-	struct rte_eth_txconf txconf {
-	};
-	struct rte_eth_rxconf rxconf {
-	};
-
-	uint16_t nb_rxd = RX_RING_SIZE;
-	uint16_t nb_txd = TX_RING_SIZE;
-
-	if (!rte_eth_dev_is_valid_port(portid)) {
-		return -1;
-	}
-
-	int ret = rte_eth_dev_info_get(portid, &dev_info);
-	if (ret != 0) {
-		return ret;
-	}
-
-	port_conf.rxmode.max_rx_pkt_len = mtu_to_pkt_size(options.port_mtu);;
-	port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
-	port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_NONFRAG_IPV4_UDP |
-	    ETH_RSS_L2_PAYLOAD | ETH_RSS_NONFRAG_IPV4_TCP;
-	port_conf.rx_adv_conf.rss_conf.rss_key = nullptr;
-	port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM;
-	port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM;
-	port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH;
-	port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
-	port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
-	port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;
-	if (options.jumbo_frame_enabled) {
-		port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME;
-	}
-
-	/* Configure the Ethernet device. */
-	ret = rte_eth_dev_configure(
-	    portid, options.s_num_threads, options.s_num_threads, &port_conf);
-	if (ret != 0)
-		return ret;
-
-	ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
-	if (ret != 0)
-		return ret;
-
-	/* Allocate and set up 1 RX queue per thread . */
-	rxconf = dev_info.default_rxconf;
-
-	if (options.jumbo_frame_enabled) {
-		rxconf.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME;
-	}
-	rxconf.offloads = port_conf.rxmode.offloads;
-	for (uint32_t i = 0; i < options.s_num_threads; i++) {
-		ret = rte_eth_rx_queue_setup(portid,
-		    options.s_thr_info.at(i)->rxqid, nb_rxd,
-		    rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool);
-		if (ret < 0)
-			return ret;
-	}
-
-	txconf = dev_info.default_txconf;
-	txconf.offloads = port_conf.txmode.offloads;
-	/* Allocate and set up 1 TX queue per Ethernet port. */
-	for (uint32_t i = 0; i < options.s_num_threads; i++) {
-		ret = rte_eth_tx_queue_setup(portid,
-		    options.s_thr_info.at(i)->txqid, nb_txd,
-		    rte_eth_dev_socket_id(portid), &txconf);
-		if (ret < 0)
-			return ret;
-	}
-
-	// set mtu
-	ret = rte_eth_dev_set_mtu(portid, options.port_mtu);
-	if (ret != 0)
-		return ret;
-
-	ret = rte_eth_dev_start(portid);
-	if (ret < 0)
-		return ret;
-
-	/* Display the port MAC address. */
-	struct rte_ether_addr addr {
-	};
-	ret = rte_eth_macaddr_get(portid, &addr);
-
-	// no promiscuous mode required
-
-	return ret;
-}
-
 static void
 dump_options()
 {
@ -694,13 +596,13 @@ dump_options()
 	    "    depth = %u\n"
 	    "    packet loss time threshold = %u\n"
 	    "    jumbo frame = %d\n"
-	    "    packet pad size = %d\n",
+	    "    packet pad size = %d\n"
+	    "    portid = %d\n",
 	    ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.run_time,
-	    options.s_num_threads, options.rage_quit_time,
-	    options.slave_mode, options.ia_gen, options.ld_gen,
-	    options.target_qps, options.s_host_spec.ip, options.depth,
-	    options.pkt_loss_delay_ms, options.jumbo_frame_enabled,
-	    options.pkt_pad_sz);
+	    options.s_num_threads, options.rage_quit_time, options.slave_mode,
+	    options.ia_gen, options.ld_gen, options.target_qps,
+	    options.s_host_spec.ip, options.depth, options.pkt_loss_delay_ms,
+	    options.jumbo_frame_enabled, options.pkt_pad_sz, options.portid);
 }

 static void
@ -722,14 +624,13 @@ usage()
 	    "    -D: max number of packets in flight\n"
 	    "    -l: packet loss time threshold\n"
 	    "    -J: enable jumbo frame\n"
-	    "    -P: pad load packets to this size\n");
+	    "    -P: pad load packets to this size\n"
+	    "    -p: portid\n");
 }

 int
 main(int argc, char *argv[])
 {
-	unsigned int nb_ports;
-	struct rte_mempool *mbuf_pool;
 	struct thread_info *tinfo;
 	bool has_host_spec = false;

@ -749,8 +650,8 @@ main(int argc, char *argv[])
 	{
 		int c;
 		// parse arguments
-		while ((c = getopt(
-			    argc, argv, "vht:s:SA:i:w:r:q:H:D:l:JP:")) != -1) {
+		while ((c = getopt(argc, argv,
+			    "vht:s:SA:i:w:r:q:H:D:l:JP:p:")) != -1) {
 			switch (c) {
 			case 'v':
 				ntr_set_level(NTR_DEP_USER1,
@ -763,8 +664,8 @@ main(int argc, char *argv[])
 				options.run_time = strtol(optarg, nullptr, 10);
 				break;
 			case 's':
-				if (str_to_netspec(
-					optarg, &options.server_spec) != 0) {
+				if (str_to_netspec(optarg,
+					&options.server_spec) != 0) {
 					rte_exit(EXIT_FAILURE,
 					    "invalid server net spec\n");
 				}
@ -776,9 +677,11 @@ main(int argc, char *argv[])
 				break;
 			case 'A':
 				cpulist_to_cpuset(optarg, &options.cpu_set);
-				options.s_num_threads = CPU_COUNT(&options.cpu_set);
+				options.s_num_threads = CPU_COUNT(
+				    &options.cpu_set);
 				if (options.s_num_threads == 0) {
-					rte_exit(EXIT_FAILURE, "invalid cpu mask %s\n", optarg);
+					rte_exit(EXIT_FAILURE,
+					    "invalid cpu mask %s\n", optarg);
 				}
 				break;
 			case 'i':
@ -790,17 +693,17 @@ main(int argc, char *argv[])
 				    sizeof(options.ld_gen) - 1);
 				break;
 			case 'r':
-				options.rage_quit_time = strtol(
-				    optarg, nullptr, 10);
+				options.rage_quit_time = strtol(optarg, nullptr,
+				    10);
 				break;
 			case 'q':
-				options.target_qps = strtol(
-				    optarg, nullptr, 10);
+				options.target_qps = strtol(optarg, nullptr,
+				    10);
 				break;
 			case 'H':
 				has_host_spec = true;
-				if (str_to_netspec(
-					optarg, &options.s_host_spec) != 0) {
+				if (str_to_netspec(optarg,
+					&options.s_host_spec) != 0) {
 					rte_exit(EXIT_FAILURE,
 					    "invalid host net spec.\n");
 				}
@ -812,8 +715,8 @@ main(int argc, char *argv[])
 				}
 				break;
 			case 'l':
-				options.pkt_loss_delay_ms = strtol(
-				    optarg, nullptr, 10);
+				options.pkt_loss_delay_ms = strtol(optarg,
+				    nullptr, 10);
 				if (options.pkt_loss_delay_ms == 0) {
 					options.pkt_loss_delay_ms = UINT32_MAX;
 				}
@ -823,75 +726,108 @@ main(int argc, char *argv[])
 				options.port_mtu = MAX_JUMBO_MTU;
 				break;
 			case 'P':
-				options.pkt_pad_sz = strtol(
-				    optarg, nullptr, 10);
+				options.pkt_pad_sz = strtol(optarg, nullptr,
+				    10);
+				break;
+			case 'p':
+				options.portid = strtol(optarg, nullptr, 10);
 				break;
 			default:
 				usage();
-				rte_exit(
-				    EXIT_FAILURE, "unknown argument: %c\n", c);
+				rte_exit(EXIT_FAILURE, "unknown argument: %c\n",
+				    c);
 			}
 		}
 	}

-	if (options.pkt_pad_sz != 0 && options.pkt_pad_sz > mtu_to_pkt_size(options.port_mtu)) {
-		rte_exit(EXIT_FAILURE, "pkt_pad_sz is too large for mtu %d\n", options.port_mtu);
+	if (options.pkt_pad_sz != 0 &&
+	    options.pkt_pad_sz > mtu_to_pkt_size(options.port_mtu)) {
+		rte_exit(EXIT_FAILURE, "pkt_pad_sz is too large for mtu %d\n",
+		    options.port_mtu);
 	}

 	if (!has_host_spec) {
 		rte_exit(EXIT_FAILURE, "Must specify host IP.\n");
 	}

-	// init nm
-	if (nm_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
-		rte_exit(EXIT_FAILURE, "nm init failed!\n");
+	// init libtopo
+	if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
+	    0) {
+		rte_exit(EXIT_FAILURE, "libtopo init failed!\n");
+	}
+
+	if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
+	    0) {
+		rte_exit(EXIT_FAILURE, "libnms init failed!\n");
 	}

 	dump_options();

-	nb_ports = rte_eth_dev_count_avail();
-	if (nb_ports == 0) {
-		rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
+	// configure memory and port
+	struct port_conf pconf;
+	struct device_conf dconf;
+	struct mem_conf mconf;
+	portconf_get(options.portid, &pconf);
+	if (!pconf.timesync) {
+		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
+		    "main: timesync disabled. hw timestamp unavailable.\n ");
 	}
+	dconf.mtu = options.port_mtu;
+	dconf.num_threads = options.s_num_threads;
+	dconf.portid = options.portid;
+	dconf.rss_hf = pconf.rss_hf;
+	dconf.rx_offloads = pconf.rxoffload;
+	dconf.tx_offloads = pconf.txoffload;
+	dconf.timesync = pconf.timesync;

-	uint16_t portid = rte_eth_find_next(0);
-	if (portid == RTE_MAX_ETHPORTS) {
-		rte_exit(EXIT_FAILURE, "cannot find an available port\n");
-	}
-	options.s_portid = portid;
+	dconf.rx_fn = nullptr;
+	dconf.rx_user = nullptr;
+	dconf.rx_ring_sz = 2048;
+	dconf.tx_fn = nullptr;
+	dconf.tx_user = nullptr;
+	dconf.tx_ring_sz = 2048;

-	if (rte_eth_macaddr_get(portid, &options.s_host_spec.mac_addr) != 0) {
+	mconf.cache_size = 512;
+	mconf.priv_size = 0;
+	mconf.num_elements = (dconf.rx_ring_sz + dconf.tx_ring_sz) *
+	    rte_lcore_count() / rte_socket_count();
+	mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_JUMBO_MTU -
+	    MAX_STANDARD_MTU;
+	mconf.max_pools = -1;
+
+	dpdk_init(&dconf, &mconf);
+
+	if (rte_eth_macaddr_get(options.portid,
+		&options.s_host_spec.mac_addr) != 0) {
 		rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
-		    portid);
+		    options.portid);
 	}

-	// create a mbuf memory pool on the socket
-	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT,
-	    MBUF_CACHE_SIZE, 0,
-	    options.jumbo_frame_enabled ?
-		      RTE_MBUF_DEFAULT_BUF_SIZE + (MAX_JUMBO_MTU - MAX_STANDARD_MTU) :
-		      RTE_MBUF_DEFAULT_BUF_SIZE,
-	    rte_eth_dev_socket_id(options.s_portid));
-	if (mbuf_pool == nullptr) {
-		rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
-	}
-	options.mbuf_pool = mbuf_pool;
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+	    "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n",
+	    options.portid, options.s_host_spec.mac_addr.addr_bytes[0],
+	    options.s_host_spec.mac_addr.addr_bytes[1],
+	    options.s_host_spec.mac_addr.addr_bytes[2],
+	    options.s_host_spec.mac_addr.addr_bytes[3],
+	    options.s_host_spec.mac_addr.addr_bytes[4],
+	    options.s_host_spec.mac_addr.addr_bytes[5]);

 	unsigned int cpuset_idx = CPU_FFS(&options.cpu_set);
 	unsigned int tid = 0;
-	while(cpuset_idx != 0) {
+	while (cpuset_idx != 0) {
 		unsigned int lcore_id = cpuset_idx - 1;
 		tinfo = new thread_info;
 		tinfo->ia_gen = createGenerator(options.ia_gen);
 		tinfo->load_gen = createGenerator(options.ld_gen);
 		if (tinfo->ia_gen == nullptr || tinfo->load_gen == nullptr) {
-			rte_exit(
-			    EXIT_FAILURE, "invalid ia_gen or ld_gen string\n");
+			rte_exit(EXIT_FAILURE,
+			    "invalid ia_gen or ld_gen string\n");
 		}
 		tinfo->ia_gen->set_lambda((double)options.target_qps /
 		    (double)(options.s_num_threads));
 		tinfo->id = tid;
 		tinfo->lcore_id = lcore_id;
+		tinfo->socket_id = rte_lcore_to_socket_id(lcore_id);
 		tinfo->rxqid = tid;
 		tinfo->txqid = tid;
 		options.s_thr_info.push_back(tinfo);
@ -901,19 +837,6 @@ main(int argc, char *argv[])
 		cpuset_idx = CPU_FFS(&options.cpu_set);
 	}

-	if (port_init(portid, mbuf_pool) != 0) {
-		rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid);
-	}
-
-	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
-	    "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid,
-	    options.s_host_spec.mac_addr.addr_bytes[0],
-	    options.s_host_spec.mac_addr.addr_bytes[1],
-	    options.s_host_spec.mac_addr.addr_bytes[2],
-	    options.s_host_spec.mac_addr.addr_bytes[3],
-	    options.s_host_spec.mac_addr.addr_bytes[4],
-	    options.s_host_spec.mac_addr.addr_bytes[5]);
-
 	sleep(INIT_DELAY);

 	for (unsigned int i = 0; i < options.s_num_threads; i++) {
@ -958,7 +881,7 @@ main(int argc, char *argv[])
 	uint32_t qps;
 	uint32_t total_recv;
 	uint32_t total_loss;
-	calc_stats(nm_get_uptime_ns(), &qps, &total_recv, &total_loss);
+	calc_stats(topo_uptime_ns(), &qps, &total_recv, &total_loss);
 	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "qps = %d, recv = %d, loss = %d\n",
 	    qps, total_recv, total_loss);

@ -969,7 +892,7 @@ main(int argc, char *argv[])
 	}

 	// clean up
-	rte_eth_dev_stop(portid);
+	dpdk_cleanup(&dconf);

 	return 0;
 }
--- a/scripts/libs/libtc.py
+++ b/scripts/libs/libtc.py
@ -1,3 +1,4 @@
+from lib2to3.refactor import get_fixers_from_package
 import subprocess as sp
 import time
 import select
@ -95,12 +96,14 @@ def check_stderr(p, sel, exclude = []):# -> tuple[bool, list[str]]:

 	good = True
 	for e in err:
+		e = e.strip()
 		if len(e) == 0:
 			continue
 		
+		good = False
 		for exc in exclude:
-			if not (exc in err):
-				good = False
+			if exc in e:
+				good = True
 				break
 			
 	return good, err
@ -124,9 +127,10 @@ def thr_check_stderr(p : sp.Popen, name: str, exclude):
 			if not status:
 				errthr_failed = True
 				local_failed = True
-				log_print("Error detected in \"" + name + "\":\n")
+				log_print("Error detected in \"" + name + "\":")
 				for e in err:
-					log_print("        " + e + "\n")
+					log_print("        \"" + e + "\"")
+				log_print("")
 		time.sleep(random.uniform(0.001, 0.1))

 def errthr_start():
--- a/scripts/run.py
+++ b/scripts/run.py
@ -18,15 +18,15 @@ loadgen_load = "fixed:0"
 # pkt_pad
 jumbo_frame_threshold = 1518
 pkt_pads = [
+	#"9018",
+	#1518",
 	"0",
 	"256",
 	"512",
-	"1024",
-	#"1518",
+	"1024"
 	#"2048",
 	#"4096",
-	#"8192",
-	#"9018"
+	#"8192"
 ]

 pkt_pads_depth = {}
@ -74,25 +74,32 @@ memgen_target = [


 # paths
-test_dir = "/numam.d/build"
+test_dir = "/numam.d/build/bin"
 file_dir = os.path.dirname(os.path.realpath(__file__))
 root_dir = os.path.join(file_dir,"..")
 sample_filename = "sample.txt"

 affinity = [
-	#"1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47",
-	"49,51,53,55,57,59,61,63,65,67,69,71,73,75,77,79,81,83,85,87,89,91,93,95"
+    "1,3,5,7,9,11,13,15,17,19,21,23",
+	"65,67,69,71,73,75,77,79,81,83,85,87,89,91,93,95,97,99,101,103,105,107,109,111,113,115,117,119,121,123,125,127",
+	"1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63",
+	"1,3,5,7,9,11,13,15",
+	"17,19,21,23,25,27,29,31",
+	"33,35,37,39,41,43,45,47",
+	"49,51,53,55,57,59,61,63"
 ]

-master = ["skylake2.rcs.uwaterloo.ca"]
-master_spec = ["192.168.123.10@3c:15:fb:c9:f3:36"]
+server = ["skylake5.rcs.uwaterloo.ca"]
+server_spec = ["192.168.123.13@3c:15:fb:c9:f3:28"]
 master_cpumask = "2" # 1 thread

-server = ["icelake2-int.rcs.uwaterloo.ca"]
-server_spec = ["192.168.123.9@3c:ec:ef:61:39:f3"]
+master = ["icelake2-int.rcs.uwaterloo.ca"]
+master_spec = ["192.168.123.9@40:a6:b7:7c:86:10"]
+#server = ["milan1-int.rcs.uwaterloo.ca"]
+#server_spec = ["192.168.123.9@00:07:43:54:37:08"]

-clients = ["skylake3.rcs.uwaterloo.ca", "skylake5.rcs.uwaterloo.ca", "skylake6.rcs.uwaterloo.ca"]# "skylake7.rcs.uwaterloo.ca", "skylake8.rcs.uwaterloo.ca"]
-client_spec = ["192.168.123.12@3c:15:fb:c9:f3:4b", "192.168.123.13@3c:15:fb:c9:f3:28", "192.168.123.14@3c:15:fb:62:9b:2f"] #, "192.168.123.13@3c:15:fb:62:9c:be"]
+clients = ["skylake2.rcs.uwaterloo.ca"]#, "skylake5.rcs.uwaterloo.ca", "skylake6.rcs.uwaterloo.ca"]# "skylake7.rcs.uwaterloo.ca", "skylake8.rcs.uwaterloo.ca"]
+client_spec = ["192.168.123.12@3c:15:fb:c9:f3:36"]#, "192.168.123.13@3c:15:fb:c9:f3:28", "192.168.123.14@3c:15:fb:62:9b:2f"] #, "192.168.123.13@3c:15:fb:62:9c:be"]
 client_cpumask = "1,3,5,7,9,11,13,15,17,19,21,23"

 client_rage_quit = 1000 #1s
@ -191,7 +198,7 @@ def run_exp(affinity : str, ld : int, pkt_pad : str, aff_idx : int):
 		p = sp[0]

 		# launch stderr monitoring thread
-		exclude = ["Pseudo-terminal"]
+		exclude = ["Pseudo-terminal", "ice_", "i40e_"]
 		tc.errthr_create([p], master, exclude)
 		if not client_only:
 			tc.errthr_create(ssrv, server, exclude)
--- a/scripts/setup_dpdk.sh
+++ b/scripts/setup_dpdk.sh
@ -1,7 +1,8 @@
 #!/bin/sh
 dpdk_dir="/dpdk"
+libtopo_dir="/libtopo"
 root="$(dirname "$0")/.."
-servers="skylake3.rcs.uwaterloo.ca" # skylake5.rcs.uwaterloo.ca skylake6.rcs.uwaterloo.ca skylake7.rcs.uwaterloo.ca skylake8.rcs.uwaterloo.ca" 
+servers="skylake2.rcs.uwaterloo.ca skylake3.rcs.uwaterloo.ca skylake5.rcs.uwaterloo.ca skylake6.rcs.uwaterloo.ca icelake2-int.rcs.uwaterloo.ca milan1-int.rcs.uwaterloo.ca" 
 rsync_flags="-az"
 ssh_args="-o StrictHostKeyChecking=no -p77"

@ -18,8 +19,8 @@ echo "USER: $user"
 compile() {
    # separate these functions because we might change kernel (reboot) without needing to recompile
    echo "====================$1===================="
-    #ssh $(echo $ssh_args $user@$1) "sudo reboot"
-    ssh $(echo $ssh_args $user@$1) "sudo sh -c \"rm -rf $dpdk_dir; mkdir -p $dpdk_dir; cd $dpdk_dir; git clone https://git.quacker.org/d/numam-dpdk; cd numam-dpdk; git checkout releases-13.0; meson -Denable_kmods=true build; cd build; ninja install\""
+    ssh $(echo $ssh_args $user@$1) "sudo sh -c \"sudo rm -rf $libtopo_dir; sudo rm -rf /usr/local/include/libtopo; sudo rm -rf /usr/local/lib/libtopo;sudo mkdir -p $libtopo_dir; sudo chmod 777 $libtopo_dir; cd $libtopo_dir; git clone https://git.quacker.org/d/libtopo; cd libtopo; mkdir build; cd build; cmake ../; sudo make install\""
+    ssh $(echo $ssh_args $user@$1) "sudo sh -c \"sudo pkg install -y meson pkgconf py38-pyelftools; sudo rm -rf $dpdk_dir; sudo mkdir -p $dpdk_dir; sudo chmod 777 $dpdk_dir; cd $dpdk_dir; git clone https://git.quacker.org/d/numam-dpdk; cd numam-dpdk; git checkout migration; CC=gcc CXX=g++ meson -Denable_kmods=true build; cd build; ninja install\""
    wait
    echo "$1 Done."
    echo ""
--- a/scripts/setup_program.sh
+++ b/scripts/setup_program.sh
@ -1,7 +1,7 @@
 #!/bin/sh
 dpdk_dir="/numam.d"
 root="$(dirname "$0")/.."
-servers="icelake2-int.rcs.uwaterloo.ca" # skylake7.rcs.uwaterloo.ca skylake8.rcs.uwaterloo.ca icelake2-int.rcs.uwaterloo.ca" 
+servers="skylake2.rcs.uwaterloo.ca skylake3.rcs.uwaterloo.ca skylake5.rcs.uwaterloo.ca skylake6.rcs.uwaterloo.ca icelake2-int.rcs.uwaterloo.ca milan1-int.rcs.uwaterloo.ca" 
 rsync_flags="-rv -e \"ssh -p77\""
 ssh_args="-o StrictHostKeyChecking=no -p77"

--- a/test/ts.cc
+++ b/test/ts.cc
@ -1,84 +0,0 @@
-#include <unistd.h>
-#include "defs.hh"
-#include "nm.hh"
-#include <ctime>
-
-void test(const char * case_name, struct timespec * ts)
-{
-    uint64_t slow;
-    uint64_t fast;
-
-    slow = get_uptime();
-    if (nanosleep(ts, nullptr) != 0) {
-        perror("nanosleep() interrupted!");
-        exit(-1);
-    }
-    slow = get_uptime() - slow;
-
-    fast = nm_get_uptime_ns();
-    if (nanosleep(ts, nullptr) != 0) {
-        perror("nanosleep() interrupted!");
-        exit(-1);
-    }
-    fast = nm_get_uptime_ns() - fast;
-
-    printf("%s: get_uptime(): %lu, nm_get_uptime_ns(): %lu\n", case_name, slow, fast);
-}
-
-
-int main()
-{
-    struct timespec ts;
-    nm_init(0);
-    // 1s
-    ts.tv_nsec = 0;
-    ts.tv_sec = 1;
-    test("1s", &ts);
-
-    // 100ms
-    ts.tv_nsec = 100000000;
-    ts.tv_sec = 0;
-    test("100ms", &ts);
-
-    // 10ms
-    ts.tv_nsec = 10000000;
-    ts.tv_sec = 0;
-    test("10ms", &ts);
-
-    // 1ms
-    ts.tv_nsec = 1000000;
-    ts.tv_sec = 0;
-    test("1ms", &ts);
-
-    // 100us
-    ts.tv_nsec = 100000;
-    ts.tv_sec = 0;
-    test("100us", &ts);
-
-    // 10us
-    ts.tv_nsec = 10000;
-    ts.tv_sec = 0;
-    test("10us", &ts);
-
-    // 1us
-    ts.tv_nsec = 1000;
-    ts.tv_sec = 0;
-    test("1us", &ts);
-
-    // 100ns
-    ts.tv_nsec = 100;
-    ts.tv_sec = 0;
-    test("100ns", &ts);
-
-    // 10ns
-    ts.tv_nsec = 10;
-    ts.tv_sec = 0;
-    test("10ns", &ts);
-
-    // 1ns
-    ts.tv_nsec = 1;
-    ts.tv_sec = 0;
-    test("1ns", &ts);
-
-    return 0;
-}
--- a/tests/nms_test.c
+++ b/tests/nms_test.c
@ -0,0 +1,32 @@
+#include "nms.h"
+#include <assert.h>
+#include <stdio.h>
+
+int main(void)
+{
+    void * ret;
+
+    nms_init(1);
+    // duplicate init
+    nms_init(1);
+
+    // 1G 
+    ret = nms_malloc(0, 1024 * 1024 * 1024);
+    assert(ret != NULL);
+    printf("1G: %p\n", ret);
+
+    // two 511Ms 
+    ret = nms_malloc(0, 511 * 1024 * 1024);
+    assert(ret != NULL);
+    printf("511M: %p\n", ret);
+    ret = nms_malloc(0, 511 * 1024 * 1024);
+    assert(ret != NULL);
+    printf("511M: %p\n", ret);
+
+    // another 1G
+    ret = nms_malloc(0, 1024 * 1024 * 1024);
+    assert(ret != NULL);
+    printf("1G: %p\n", ret);
+
+    return 0;
+}
--- a/util/memloadgen.cc
+++ b/util/memloadgen.cc
@ -1,8 +1,9 @@
-#include "nm.hh"
+#include "gen.hh"
 #include <cstdlib>
 #include "ntr.h"
 #include <getopt.h>
 #include <unistd.h>
+#include <topo.h>

 static void
 usage()
@ -12,7 +13,9 @@ usage()
 		"    -v: verbose mode\n"
 	    "    -b: MLG bytes per second\n"
 	    "    -x: MLG thread affinity mask\n"
-	    "    -X: MLG target domain affinity mask\n");
+	    "    -X: MLG target domain affinity mask\n"
+		"    -S: shared buffer\n"
+		"    -i: iterations\n");
 	fflush(stdout);
 }

@ -21,14 +24,15 @@ int main(int argc, char * argv[])
 	ntr_init();
 	ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);

-	unsigned long long mlg_arrsz = 0;
-	uint32_t mlg_iter = -1;
-	unsigned long long mlg_dmask = 0;
-	unsigned long long mlg_cmask = 0;
+	size_t arr_sz = 0;
+	uint32_t iter = -1;
+	cpuset_t threads;
+	int shared_buffer = 0;
+	cpuset_t domain_mask;
 	{
 		int c;
 		// parse arguments
-		while ((c = getopt(argc, argv, "hb:X:x:vi:")) != -1) {
+		while ((c = getopt(argc, argv, "hb:X:x:vi:S")) != -1) {
 			switch (c) {
 			case 'v':
 				ntr_set_level(NTR_DEP_USER1,
@ -38,18 +42,19 @@ int main(int argc, char * argv[])
 				usage();
 				exit(0);
 			case 'b':
-				mlg_arrsz = strtoull(optarg, nullptr, 10);
+				arr_sz = strtoull(optarg, nullptr, 10);
 				break;
 			case 'i':
-				mlg_iter = strtoul(optarg, nullptr, 10);
+				iter = strtoul(optarg, nullptr, 10);
 				break;
 			case 'X':
-				mlg_dmask = strtoull(
-				    optarg, nullptr, 16);
+				cpulist_to_cpuset(optarg, &domain_mask);
 				break;
 			case 'x':
-				mlg_cmask = strtoull(
-				    optarg, nullptr, 16);
+				cpulist_to_cpuset(optarg, &threads);
+				break;
+			case 'S':
+				shared_buffer = 1;
 				break;
 			default:
 				usage();
@ -58,18 +63,22 @@ int main(int argc, char * argv[])
 		}
 	}

-	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "MLG: [size: %llu, iter: %u, threads: 0x%llx, domain: 0x%llx]\n", mlg_arrsz, mlg_iter, mlg_cmask, mlg_dmask);
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "MLG: [size: %ld iter: %u, # threads: 0x%d, domain: 0x%ld]\n", arr_sz, iter, CPU_COUNT(&threads), CPU_FFS(&domain_mask) - 1);

-	// init nm
-	if (nm_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
-		fprintf(stderr, "nm init failed!\n");
+	// init topo
+	if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
+		fprintf(stderr, "libtopo init failed!\n");
 		exit(1);
 	}

 	bool success = false;
-    memload_generator * mgen = new memload_generator(mlg_cmask, mlg_dmask, mlg_arrsz, mlg_iter, &success);
+	memload_generator::memload_generator_options opts;
+	opts.chunk_size = arr_sz;
+	opts.iteration = iter;
+	opts.shared_buffer = shared_buffer;
+	opts.verbose = 1;

-	mgen->start();
+    auto mgen = new memload_generator(&threads, &domain_mask, &opts, &success);
 	while(!mgen->check_done()) {
 		usleep(10000);
 	}
@ -78,6 +87,8 @@ int main(int argc, char * argv[])
 	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
 			"main: MLG bps = %ld ~= %ldM\n", bps, bps / 1024 / 1024);

+	delete mgen;
+
 	return 0;
 }