update various components for new machines

2022-05-25 06:55:01 -04:00 · 2022-05-25 06:55:01 -04:00 · a716583b19
commit a716583b19
parent d217bde46a
24 changed files with 551 additions and 291 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -57,10 +57,15 @@ target_compile_options(birb PRIVATE ${CC_FLAGS} ${SPDK_CFLAGS} ${UUID_CFLAGS})
 target_link_directories(birb PRIVATE ${SPDK_LIBRARY_DIRS} ${SPDK_SYS_STATIC_LIBRARY_DIRS} ${UUID_LIBRARY_DIRS})
 target_link_libraries(birb PRIVATE pthread nm ntr gen -Wl,--whole-archive ${SPDK_LIBRARIES} -Wl,--no-whole-archive ${SPDK_SYS_STATIC_LIBRARIES})

-add_executable(posix EXCLUDE_FROM_ALL storage/birb_posix.cc storage/io_gen.cc)
-target_compile_options(posix PRIVATE ${CC_FLAGS})
-target_link_libraries(posix PRIVATE pthread nm ntr gen)
+add_executable(birb_posix EXCLUDE_FROM_ALL storage/birb_posix.cc storage/io_gen.cc)
+target_compile_options(birb_posix PRIVATE ${CC_FLAGS})
+target_link_libraries(birb_posix PRIVATE pthread nm ntr gen)

 add_executable(memloadgen util/memloadgen.cc)
 target_link_libraries(memloadgen PRIVATE pthread nm ntr)
-target_compile_options(memloadgen PRIVATE ${CC_FLAGS})
+target_compile_options(memloadgen PRIVATE ${CC_FLAGS})
+
+add_executable(test_ts test/ts.cc)
+set_target_properties(test_ts PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test)
+target_link_libraries(test_ts PRIVATE nm)
+target_compile_options(test_ts PRIVATE ${CC_FLAGS})
--- a/inc/defs.hh
+++ b/inc/defs.hh
@ -1,8 +1,13 @@
 #pragma once

+#include <sys/cpuset.h>
 #include <cstdint>
 #include <cstring>
 #include <immintrin.h>
+#include <ctime>
+#include <cstdio>
+#include <sys/cpuset.h>
+#include <sys/_cpuset.h>

 #define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \
 	TypeName(const TypeName &) = delete; \
@ -30,5 +35,25 @@ cmask_get_num_cpus(const uint64_t mask)
 	return _mm_popcnt_u64(mask);
 }

+static inline uint64_t
+get_uptime()
+{
+	struct timespec tp;
+	clock_gettime(CLOCK_MONOTONIC, &tp);
+    return (tp.tv_sec * S2NS + tp.tv_nsec);
+}
+
+static inline void
+cpulist_to_cpuset(char * cpulist, cpuset_t * cpuset)
+{
+	char * cpu = strtok(cpulist, ",");
+	CPU_ZERO(cpuset);
+
+	while (cpu != nullptr) {
+		CPU_SET(atoi(cpu), cpuset);
+		cpu = strtok(nullptr, ",");
+	}
+}
+
 #define ATTR_UNUSED __attribute__((unused))

--- a/inc/nm.hh
+++ b/inc/nm.hh
@ -1,10 +1,13 @@
 #pragma once

+#include <sys/endian.h>
 #include "gen.hh"
 #include "defs.hh"

 #include <atomic>
 #include <cstdint>
+#include <chrono>
+#include <ctime>

 constexpr static unsigned int NM_LEVEL_NUMA = 0;
 constexpr static unsigned int NM_LEVEL_CPU = 1;
@ -53,13 +56,13 @@ nm_get_node_from_core(int coreid)
 class memload_generator {
    private:
 	DISALLOW_EVIL_CONSTRUCTORS(memload_generator);
-	constexpr static uint32_t FROM_REGION_CNT = 0x2; // 2 regions
 	struct thread_info {
 		pthread_t pthr;
-		std::atomic<uint64_t> num_trans;
-		Generator *ia_gen;
+		uint32_t iter;
+		uint32_t array_size;
 		std::atomic<int> init_status;
 		std::atomic<int> *state;
+		std::atomic<uint32_t> num_trans;
 		constexpr static int INIT_START = 0;
 		constexpr static int INIT_SUCCESS = 1;
 		constexpr static int INIT_FAILED = 2;
@ -67,9 +70,9 @@ class memload_generator {
 		void *to_region;
 		uint32_t to_domainid;
 		uint32_t from_domainid;
+		uint64_t begin_ts;
+		uint64_t stop_ts;
 	};
-	constexpr static uint32_t TRANSACTION_SZ = 0x2000; // transaction sz must >= region_sz
-	constexpr static uint32_t REGION_SZ = 0x2000 * 0x2000; // 64MB per core
 	std::vector<struct thread_info *> thr_infos;

 	std::atomic<int> state;
@ -77,17 +80,18 @@ class memload_generator {
 	constexpr static uint32_t STATE_START = 1;
 	constexpr static uint32_t STATE_STOP = 2;

-	uint64_t begin_ts;
-	uint64_t stop_ts;
+	uint32_t array_size;
+	uint32_t iteration;

 	static void *worker_thrd(void *_tinfo);

    public:
-	memload_generator(uint64_t from_cmask, uint64_t to_cmask, uint64_t bps,
+	memload_generator(uint64_t from_cmask, uint64_t to_cmask, uint32_t array_sz, uint32_t iteration,
 	    bool *success);
 	void start();
 	void stop();
 	uint64_t get_bps();
+	bool check_done();
 	~memload_generator();
 };

--- a/inc/storage/drivers/nvme.hh
+++ b/inc/storage/drivers/nvme.hh
@ -23,6 +23,7 @@ private:
        spdk_nvme_ctrlr ** ctrlr;
        spdk_nvme_ns ** ns;
        const char * dev_name;
+        int valid;
    };

    DISALLOW_EVIL_CONSTRUCTORS(birb_nvme_driver);
--- a/libnm/alloc.cc
+++ b/libnm/alloc.cc
@ -50,7 +50,7 @@ nm_alloc_init()

 		for (unsigned int j = 0; j < MEM_REGION_NUM; j++) {
 			if ((nm_mem_regions[i][j] = mmap(nullptr, MEM_OBJ_NUM * MEM_OBJ_SIZE, PROT_READ | PROT_WRITE,
-			MAP_ANON | MAP_ALIGNED_SUPER | MAP_NOCORE | MAP_PRIVATE,
+			MAP_ANON | MAP_ALIGNED_SUPER | MAP_NOCORE | MAP_PRIVATE | MAP_NOSYNC,
 			-1, 0)) == MAP_FAILED) {
 				if (nm_get_verbose() > 0) {
 					fprintf(stdout, "libnm: mmap failed with %d\n", errno);
--- a/libnm/loadgen.cc
+++ b/libnm/loadgen.cc
@ -1,10 +1,13 @@
 #include "nmp.hh"
+#include <atomic>

 #include <sys/cpuset.h>
 #include <sys/domainset.h>
+#include <sys/endian.h>
 #include <sys/thr.h>
 #include <pthread.h>
 #include <pthread_np.h>
+#include <unistd.h>

 void *
 memload_generator::worker_thrd(void *_tinfo)
@ -19,8 +22,13 @@ memload_generator::worker_thrd(void *_tinfo)
 		    tid, tinfo->from_domainid, tinfo->to_domainid);
 	}

-    tinfo->from_region = nm_malloc(tinfo->from_domainid, REGION_SZ * FROM_REGION_CNT);
-    tinfo->to_region = nm_malloc(tinfo->to_domainid, REGION_SZ * FROM_REGION_CNT);
+	if (tinfo->from_region == nullptr) {
+    	tinfo->from_region = nm_malloc(tinfo->from_domainid, tinfo->array_size);
+	}
+
+	if (tinfo->to_region == nullptr) {
+    	tinfo->to_region = nm_malloc(tinfo->to_domainid, tinfo->array_size);
+	}

    if (tinfo->from_region == nullptr || tinfo->to_region == nullptr) {
 		tinfo->init_status.store(thread_info::INIT_FAILED);
@ -32,16 +40,15 @@ memload_generator::worker_thrd(void *_tinfo)
    }

 	// populate the region with 1/2/3s
-	for(uint i = 0; i < FROM_REGION_CNT; i++) {
-		memset((char*)tinfo->from_region + i * REGION_SZ, i + 1, REGION_SZ);
-	}
+	memset((char*)tinfo->from_region, 1, tinfo->array_size);
+	memset((char*)tinfo->to_region, 2, tinfo->array_size);

 	tinfo->init_status.store(thread_info::INIT_SUCCESS);

 	if (nm_get_verbose() > 0) {
 		fprintf(stdout,
-		    "memload_generator <thread %ld>: init finished, waiting for start...\n",
-		    tid);
+		    "memload_generator <thread %ld>: init finished, from: %p, to: %p, waiting for start...\n",
+		    tid, tinfo->from_region, tinfo->to_region);
 	}

 	while (tinfo->state->load() == STATE_READY) {
@ -51,26 +58,22 @@ memload_generator::worker_thrd(void *_tinfo)
 		fprintf(
 		    stdout, "memload_generator <thread %ld>: running...\n", tid);
 	}
-
-	uint64_t offset = 0;
-	uint64_t next_ts = nm_get_uptime_ns();
+	tinfo->begin_ts = nm_get_uptime_ns();
 	while (tinfo->state->load() == STATE_START) {
-		// generate traffic
-		uint64_t now = nm_get_uptime_ns();
-
-		if (now >= next_ts) {
-			next_ts = next_ts + tinfo->ia_gen->generate() * S2NS;
-			uint64_t to_offset = offset % REGION_SZ;
-			uint64_t from_offset = offset % (REGION_SZ * FROM_REGION_CNT);
-			memcpy((char *)tinfo->to_region + to_offset, (char *)tinfo->from_region + from_offset, TRANSACTION_SZ);
-			offset += TRANSACTION_SZ;
-
+		if (tinfo->num_trans.load() < tinfo->iter) {
+			// generate traffic
+			memcpy((char *)tinfo->to_region, (char *)tinfo->from_region, tinfo->array_size);
 			tinfo->num_trans.fetch_add(1);
+			if (tinfo->num_trans.load() >= tinfo->iter) {
+				tinfo->stop_ts = nm_get_uptime_ns();
+			}
+		} else {
+			usleep(1000 * 100);
 		}
 	}

-	nm_free(tinfo->from_domainid, tinfo->from_region);
-	nm_free(tinfo->to_domainid, tinfo->to_region);
+	//nm_free(tinfo->from_domainid, tinfo->from_region);
+	//nm_free(tinfo->to_domainid, tinfo->to_region);

 	if (nm_get_verbose() > 0) {
 		fprintf(
@ -80,10 +83,12 @@ memload_generator::worker_thrd(void *_tinfo)
 }

 memload_generator::memload_generator(
-    uint64_t from_cmask, uint64_t to_cmask, uint64_t bps, bool *success)
+    uint64_t from_cmask, uint64_t to_cmask, uint32_t array_sz, uint32_t iteration, bool *success)
 {
 	*success = false;
 	state.store(STATE_READY);
+	this->array_size = array_sz;
+	this->iteration = iteration;
 	int nextcore;
 	int to_coreid = cmask_get_next_cpu(&to_cmask);
 	int num_cores = cmask_get_num_cpus(from_cmask);
@ -93,17 +98,22 @@ memload_generator::memload_generator(
 		return;
 	}

-	while ((nextcore = cmask_get_next_cpu(&from_cmask)) != NEXT_CPU_NULL) {
+	void * from = nm_malloc(nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, to_coreid), NM_LEVEL_NUMA)), array_sz);
+	nextcore = cmask_get_next_cpu(&from_cmask);
+	void * to = nm_malloc(nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, nextcore), NM_LEVEL_NUMA)), array_sz);
+	while (nextcore != NEXT_CPU_NULL) {
 		auto *info = new struct thread_info;
 		pthread_attr_t attr;
-		info->ia_gen = createGenerator("exponential");
-		info->ia_gen->set_lambda(((double)(bps) / (double)num_cores) /
-		    (double)(REGION_SZ));
 		info->num_trans.store(0);
 		info->to_domainid = nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, to_coreid), NM_LEVEL_NUMA));
 		info->from_domainid = nm_obj_get_id(nm_obj_find_parent(nm_obj_from_id(NM_LEVEL_CORE, nextcore), NM_LEVEL_NUMA));
 		info->init_status.store(thread_info::INIT_START);
 		info->state = &state;
+		info->iter = this->iteration;
+		info->array_size = this->array_size;
+
+    	info->from_region = from;
+    	info->to_region = to;

 		CPU_ZERO(&cpuset);
 		CPU_SET(nextcore, &cpuset);
@ -118,6 +128,7 @@ memload_generator::memload_generator(
 		}

 		thr_infos.push_back(info);
+		nextcore = cmask_get_next_cpu(&from_cmask);
 	}

 	if (nm_get_verbose() > 0) {
@ -153,31 +164,44 @@ memload_generator::start()
 {
 	if (this->state.load() == STATE_READY) {
 		state.store(STATE_START);
-		begin_ts = nm_get_uptime_ns();
 	}
 }

 void
 memload_generator::stop()
 {
-	if (this->state.load() != STATE_STOP) {
-		stop_ts = nm_get_uptime_ns();
+	if (this->state.load() == STATE_START) {
 		state.store(STATE_STOP);
 	}
 }

+bool
+memload_generator::check_done()
+{
+	bool done = true;
+	for (auto i : thr_infos) {
+		done = done && (i->num_trans.load() >= this->iteration);
+	}
+	if (done) {
+		stop();
+	}
+	return done;
+}
+
 uint64_t
 memload_generator::get_bps()
 {
-	uint64_t now = state.load() == STATE_STOP ? stop_ts :
-							  nm_get_uptime_ns();
-	uint64_t total_transactions = 0;
-	for (auto i : thr_infos) {
-		total_transactions += i->num_trans.load();
+	if (this->state.load() == STATE_STOP) {
+		uint64_t total_transactions = 0;
+		uint64_t total_time = 0;
+		for (auto i : thr_infos) {
+			total_transactions += i->num_trans.load();
+			total_time = (i->stop_ts - i->begin_ts) + total_time;
+		}
+		return (double)(total_transactions * this->array_size) / ((double)total_time / thr_infos.size() / S2NS);
+	} else {
+		return 0;
 	}
-
-	return (double)(TRANSACTION_SZ * total_transactions) /
-	    (double)((now - begin_ts) / (S2NS));
 }

 memload_generator::~memload_generator()
@ -188,7 +212,6 @@ memload_generator::~memload_generator()

 	for (auto i : thr_infos) {
 		pthread_join(i->pthr, nullptr);
-		delete i->ia_gen;
 		delete i;
 	}
 }
--- a/libnm/nm.cc
+++ b/libnm/nm.cc
@ -25,14 +25,17 @@ uint64_t
 nm_get_uptime_ns()
 {
 	unsigned int dummy;
-	return nm_tsc2ns(__rdtscp(&dummy));
+	_mm_lfence();
+	uint64_t tsc = __rdtscp(&dummy);
+	_mm_lfence();
+	return nm_tsc2ns(tsc);
 }

 uint64_t
 nm_tsc2ns(uint64_t tsc)
 {
 	return (uint64_t)(
-	    (double)tsc / (double)sysctl_tsc_freq * (double)1000000000ul);
+	    (double)tsc / (double)sysctl_tsc_freq * S2NS);
 }

 // 0 on success
--- a/net/cat.cc
+++ b/net/cat.cc
@ -22,7 +22,7 @@
 #include <random>
 #include <vector>

-constexpr static unsigned int MBUF_MAX_COUNT = 65536;
+#define MBUF_MAX_COUNT (rte_lcore_count() * 4096)
 constexpr static unsigned int MBUF_CACHE_SIZE = 512;
 constexpr static unsigned int RX_RING_SIZE = 1024;
 constexpr static unsigned int TX_RING_SIZE = 1024;
@ -61,7 +61,7 @@ struct options_t {
 	unsigned int master_mode { 0 };
 	struct net_spec server_spec {
 	};
-	uint64_t cpu_mask { 0x4 }; // 2nd  core
+	cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 2nd  core
 	std::vector<struct net_spec *> slaves;
 	uint32_t pkt_loss_failure_threshold { 0 };
 	uint32_t pkt_loss_time_ms { UINT32_MAX };
@ -748,14 +748,14 @@ dump_options()
 	    "    run time = %d\n"
 	    "    warmup time = %d\n"
 	    "    output file = %s\n"
-	    "    cpu mask = 0x%lx\n"
+	    "    number of threads = %d\n"
 	    "    interarrival dist = %s\n"
 	    "    target qps = %d\n"
 	    "    host IP = 0x%x\n"
 	    "    pkt loss time = %u\n"
 	    "    pkt loss failure threshold = %u\n",
 	    ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.run_time,
-	    options.warmup_time, options.output, options.cpu_mask,
+	    options.warmup_time, options.output, CPU_COUNT(&options.cpu_set),
 	    options.ia_gen_str, options.target_qps, options.s_host_spec.ip,
 	    options.pkt_loss_time_ms, options.pkt_loss_failure_threshold);

@ -858,8 +858,7 @@ main(int argc, char *argv[])
 				    sizeof(options.output) - 1);
 				break;
 			case 'A':
-				options.cpu_mask = strtoull(
-				    optarg, nullptr, 16);
+				cpulist_to_cpuset(optarg, &options.cpu_set);
 				break;
 			case 'i':
 				strncpy(options.ia_gen_str, optarg,
@ -959,11 +958,12 @@ main(int argc, char *argv[])
 	    options.s_host_spec.mac_addr.addr_bytes[3],
 	    options.s_host_spec.mac_addr.addr_bytes[4],
 	    options.s_host_spec.mac_addr.addr_bytes[5]);
-	uint64_t cmask = options.cpu_mask;
-	const int16_t core_id = cmask_get_next_cpu(&cmask);
-	if (core_id == NEXT_CPU_NULL) {
-		rte_exit(EXIT_FAILURE, "invalid cpu mask 0x%lx\n", cmask);
+
+	int core_id = CPU_FFS(&options.cpu_set);
+	if (core_id == 0) {
+		rte_exit(EXIT_FAILURE, "invalid cpu list\n");
 	}
+	core_id--;

 	sleep(INIT_DELAY);
 	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
@ -1009,7 +1009,6 @@ main(int argc, char *argv[])
 				 << it->srv_sw_rx << ',' << it->srv_sw_tx << ','
 				 << it->srv_hw_rx << ',' << it->srv_hw_tx
 				 << std::endl;
-			printf("Writing ... datapt %p", it);
 		}
 		delete it;
 	}
--- a/net/khat.cc
+++ b/net/khat.cc
@ -15,10 +15,15 @@
 #include "net/util.hh"

 #include <atomic>
+#include <cstdio>
+#include <cstring>
 #include <ctime>
 #include <vector>
+#include <sys/_cpuset.h>
+#include <sys/cpuset.h>
+#include <sys/endian.h>

-constexpr static unsigned int MBUF_MAX_COUNT = 65536;
+#define MBUF_MAX_COUNT (rte_lcore_count() * 4096)
 constexpr static unsigned int MBUF_CACHE_SIZE = 512;
 constexpr static unsigned int RX_RING_SIZE = 2048;
 constexpr static unsigned int TX_RING_SIZE = 2048;
@ -76,16 +81,15 @@ struct probe_state_t {
 struct options_t {
 	// config
 	int num_threads { 1 };
-	uint64_t cpuset { 0x4 };  // 2nd core
-	uint64_t memmask { 0x0 }; // same socket as the NIC
+	cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2);  // 2nd core
 	char mempool_name_buf[MEMPOOL_NAME_BUF_LEN];
 	bool jumbo_frame_enabled { false }; // setting this to true changes mbuf size and mtu
 	int port_mtu { MAX_STANDARD_MTU };
 	int thread_cacheline_cnt = { 128 };
 	bool mlg_enabled { false };
 	uint64_t mlg_bps { 0 };
-	uint64_t mlg_cmask { 0 };
-	uint64_t mlg_dmask { 0 };
+	cpuset_t mlg_cset = CPUSET_T_INITIALIZER(0x2);
+	cpuset_t mlg_dset = CPUSET_T_INITIALIZER(0x1);
 	memload_generator *mlg { nullptr };
 	// states
 	uint16_t s_portid { 0 };
@ -539,7 +543,6 @@ usage()
 	    "    -v(vv): verbose mode\n"
 	    "    -h: seek help\n"
 	    "    -A: cpu mask for worker threads\n"
-	    "    -M: mempool socket affinity mask\n"
 	    "    -m: enable memory load generator(MLG)\n"
 	    "    -b: MLG bytes per second\n"
 	    "    -x: MLG thread affinity mask\n"
@ -556,15 +559,12 @@ dump_options()
 	    "main: khat configuration:\n"
 	    "          verbosity: +%d\n"
 	    "          thread count: %d\n"
-	    "          cpu mask: 0x%lx\n"
-	    "          mempool mask: 0x%lx\n"
 	    "          ip: 0x%x\n"
-	    "          MLG: %s [bps: %ld, threads: 0x%lx, domain: 0x%lx]\n"
+	    "          MLG: %s [bps: %ld, thread cnt: %d, domain: %ld]\n"
 		"          jumbo frame: %d\n",
-	    ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING,
-	    options.num_threads, options.cpuset, options.memmask,
+	    ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.num_threads,
 	    options.s_host_spec.ip, options.mlg_enabled ? "on" : "off",
-	    options.mlg_bps, options.mlg_cmask, options.mlg_dmask, options.jumbo_frame_enabled);
+	    options.mlg_bps, CPU_COUNT(&options.mlg_cset), CPU_FFS(&options.mlg_dset) - 1, options.jumbo_frame_enabled);
 }

 int
@ -590,7 +590,7 @@ main(int argc, char *argv[])
 	{
 		int c;
 		// parse arguments
-		while ((c = getopt(argc, argv, "hvA:M:H:mb:X:x:J")) != -1) {
+		while ((c = getopt(argc, argv, "hvA:H:mb:X:x:J")) != -1) {
 			switch (c) {
 			case 'v':
 				ntr_set_level(NTR_DEP_USER1,
@ -600,17 +600,13 @@ main(int argc, char *argv[])
 				usage();
 				rte_exit(EXIT_SUCCESS, "\n");
 			case 'A':
-				options.cpuset = strtoull(optarg, nullptr, 16);
-				options.num_threads = cmask_get_num_cpus(
-				    options.cpuset);
+				cpulist_to_cpuset(optarg, &options.cpu_set);
+				options.num_threads = CPU_COUNT(&options.cpu_set);
 				if (options.num_threads == 0) {
 					rte_exit(EXIT_FAILURE,
 					    "must run at least one thread\n");
 				}
 				break;
-			case 'M':
-				options.memmask = strtoull(optarg, nullptr, 16);
-				break;
 			case 'H':
 				if (str_to_netspec(
 					optarg, &options.s_host_spec) != 0) {
@ -626,12 +622,10 @@ main(int argc, char *argv[])
 				options.mlg_bps = strtoull(optarg, nullptr, 10);
 				break;
 			case 'X':
-				options.mlg_dmask = strtoull(
-				    optarg, nullptr, 16);
+				cpulist_to_cpuset(optarg, &options.mlg_dset);
 				break;
 			case 'x':
-				options.mlg_cmask = strtoull(
-				    optarg, nullptr, 16);
+				cpulist_to_cpuset(optarg, &options.mlg_cset);
 				break;
 			case 'J':
 				options.jumbo_frame_enabled = true;
@ -657,14 +651,14 @@ main(int argc, char *argv[])
 	dump_options();

 	// init mlg
-	if (options.mlg_enabled) {
-		bool success = false;
-		options.mlg = new memload_generator(options.mlg_cmask,
-		    options.mlg_dmask, options.mlg_bps, &success);
-		if (!success) {
-			rte_exit(EXIT_FAILURE, "failed to init mlg\n");
-		}
-	}
+	// if (options.mlg_enabled) {
+	// 	// bool success = false;
+	// 	// options.mlg = new memload_generator(options.mlg_cmask,
+	// 	//     options.mlg_dmask, options.mlg_bps, &success);
+	// 	// if (!success) {
+	// 	// 	rte_exit(EXIT_FAILURE, "failed to init mlg\n");
+	// 	// }
+	// }

 	// register dynamic field
 	PROBE_FLAG_OFFSET = rte_mbuf_dynfield_register(
@ -702,7 +696,7 @@ main(int argc, char *argv[])
 		snprintf(options.mempool_name_buf, MEMPOOL_NAME_BUF_LEN,
 		    "khat_mempool_%d", nodeid);
 		mbuf_pool = rte_pktmbuf_pool_create(options.mempool_name_buf,
-		    MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0,
+		    MBUF_MAX_COUNT, MBUF_CACHE_SIZE, 0,
 		    	options.jumbo_frame_enabled ?
 		      RTE_MBUF_DEFAULT_BUF_SIZE + (MAX_JUMBO_MTU - MAX_STANDARD_MTU) :
 		      RTE_MBUF_DEFAULT_BUF_SIZE, nodeid);
@ -717,20 +711,25 @@ main(int argc, char *argv[])
 	}

 	// init threads
-	uint64_t cpuset = options.cpuset;
-	for (int i = 0; i < options.num_threads; i++) {
-		uint32_t lcore_id = cmask_get_next_cpu(&cpuset);
+	uint32_t cpu_idx = CPU_FFS(&options.cpu_set);
+	uint32_t tid = 0;
+	while(cpu_idx != 0) {
+		uint32_t lcore_id = cpu_idx - 1;
 		uint32_t node_id = rte_lcore_to_socket_id(lcore_id);
 		auto *tinfo = (struct thread_info *)nm_malloc(node_id, sizeof(struct thread_info));
 		tinfo->cache_lines = nm_malloc(node_id, CACHELINE_SIZE * options.thread_cacheline_cnt);
 		tinfo->load_buffer = nm_malloc(node_id, THREAD_LOAD_BUFFER_SZ);
-		tinfo->tid = i;
+		tinfo->tid = tid;
 		tinfo->lcore_id = lcore_id;
 		tinfo->node_id = node_id;
 		options.s_thr_info.push_back(tinfo);
 		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
-		    "main: thread %d assigned to cpu %d, node %d\n", tinfo->tid,
-		    tinfo->lcore_id, nm_get_node_from_core(lcore_id));
+			"main: thread %d assigned to cpu %d, node %d\n", tinfo->tid,
+			tinfo->lcore_id, nm_get_node_from_core(lcore_id));
+		
+		tid++;
+		CPU_CLR(cpu_idx - 1, &options.cpu_set);
+		cpu_idx = CPU_FFS(&options.cpu_set);
 	}

 	if (port_init(portid, mbuf_pool) != 0) {
--- a/net/rat.cc
+++ b/net/rat.cc
@ -1,3 +1,4 @@
+#include <sys/endian.h>
 #include <rte_byteorder.h>
 #include <rte_common.h>
 #include <rte_config.h>
@ -22,7 +23,7 @@
 #include <random>
 #include <vector>

-constexpr static unsigned int MBUF_MAX_COUNT = 65536;
+#define MBUF_MAX_COUNT (rte_lcore_count() * 4096)
 constexpr static unsigned int MBUF_CACHE_SIZE = 512;
 constexpr static unsigned int RX_RING_SIZE = 2048;
 constexpr static unsigned int TX_RING_SIZE = 2048;
@ -92,11 +93,11 @@ struct options_t {
 	char ia_gen[256] { "fixed" };
 	char ld_gen[256] { "fixed:0" };
 	uint32_t target_qps { 0 };
-	uint32_t depth = 1;
+	uint32_t depth { 1 };
 	struct net_spec server_spec {
 	};
-	uint64_t cpu_mask { 0x4 }; // 1 thread @ core 2
-	uint32_t pkt_loss_delay_ms = UINT32_MAX;
+	cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 1 thread @ core 2
+	uint32_t pkt_loss_delay_ms { UINT32_MAX };
 	bool jumbo_frame_enabled { false };
 	int pkt_pad_sz { 0 };
 	int port_mtu { MAX_STANDARD_MTU };
@ -685,7 +686,6 @@ dump_options()
 	    "    run time = %d\n"
 	    "    num threads = %d\n"
 	    "    rage quit time = %ul\n"
-	    "    cpu mask = 0x%lx\n"
 	    "    slave mode = %d\n"
 	    "    interarrival dist = %s\n"
 	    "    workload dist = %s\n"
@ -696,7 +696,7 @@ dump_options()
 	    "    jumbo frame = %d\n"
 	    "    packet pad size = %d\n",
 	    ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.run_time,
-	    options.s_num_threads, options.rage_quit_time, options.cpu_mask,
+	    options.s_num_threads, options.rage_quit_time,
 	    options.slave_mode, options.ia_gen, options.ld_gen,
 	    options.target_qps, options.s_host_spec.ip, options.depth,
 	    options.pkt_loss_delay_ms, options.jumbo_frame_enabled,
@ -775,14 +775,10 @@ main(int argc, char *argv[])
 				    STATE_SYNC; // set state to wait for SYNC
 				break;
 			case 'A':
-				options.cpu_mask = strtoull(
-				    optarg, nullptr, 16);
-				options.s_num_threads = cmask_get_num_cpus(
-				    options.cpu_mask);
+				cpulist_to_cpuset(optarg, &options.cpu_set);
+				options.s_num_threads = CPU_COUNT(&options.cpu_set);
 				if (options.s_num_threads == 0) {
-					rte_exit(EXIT_FAILURE,
-					    "invalid cpu mask 0x%lx\n",
-					    options.cpu_mask);
+					rte_exit(EXIT_FAILURE, "invalid cpu mask %s\n", optarg);
 				}
 				break;
 			case 'i':
@ -881,8 +877,10 @@ main(int argc, char *argv[])
 	}
 	options.mbuf_pool = mbuf_pool;

-	uint64_t cmask = options.cpu_mask;
-	for (unsigned int i = 0; i < options.s_num_threads; i++) {
+	unsigned int cpuset_idx = CPU_FFS(&options.cpu_set);
+	unsigned int tid = 0;
+	while(cpuset_idx != 0) {
+		unsigned int lcore_id = cpuset_idx - 1;
 		tinfo = new thread_info;
 		tinfo->ia_gen = createGenerator(options.ia_gen);
 		tinfo->load_gen = createGenerator(options.ld_gen);
@ -892,11 +890,15 @@ main(int argc, char *argv[])
 		}
 		tinfo->ia_gen->set_lambda((double)options.target_qps /
 		    (double)(options.s_num_threads));
-		tinfo->id = i;
-		tinfo->lcore_id = cmask_get_next_cpu(&cmask);
-		tinfo->rxqid = i;
-		tinfo->txqid = i;
+		tinfo->id = tid;
+		tinfo->lcore_id = lcore_id;
+		tinfo->rxqid = tid;
+		tinfo->txqid = tid;
 		options.s_thr_info.push_back(tinfo);
+
+		tid++;
+		CPU_CLR(lcore_id, &options.cpu_set);
+		cpuset_idx = CPU_FFS(&options.cpu_set);
 	}

 	if (port_init(portid, mbuf_pool) != 0) {
--- a/scripts/compile.sh
+++ b/scripts/compile.sh
@ -1,38 +0,0 @@
-#!/bin/sh
-test_dir="/numam.d"
-root="$(dirname "$0")/.."
-servers="skylake2.rcs.uwaterloo.ca skylake3.rcs.uwaterloo.ca skylake6.rcs.uwaterloo.ca skylake7.rcs.uwaterloo.ca skylake8.rcs.uwaterloo.ca" 
-rsync_flags="-vchr"
-ssh_args="-o StrictHostKeyChecking=no -p77"
-
-user=$1
-
-if [ -z $user ]
-then 
-    user=$(whoami)
-fi
-
-echo "USER: $user"
-
-compile() {
-    # separate these functions because we might change kernel (reboot) without needing to recompile
-    echo "====================$1===================="
-    echo "Syncing directories..."
-    ssh $(echo $ssh_args $user@$1) "sudo mkdir -p $test_dir"
-    ssh $(echo $ssh_args $user@$1) "sudo chmod 777 $test_dir"
-    rsync $(echo $rsync_flags) -e 'ssh -p 77' $root/ $user@$1:$test_dir/
-    echo "Compiling..."
-    ssh $(echo $ssh_args $user@$1) "sudo pkg install -y hwloc2 cmake; sudo mkdir -p $test_dir/build; cd $test_dir/build; sudo rm -rf *; sudo cmake ../; sudo make clean all -j8" &
-    wait
-    echo "$1 Done."
-    echo ""
-}
-
-i=0
-for server in $servers
-do
-	i=$(expr $i + 1)
-    compile "$server" &
-done
-
-wait
--- a/scripts/libs/libtc.py
+++ b/scripts/libs/libtc.py
@ -85,28 +85,25 @@ def remote_exec(srv, cmd, blocking=True, check=True):
 	return sub


-def scan_stderr(p, exclude = None):
-	for err in p.stderr:
-		fail = True
-		err = err.decode()
-		err = err.strip()
+def check_stderr(p, sel, exclude = []):# -> tuple[bool, list[str]]:
+	max_stderr_rd = 10
+	err = []
+	while sel.poll(1) and max_stderr_rd > 0:
+		err.append(p.stderr.readline().decode().strip())
+		max_stderr_rd = max_stderr_rd - 1

-		#print(err)

-		if len(err) == 0:
+	good = True
+	for e in err:
+		if len(e) == 0:
 			continue
 		
-		if exclude != None:
-			for exc in exclude:
-				if (exc != None) and (re.match(exc, err) != None):
-					fail = False
-					break
-		
-		if fail:
-			log_print("Error detected: " + err)
-			return False
+		for exc in exclude:
+			if not (exc in err):
+				good = False
+				break
 			
-	return True
+	return good, err

 # stderr threads
 errthr_objs = []
@ -116,12 +113,21 @@ errthr_failed = False
 def errthr_get_failed():
 	return errthr_failed

-def thr_check_stderr(p : sp.Popen, exclude):
+def thr_check_stderr(p : sp.Popen, name: str, exclude):
 	global errthr_failed
+	sel = select.poll()
+	sel.register(p.stderr, select.POLLIN)
+	local_failed = False
 	while(not errthr_sigstop):
-		if not scan_stderr(p, exclude=exclude):
-			errthr_failed = True
-		time.sleep(0.5 + random.uniform(-0.1, 0.1))
+		if (not local_failed):
+			status, err = check_stderr(p, sel, exclude=exclude)
+			if not status:
+				errthr_failed = True
+				local_failed = True
+				log_print("Error detected in \"" + name + "\":\n")
+				for e in err:
+					log_print("        " + e + "\n")
+		time.sleep(random.uniform(0.001, 0.1))

 def errthr_start():
 	global errthr_sigstop
@ -129,12 +135,13 @@ def errthr_start():
 	errthr_sigstop = False
 	errthr_failed = False
 	for thr in errthr_objs:
+		thr.daemon = True
 		thr.start()

-def errthr_create(cp, exclude = None):
+def errthr_create(cp, name, exclude = None):
 	global errthr_objs
-	for p in cp:
-		errthr_objs.append(Thread(target = thr_check_stderr, args=(p, exclude)))
+	for i in range(len(cp)):
+		errthr_objs.append(Thread(target = thr_check_stderr, args=(cp[i], name[i], exclude)))

 def errthr_stop():
 	global errthr_objs
--- a/scripts/run.py
+++ b/scripts/run.py
@ -13,15 +13,15 @@ import libpar as par
 import libtc as tc

 # load_gen
-loadgen_load = "fixed:16"
+loadgen_load = "fixed:0"

 # pkt_pad
 jumbo_frame_threshold = 1518
 pkt_pads = [
-	 "0",
-	# "256",
-	# "512",
-	# "1024",
+	"0",
+	"256",
+	"512",
+	"1024",
 	#"1518",
 	#"2048",
 	#"4096",
@ -30,11 +30,11 @@ pkt_pads = [
 ]

 pkt_pads_depth = {}
-pkt_pads_depth["0"] = "0"
+pkt_pads_depth["0"] = "32"
 pkt_pads_depth["256"] = "16"
 pkt_pads_depth["512"] = "8"
 pkt_pads_depth["1024"] = "4"
-pkt_pads_depth["1518"] = "6"
+pkt_pads_depth["1518"] = "4"
 pkt_pads_depth["2048"] = "2"
 pkt_pads_depth["4096"] = "1"
 pkt_pads_depth["8192"] = "1"
@ -80,41 +80,25 @@ root_dir = os.path.join(file_dir,"..")
 sample_filename = "sample.txt"

 affinity = [
-	#"0x20",
-	#"0x2000000",
-	# "0xA0",
-	# "0xAA0",
-	# "0xAAAA0",
-	# "0xAAAAA",
-	# "0xAAAAAA",
-	# "0x2000000",
-	# "0xA000000",
-	# "0xAA000000",
-	# "0xAAAA000000",
-	# "0xAAAAA000000",
-	# "0xAAAAAA000000",
-	"0x2000002",
-	"0xA00000A",
-	"0xAA0000AA",
-	"0xAAAA00AAAA",
-	"0xAAAAAAAAAAAA"
+	#"1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47",
+	"49,51,53,55,57,59,61,63,65,67,69,71,73,75,77,79,81,83,85,87,89,91,93,95"
 ]

 master = ["skylake2.rcs.uwaterloo.ca"]
 master_spec = ["192.168.123.10@3c:15:fb:c9:f3:36"]
-master_cpumask = "0x8" # 1 thread
+master_cpumask = "2" # 1 thread

-server = ["skylake3.rcs.uwaterloo.ca"]
-server_spec = ["192.168.123.9@3c:15:fb:c9:f3:4b"]
+server = ["icelake2-int.rcs.uwaterloo.ca"]
+server_spec = ["192.168.123.9@3c:ec:ef:61:39:f3"]

-clients = ["skylake6.rcs.uwaterloo.ca", "skylake7.rcs.uwaterloo.ca"] # "skylake8.rcs.uwaterloo.ca"]
-client_spec = ["192.168.123.11@3c:15:fb:62:9b:2f", "192.168.123.12@3c:15:fb:c9:f3:44"] # "192.168.123.13@3c:15:fb:62:9c:be"]
-client_cpumask = "0xAAAAAA"
+clients = ["skylake3.rcs.uwaterloo.ca", "skylake5.rcs.uwaterloo.ca", "skylake6.rcs.uwaterloo.ca"]# "skylake7.rcs.uwaterloo.ca", "skylake8.rcs.uwaterloo.ca"]
+client_spec = ["192.168.123.12@3c:15:fb:c9:f3:4b", "192.168.123.13@3c:15:fb:c9:f3:28", "192.168.123.14@3c:15:fb:62:9b:2f"] #, "192.168.123.13@3c:15:fb:62:9c:be"]
+client_cpumask = "1,3,5,7,9,11,13,15,17,19,21,23"

 client_rage_quit = 1000 #1s
-warmup = 10
-duration = 10
-cooldown = 0
+warmup = 5
+duration = 20
+cooldown = 5
 cacheline = 0
 SSH_PARAM = "-o StrictHostKeyChecking=no -p77"
 SSH_USER = "oscar"
@ -147,7 +131,7 @@ def get_client_str():
 	return ret

 def calc_client_ld(ld : int):
-	return 0 if ld == 0 else ((ld - master_qps) / len(clients))
+	return 0 if ld == 0 else (int)((ld - master_qps) / len(clients))

 def run_exp(affinity : str, ld : int, pkt_pad : str, aff_idx : int):
 	while True:
@ -169,6 +153,7 @@ def run_exp(affinity : str, ld : int, pkt_pad : str, aff_idx : int):
 		# start clients
 		tc.log_print("Starting clients...")
 		sclt = []
+		sclt_name = []
 		for i in range(len(clients)):
 			client_cmd = "sudo " + test_dir + "/rat --log-level lib.eal:err -- -S -A " + client_cpumask + \
 				" -i exponential " + \
@ -184,8 +169,9 @@ def run_exp(affinity : str, ld : int, pkt_pad : str, aff_idx : int):
 				client_cmd += " -J "
 			tc.log_print(client_cmd)
 			sclt.append(tc.remote_exec([clients[i]], client_cmd, blocking=False)[0])
+			sclt_name.append(clients[i])

-		time.sleep(1)
+		time.sleep(5)
 		# start master
 		tc.log_print("Starting master...")
 		master_cmd = "sudo " + test_dir + "/cat --log-level lib.eal:err -- " + \
@ -205,19 +191,24 @@ def run_exp(affinity : str, ld : int, pkt_pad : str, aff_idx : int):
 		p = sp[0]

 		# launch stderr monitoring thread
-		exclude = None
-		tc.errthr_create(sp, exclude)
+		exclude = ["Pseudo-terminal"]
+		tc.errthr_create([p], master, exclude)
 		if not client_only:
-			tc.errthr_create(ssrv, exclude)
-		tc.errthr_create(sclt, exclude)
+			tc.errthr_create(ssrv, server, exclude)
+		tc.errthr_create(sclt, sclt_name, exclude)
 		tc.errthr_start()
 		success = False
 		cur = 0
+		# selec = select.poll()
+		# selec.register(p.stdout, select.POLLIN)
 		while True:
 			# either failed or timeout
 			# we use failure detection to save time for long durations
-			if tc.errthr_get_failed() or cur >= int(warmup + duration) * 3 :
+			if tc.errthr_get_failed() or cur >= (warmup + duration) * 3:
 				break
+
+			# while selec.poll(1):
+			# 	print(p.stdout.readline())
 			
 			if p.poll() != None:
 				success = True
@ -293,7 +284,7 @@ def main():
 		elif opt in ('-c'):
 			client_only=True

-	tc.init("~/results.d/numam/" + output_dirname + "_" + datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
+	tc.init("~/results.d/numam_neo/" + output_dirname + "_" + datetime.datetime.now().strftime('%Y%m%d%H%M%S'))

 	tc.log_print("Configuration:\n" + \
 		  "hostfile: " + ("None" if hostfile == None else hostfile) + "\n" \
--- a/scripts/compile_dpdk.sh
+++ b/scripts/compile_dpdk.sh
@ -1,14 +1,12 @@
 #!/bin/sh
 dpdk_dir="/dpdk"
 root="$(dirname "$0")/.."
-servers="skylake2.rcs.uwaterloo.ca skylake3.rcs.uwaterloo.ca skylake6.rcs.uwaterloo.ca skylake7.rcs.uwaterloo.ca skylake8.rcs.uwaterloo.ca" 
-rsync_flags="-vchr"
+servers="skylake3.rcs.uwaterloo.ca" # skylake5.rcs.uwaterloo.ca skylake6.rcs.uwaterloo.ca skylake7.rcs.uwaterloo.ca skylake8.rcs.uwaterloo.ca" 
+rsync_flags="-az"
 ssh_args="-o StrictHostKeyChecking=no -p77"

 user=$1

-iface="ixl0"
-

 if [ -z $user ]
 then 
@ -21,7 +19,7 @@ compile() {
    # separate these functions because we might change kernel (reboot) without needing to recompile
    echo "====================$1===================="
    #ssh $(echo $ssh_args $user@$1) "sudo reboot"
-    ssh $(echo $ssh_args $user@$1) "sudo sh -c \"mkdir -p $dpdk_dir; cd $dpdk_dir; git clone https://git.quacker.org/d/numam-dpdk; cd numam-dpdk; git reset --hard; git clean -f; rm -rf build; meson -Denable_kmods=true build; cd build; ninja install\""
+    ssh $(echo $ssh_args $user@$1) "sudo sh -c \"rm -rf $dpdk_dir; mkdir -p $dpdk_dir; cd $dpdk_dir; git clone https://git.quacker.org/d/numam-dpdk; cd numam-dpdk; git checkout releases-13.0; meson -Denable_kmods=true build; cd build; ninja install\""
    wait
    echo "$1 Done."
    echo ""
--- a/scripts/setup_program.sh
+++ b/scripts/setup_program.sh
@ -0,0 +1,36 @@
+#!/bin/sh
+dpdk_dir="/numam.d"
+root="$(dirname "$0")/.."
+servers="icelake2-int.rcs.uwaterloo.ca" # skylake7.rcs.uwaterloo.ca skylake8.rcs.uwaterloo.ca icelake2-int.rcs.uwaterloo.ca" 
+rsync_flags="-rv -e \"ssh -p77\""
+ssh_args="-o StrictHostKeyChecking=no -p77"
+
+user=$1
+
+
+if [ -z $user ]
+then 
+    user=$(whoami)
+fi
+
+echo "USER: $user"
+
+compile() {
+    # separate these functions because we might change kernel (reboot) without needing to recompile
+    echo "====================$1===================="
+    ssh $(echo $ssh_args $user@$1) "sudo sh -c \"rm -rf $dpdk_dir; mkdir $dpdk_dir; chmod 777 $dpdk_dir\""
+    rsync -rv -e "ssh -p77" $root/ $user@$1:$dpdk_dir/
+    ssh $(echo $ssh_args $user@$1) "sudo sh -c \"cd $dpdk_dir; rm -rf build; mkdir build; cd build; cmake ../; make -j8 khat cat rat\""
+    wait
+    echo "$1 Done."
+    echo ""
+}
+
+i=0
+for server in $servers
+do
+	i=$(expr $i + 1)
+    compile "$server" &
+done
+
+wait
--- a/scripts/storage/parse.py
+++ b/scripts/storage/parse.py
@ -9,50 +9,104 @@ import getopt
 import math
 import concurrent.futures as CF

+columns = [
+    ("Req per second", "rps", ".2f"),
+    ("Bytes per second", "bps", ".2f"),
+    ("Average Latency", "lat_avg", ".2f"),
+    ("50th Latency", "lat_50", ".0f"),
+    ("95th Latency", "lat_95", ".0f"),
+    ("99th Latency", "lat_99", ".0f"),
+    ("Latency stddev", "lat_std", ".2f")
+]
+
+TIME = 30
+REQ_SZ = 4096
+
+class DatObj:
+    def __init__(self, raw : list, time : int, req_sz : int):
+        self.raw = raw
+        self.rps = len(raw) / time
+        self.bps = self.rps * req_sz
+        self.lat_avg = np.average(self.raw)
+        self.lat_99 = np.percentile(self.raw, 99)
+        self.lat_95 = np.percentile(self.raw, 95)
+        self.lat_50 = np.percentile(self.raw, 50)
+        self.lat_std = np.std(self.raw)
+
+def parse_file(lines : list, time : int, req_sz : int) -> DatObj :
+    raw = []
+    for line in lines:
+        if len(line) > 0:
+            raw.append(int(line))
+    return DatObj(raw, time, req_sz)
+
+def output_col():
+    ret = "Benchmark"
+    for name,_,_ in columns:
+        ret = ret + "," + name + "," + name + " (NUMA)" + "," + "% change"
+    return ret
+
+def get_attr_or_none(obj, attr):
+    if (obj != None):
+        val = getattr(obj, attr)
+    else:
+        val = None
+    return val
+
+def output_objs(name: str, obj : DatObj, obj_numa : DatObj):
+    ret = name
+    for _, attr, fmt in columns:
+        val = get_attr_or_none(obj, attr)
+        val_numa = get_attr_or_none(obj_numa, attr)
+ 
+        ret = ret + "," + (format(val, fmt) if val != None else "N/A") 
+        ret = ret + "," + (format(val_numa, fmt) if val_numa != None else "N/A") 
+        
+        if val == None or val_numa == None:
+            ret = ret + "," + "N/A"
+        else:
+            ret = ret + "," + format((val_numa - val) / val * 100, ".2f") + "%"
+    return ret
+
+def process_file(f : str, obj_map):
+    with open(f, "r") as fp:
+        lines = fp.readlines()
+    
+    bench_name = os.path.basename(f)
+    obj_map[bench_name] = parse_file(lines, TIME, REQ_SZ)
+    print("Processed file " + f + ". Benchmark name: " + bench_name)
+
+def process_dir(path : str, obj_map):
+    files = [os.path.abspath(os.path.join(path, x)) for x in os.listdir(path)]
+    for f in files:
+        if (".sh" in f):
+            continue
+        if (os.path.isfile(f)):
+            process_file(f, obj_map)
+
 def main():
    datdir = None
-    options = getopt.getopt(sys.argv[1:], 'f:')[0]
+    options = getopt.getopt(sys.argv[1:], 'd:')[0]

    for opt, arg in options:
-        if opt in ('-f'):
+        if opt in ('-d'):
            datdir = arg

    if datdir == None:
-        raise Exception("Must specify -f parameter")
+        raise Exception("Must specify -d parameter")

-    with open(datdir) as file:
-        lines = file.readlines()
+    obj_map = dict()
+    process_dir(datdir, obj_map)

-    datapts = []
-    for line in lines:
-        if len(line) > 0:
-            datapts.append(int(line))
-
-
-    runtime = 10
-    req = len(datapts)
-    blk_size = 4096
-    bps = blk_size * req
-
-    
-
-    avg_lat = np.average(datapts)
-    tail99_lat = np.percentile(datapts, 99)
-    tail95_lat = np.percentile(datapts, 95)
-    med_lat = np.percentile(datapts, 50)
-    std_dev = np.std(datapts)
-
-    print("Runtime: " + str(runtime) + "s\n"
-          "Requests: " + str(req) + "\n"
-          "Request size: " + str(blk_size) + " bytes\n"
-          "Request per second: " + str(int(req/runtime)) + "\n"
-          "Bytes per second: " + str(bps) + " bytes = " + str(int(bps/1024/1024)) + " MB\n"
-          "Average Latency: " + str(int(avg_lat)) + "\n"
-          "99th Latency: " + str(int(tail99_lat)) + "\n"
-          "95th Latency: " + str(int(tail95_lat)) + "\n"
-          "50th Latency: " + str(int(med_lat)) + "\n"
-          "stddev: " + str(std_dev) + "\n")
+    with open("results.csv", "w") as f:
+        f.write(output_col())
+        f.write("\n")

+        for bench in obj_map:
+            if bench.endswith("_numa"):
+                continue
+            f.write(output_objs(bench, obj_map[bench], obj_map.get(bench+"_numa")))
+            f.write("\n")

 if __name__ == "__main__":
    main()
--- a/scripts/storage/test_posix.sh
+++ b/scripts/storage/test_posix.sh
@ -0,0 +1,19 @@
+# rand_read
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P R,100 -Q 3 -o rand_read
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P R,100 -Q 3 -o rand_read_numa
+
+# rand_write
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P R,0 -Q 3 -o rand_write
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P R,0 -Q 3 -o rand_write_numa
+
+# mono_read
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P M,100 -Q 3 -o mono_read
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P M,100 -Q 3 -o mono_read_numa
+
+# mono_write
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P M,0 -Q 3 -o mono_write
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P M,0 -Q 3 -o mono_write_numa
+
+# mixed
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P R,70 -Q 3 -o mixed_read
+sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P R,70 -Q 3 -o mixed_read_numa
--- a/scripts/storage/test_spdk_bdev.sh
+++ b/scripts/storage/test_spdk_bdev.sh
@ -0,0 +1,19 @@
+# rand_read
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,100 -Q 3 -o rand_read -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,100 -Q 3 -o rand_read_numa -k bdev
+
+# rand_write
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,0 -Q 3 -o rand_write -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,0 -Q 3 -o rand_write_numa -k bdev
+
+# mono_read
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P M,100 -Q 3 -o mono_read -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P M,100 -Q 3 -o mono_read_numa -k bdev
+
+# mono_write
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P M,0 -Q 3 -o mono_write -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P M,0 -Q 3 -o mono_write_numa -k bdev
+
+# mixed
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,70 -Q 3 -o mixed_read -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,70 -Q 3 -o mixed_read_numa -k bdev
--- a/scripts/storage/test_spdk_nvme.sh
+++ b/scripts/storage/test_spdk_nvme.sh
@ -0,0 +1,19 @@
+# rand_read
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,100 -Q 3 -o rand_read -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,100 -Q 3 -o rand_read_numa -k bdev
+
+# rand_write
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,0 -Q 3 -o rand_write -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,0 -Q 3 -o rand_write_numa -k bdev
+
+# mono_read
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P M,100 -Q 3 -o mono_read -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P M,100 -Q 3 -o mono_read_numa -k bdev
+
+# mono_write
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P M,0 -Q 3 -o mono_write -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P M,0 -Q 3 -o mono_write_numa -k bdev
+
+# mixed
+sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,70 -Q 3 -o mixed_read -k bdev
+sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,70 -Q 3 -o mixed_read_numa -k bdev
--- a/storage/birb.cc
+++ b/storage/birb.cc
@ -460,8 +460,8 @@ worker_thread_main(void * arg)

 	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: started...\n", ctx->tid);
 	
-	/* random delay 0-100 ms */
-	usleep(((rand() * nm_get_uptime_ns()) % 100) * 1000);
+	/* random delay 0-100 us */
+	usleep(nm_get_uptime_ns() % 100);

 	next_ts = get_cur_ts_nano();
 	
@ -477,7 +477,7 @@ worker_thread_main(void * arg)
 				ctx->overhead_min = overhead;
 			}

-			ctx->overhead_avg = ctx->overhead_avg * ctx->overhead_cnt + ctx->overhead_avg;
+			ctx->overhead_avg = ctx->overhead_avg * ctx->overhead_cnt + overhead;
 			ctx->overhead_cnt++;
 			ctx->overhead_avg /= ctx->overhead_cnt;
 		}
@ -657,7 +657,7 @@ birb_main(void * arg1 UNUSED)
 		CPU_SET(cur_core, &scpuset);
 		pthread_attr_init(&attr);
 		pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &scpuset);
-		rc = pthread_create(&ctx->sys_thread, nullptr, worker_thread_main, ctx);
+		rc = pthread_create(&ctx->sys_thread, &attr, worker_thread_main, ctx);
 		if (rc != 0) {
 			ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to create sys thread: %d\n", rc);
 			rc = EINVAL;
--- a/storage/birb_posix.cc
+++ b/storage/birb_posix.cc
@ -221,15 +221,15 @@ worker_thread_main(void * arg)

 	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: started...\n", ctx->tid);
 	
-	/* random delay 0-100 ms */
-	usleep(((rand() * nm_get_uptime_ns()) % 100) * 1000);
+	/* random delay 0-100 us */
+	usleep(nm_get_uptime_ns() % 100);

 	next_ts = get_cur_ts_nano();
 	
 	while (true) {
-		uint64_t cur_loop_ts = get_cur_ts_nano();
+		uint64_t cur_ts = get_cur_ts_nano();
 		if (last_loop_ts > 0) {
-			uint64_t overhead = cur_loop_ts - last_loop_ts;
+			uint64_t overhead = cur_ts - last_loop_ts;
 			if (ctx->overhead_max < overhead) {
 				ctx->overhead_max = overhead;
 			}
@ -238,14 +238,13 @@ worker_thread_main(void * arg)
 				ctx->overhead_min = overhead;
 			}

-			ctx->overhead_avg = ctx->overhead_avg * ctx->overhead_cnt + ctx->overhead_avg;
+			ctx->overhead_avg = ctx->overhead_avg * ctx->overhead_cnt + overhead;
 			ctx->overhead_cnt++;
 			ctx->overhead_avg /= ctx->overhead_cnt;
 		}
-		last_loop_ts = cur_loop_ts;
+		last_loop_ts = cur_ts;

 		// process io completion
-		uint64_t cur_ts = get_cur_ts_nano();
 		auto itr = prog_ios.begin();
 		while (itr != prog_ios.end()) {
 			int err;
@ -301,10 +300,8 @@ worker_thread_main(void * arg)
 					io_req->aio.aio_offset = a_offset;

 					if(io_ctx.op == IOGEN_READ) {
-						io_req->aio.aio_lio_opcode = LIO_READ;
 						rc = aio_read(&io_req->aio);
 					} else {
-						io_req->aio.aio_lio_opcode = LIO_WRITE;
 						rc = aio_write(&io_req->aio);
 					}

@ -312,6 +309,7 @@ worker_thread_main(void * arg)
 						ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "thread %d: failed to issue io %d, retrying...\n", ctx->tid, errno);
 					} else {
 						free_ios.pop_front();
+						prog_ios.push_back(io_req);
 						next_ts = next_ts + ia_gen->generate() * S2NS;
 					}
 				}
@ -434,7 +432,7 @@ birb_main()
 		CPU_SET(cur_core, &scpuset);
 		pthread_attr_init(&attr);
 		pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &scpuset);
-		rc = pthread_create(&ctx->sys_thread, nullptr, worker_thread_main, ctx);
+		rc = pthread_create(&ctx->sys_thread, &attr, worker_thread_main, ctx);
 		if (rc != 0) {
 			ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to create sys thread: %d\n", rc);
 			rc = EINVAL;
--- a/storage/drivers/nvme.cc
+++ b/storage/drivers/nvme.cc
@ -39,6 +39,7 @@ birb_nvme_driver::attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *t

 	*ctx->ns = ns;
 	*ctx->ctrlr = ctrlr;
+	ctx->valid = 1;
 }

 bool
@ -65,6 +66,7 @@ birb_nvme_driver::birb_nvme_driver(const char * dev_name) : status(BIRB_FAIL),
 	ctx.ctrlr = &this->ctrlr;
 	ctx.ns = &this->ns;
 	ctx.dev_name = dev_name;
+	ctx.valid = 0;

    spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE);
 	snprintf(trid.subnqn, sizeof(trid.subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);
@ -75,6 +77,12 @@ birb_nvme_driver::birb_nvme_driver(const char * dev_name) : status(BIRB_FAIL),
 		goto end;
 	}

+	if (ctx.valid != 1) {
+		rc = EINVAL;
+		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_nvme_driver: could not find device: %s\n", dev_name);
+		goto end;
+	}
+
 	if (spdk_nvme_ns_get_csi(this->ns) == SPDK_NVME_CSI_ZNS) {
 		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_nvme_driver: zoned nvme namespace is unsupported\n");
 		spdk_nvme_detach(this->ctrlr);
--- a/test/ts.cc
+++ b/test/ts.cc
@ -0,0 +1,84 @@
+#include <unistd.h>
+#include "defs.hh"
+#include "nm.hh"
+#include <ctime>
+
+void test(const char * case_name, struct timespec * ts)
+{
+    uint64_t slow;
+    uint64_t fast;
+
+    slow = get_uptime();
+    if (nanosleep(ts, nullptr) != 0) {
+        perror("nanosleep() interrupted!");
+        exit(-1);
+    }
+    slow = get_uptime() - slow;
+
+    fast = nm_get_uptime_ns();
+    if (nanosleep(ts, nullptr) != 0) {
+        perror("nanosleep() interrupted!");
+        exit(-1);
+    }
+    fast = nm_get_uptime_ns() - fast;
+
+    printf("%s: get_uptime(): %lu, nm_get_uptime_ns(): %lu\n", case_name, slow, fast);
+}
+
+
+int main()
+{
+    struct timespec ts;
+    nm_init(0);
+    // 1s
+    ts.tv_nsec = 0;
+    ts.tv_sec = 1;
+    test("1s", &ts);
+
+    // 100ms
+    ts.tv_nsec = 100000000;
+    ts.tv_sec = 0;
+    test("100ms", &ts);
+
+    // 10ms
+    ts.tv_nsec = 10000000;
+    ts.tv_sec = 0;
+    test("10ms", &ts);
+
+    // 1ms
+    ts.tv_nsec = 1000000;
+    ts.tv_sec = 0;
+    test("1ms", &ts);
+
+    // 100us
+    ts.tv_nsec = 100000;
+    ts.tv_sec = 0;
+    test("100us", &ts);
+
+    // 10us
+    ts.tv_nsec = 10000;
+    ts.tv_sec = 0;
+    test("10us", &ts);
+
+    // 1us
+    ts.tv_nsec = 1000;
+    ts.tv_sec = 0;
+    test("1us", &ts);
+
+    // 100ns
+    ts.tv_nsec = 100;
+    ts.tv_sec = 0;
+    test("100ns", &ts);
+
+    // 10ns
+    ts.tv_nsec = 10;
+    ts.tv_sec = 0;
+    test("10ns", &ts);
+
+    // 1ns
+    ts.tv_nsec = 1;
+    ts.tv_sec = 0;
+    test("1ns", &ts);
+
+    return 0;
+}
--- a/util/memloadgen.cc
+++ b/util/memloadgen.cc
@ -1,4 +1,5 @@
 #include "nm.hh"
+#include <cstdlib>
 #include "ntr.h"
 #include <getopt.h>
 #include <unistd.h>
@ -20,13 +21,14 @@ int main(int argc, char * argv[])
 	ntr_init();
 	ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);

-	unsigned long long mlg_bps = 0;
+	unsigned long long mlg_arrsz = 0;
+	uint32_t mlg_iter = -1;
 	unsigned long long mlg_dmask = 0;
 	unsigned long long mlg_cmask = 0;
 	{
 		int c;
 		// parse arguments
-		while ((c = getopt(argc, argv, "hb:X:x:v")) != -1) {
+		while ((c = getopt(argc, argv, "hb:X:x:vi:")) != -1) {
 			switch (c) {
 			case 'v':
 				ntr_set_level(NTR_DEP_USER1,
@ -36,7 +38,10 @@ int main(int argc, char * argv[])
 				usage();
 				exit(0);
 			case 'b':
-				mlg_bps = strtoull(optarg, nullptr, 10);
+				mlg_arrsz = strtoull(optarg, nullptr, 10);
+				break;
+			case 'i':
+				mlg_iter = strtoul(optarg, nullptr, 10);
 				break;
 			case 'X':
 				mlg_dmask = strtoull(
@ -53,7 +58,7 @@ int main(int argc, char * argv[])
 		}
 	}

-	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "MLG: [bps: %lld, threads: 0x%llx, domain: 0x%llx]\n", mlg_bps, mlg_cmask, mlg_dmask);
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "MLG: [size: %llu, iter: %u, threads: 0x%llx, domain: 0x%llx]\n", mlg_arrsz, mlg_iter, mlg_cmask, mlg_dmask);

 	// init nm
 	if (nm_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
@ -62,18 +67,17 @@ int main(int argc, char * argv[])
 	}

 	bool success = false;
-    memload_generator * mgen = new memload_generator(mlg_cmask, mlg_dmask, mlg_bps, &success);
+    memload_generator * mgen = new memload_generator(mlg_cmask, mlg_dmask, mlg_arrsz, mlg_iter, &success);

 	mgen->start();
-	
-	while(true)
-	{
-		unsigned long bps = mgen->get_bps();
-		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
-				"main: MLG bps = %ld ~= %ldM\n", bps, bps / 1024 / 1024);
-		sleep(1);
+	while(!mgen->check_done()) {
+		usleep(10000);
 	}

+	unsigned long bps = mgen->get_bps();
+	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
+			"main: MLG bps = %ld ~= %ldM\n", bps, bps / 1024 / 1024);
+
 	return 0;
 }