// numam/net/rat.cc

#include <atomic>
#include <cstddef>
#include <cstdlib>
#include <list>
#include <map>
#include <mutex>
#include <random>
#include <vector>

#include <sys/endian.h>

#include <topo.h>
#include <rte_byteorder.h>
#include <rte_common.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_ether.h>
#include <rte_launch.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include <unistd.h>

#include "ntr.h"

#include "gen.hh"
#include "net/netsup.hh"
#include "net/pkt.hh"
#include "nms.h"

// Size of the per-thread rx/tx mbuf arrays; also the maximum number of
// packets requested from rte_eth_rx_burst() and queued per tx_burst_all() call.
constexpr static unsigned int BURST_SIZE = 32;

/*
 * Pack a thread id and a per-thread sequence number into one epoch value.
 *
 * Layout: the id is stored above bit 24, the sequence number occupies the low
 * 24 bits (the same split epoch_get_id() / epoch_get_epoch() decode).
 *
 * The sequence number is truncated to 24 bits so that a long-running counter
 * wraps around instead of spilling into (and corrupting) the id field.
 */
static unsigned int
epoch_mk(unsigned int id, unsigned int epoch)
{
	return (id << 24) | (epoch & 0x00FFFFFF);
}

/*
 * Return the thread id part of a packed epoch value, i.e. everything stored
 * above the low 24 bits.
 */
static unsigned int
epoch_get_id(unsigned int epoch)
{
	constexpr unsigned int id_shift = 24;
	const unsigned int thread_id = epoch >> id_shift;

	return thread_id;
}

/*
 * Return the sequence-number part of a packed epoch value, i.e. its low
 * 24 bits with the thread id stripped off.
 */
static unsigned int
epoch_get_epoch(unsigned int epoch)
{
	constexpr unsigned int seq_mask = (1u << 24) - 1u;

	return epoch & seq_mask;
}

// Bookkeeping record for one epoch: the packed epoch value (see epoch_mk)
// and the timestamp associated with it (send time for entries tracked as
// in flight, receive time for entries queued from a response).
// Members are zero-initialized so a freshly allocated record never carries
// indeterminate values.
struct epoch_info {
	unsigned int epoch { 0 };
	uint64_t ts { 0 };
};

// Per-worker-thread state. One instance is allocated in main() for every
// selected core and handed to locore_main() as its argument.
struct thread_info {
	unsigned int id { 0 };
	unsigned int lcore_id { 0 };
	unsigned int rxqid { 0 };
	unsigned int txqid { 0 };
	// initialized like every other member so it is never read while
	// indeterminate; main() overwrites it with the lcore's socket id
	int socket_id { 0 };
	// these counters are read by the stat collecting code (calc_stats)
	std::atomic<int> recved_pkts { 0 };
	std::atomic<int> lost_pkts { 0 };

	// inter-arrival time generator and the two workload argument generators
	Generator *ia_gen { nullptr };
	Generator *load_gen0 { nullptr };
	Generator *load_gen1 { nullptr };

	// mtx protects the data shared between worker threads, i.e.
	// recved_epochs: any worker may append an epoch addressed to this
	// thread, and only this thread dequeues them.
	std::mutex mtx;
	std::list<struct epoch_info *> recved_epochs;

	thread_info() = default;
};

// Values stored in options.s_state. In slave mode the state starts at
// STATE_SYNC and moves SYNC -> SYNC_ACK -> RUNNING -> FIN; in non-slave mode
// it starts at STATE_RUNNING.
constexpr static int STATE_SYNC = 0;	 // waiting for SYNC
constexpr static int STATE_SYNC_ACK = 1; // SYNC received, SYNC_ACK not yet sent
constexpr static int STATE_RUNNING = 2;	 // Running
constexpr static int STATE_FIN = 3;	 // FIN received (or run time elapsed)

// Number of workload argument generator strings kept in options_t::load_gen.
constexpr static int WORKLOAD_MAX_ARGS = 2;

// Program-wide configuration (filled in from the command line in main())
// plus the runtime state shared by all threads (members prefixed with s_).
struct options_t {
	// -t: number of one-second ticks main() waits in non-slave mode
	unsigned int run_time { 5 };
	// parameters
	// -S: set to 1 to run the SYNC/SYNC_ACK/FIN protocol with the master
	int slave_mode { 0 };
	// -r: in ms; a worker exits the program if no response arrives for
	// this long
	uint32_t rage_quit_time { UINT32_MAX };
	// -i: generator description string for the inter-arrival time
	char ia_gen[256] { "fixed:0" };
	// -w (2nd and 3rd occurrence): generator description strings for the
	// workload arguments arg0 and arg1
	char load_gen[WORKLOAD_MAX_ARGS][256] = {{"fixed:0"}, {"fixed:0"}};
	// -w (1st occurrence): workload type written into every load packet
	uint32_t workload_type {LOAD_TYPE_CPU};
	// -q: total target QPS, split evenly across worker threads
	uint32_t target_qps { 0 };
	// -D: max number of outstanding (sent, unanswered) epochs per worker
	uint32_t depth { 1 };
	// -s: destination of the load packets
	struct net_spec server_spec { };
	// -A: cores to run worker threads on
	cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 1 thread @ core 2
	// -l: in ms; an outstanding epoch older than this is counted as lost
	uint32_t pkt_loss_delay_ms { UINT32_MAX };
	// -J: enables jumbo frames (main() also raises port_mtu)
	bool jumbo_frame_enabled { false };
	// -P: pad size passed to alloc_pkt_hdr() for load packets
	int pkt_pad_sz { 0 };
	int port_mtu { MAX_STANDARD_MTU };
	// -p: ethernet port id used for all rx/tx
	int portid { 0 };

	// states
	unsigned int s_num_threads { 1 }; // 1 thread
	// -H: this host's net spec; the mac address is overwritten with the
	// port's address in main()
	struct net_spec s_host_spec { };
	// filled in from the SYNC packet in proto_loop()
	struct net_spec s_master_spec { };
	// connection used for protocol packets sent to the master
	// (SYNC_ACK, FIN_ACK); points at the two specs above
	struct conn_spec s_master_cspec {
		.src = &s_host_spec, .src_port = DEFAULT_RAT_PORT,
		.dst = &s_master_spec, .dst_port = DEFAULT_RAT_PORT,
	};
	// one entry per worker thread, indexed by thread id
	std::vector<struct thread_info *> s_thr_info;
	std::atomic<int> s_state { STATE_RUNNING }; // default non master mode

	// states for qps
	// timestamp stored by the workers right before entering pkt_loop();
	// calc_stats() measures elapsed time from it
	std::atomic<uint64_t> s_ts_begin { 0 };
};

// The single global configuration/state instance; written by main() during
// argument parsing and read by every worker thread.
static struct options_t options;

/*
 * Sum the per-thread counters and derive the achieved QPS.
 *
 * now:        current timestamp in ns, on the same clock as
 *             options.s_ts_begin.
 * qps:        if non-null, receives received packets per second since
 *             options.s_ts_begin; 0 when no time has elapsed.
 * recved_pkt: if non-null, receives the total number of received packets.
 * total_loss: if non-null, receives the total number of lost packets.
 */
static inline void
calc_stats(uint64_t now, uint32_t *qps, uint32_t *recved_pkt,
    uint32_t *total_loss)
{
	uint32_t recv = 0;
	uint32_t loss = 0;

	for (const auto *thr : options.s_thr_info) {
		recv += thr->recved_pkts.load();
		loss += thr->lost_pkts.load();
	}

	if (recved_pkt != nullptr) {
		*recved_pkt = recv;
	}

	if (total_loss != nullptr) {
		*total_loss = loss;
	}

	if (qps != nullptr) {
		const uint64_t begin = options.s_ts_begin.load();

		// Guard the division: with zero elapsed time the quotient is
		// inf/NaN and converting that to uint32_t is undefined.
		if (now > begin) {
			const double elapsed_s = (double)(now - begin) /
			    (double)S2NS;
			*qps = (uint32_t)((double)recv / elapsed_s);
		} else {
			*qps = 0;
		}
	}
}

static void
proto_loop(struct thread_info *tinfo)
{
	struct rte_mbuf *tx_buf;
	struct rte_mbuf *rx_bufs[BURST_SIZE];
	struct pkt_hdr *pkt_data;

	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "proto_loop <thread %d>: waiting for SYNC from cat\n", tinfo->id);
	while (options.s_state.load() == STATE_SYNC) {
		const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
		    tinfo->rxqid, rx_bufs, BURST_SIZE);
		if (nb_rx > 0) {
			for (int i = 0; i < nb_rx; i++) {
				struct pkt_hdr *each = check_valid_packet(
				    rx_bufs[i], &options.s_host_spec.mac_addr);

				if (each != nullptr) {
					uint16_t type = rte_be_to_cpu_16(
					    each->type);
					if (type == PKT_TYPE_SYNC) {
						int expected = STATE_SYNC;

						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_INFO,
						    "proto_loop <thread %d>: received SYNC from cat\n",
						    tinfo->id);

						if (!options.s_state
							 .compare_exchange_strong(
							     expected,
							     STATE_SYNC_ACK)) {
							// someone barged in,
							// listen to that guy
							ntr(NTR_DEP_USER1,
							    NTR_LEVEL_WARNING,
							    "proto_loop <thread %d>: failed to cmpxchg sync_recv.\n",
							    tinfo->id);
						} else {
							pkt_hdr_to_netspec(each,
							    &options
								 .s_master_spec,
							    nullptr, nullptr,
							    nullptr);

							if (alloc_pkt_hdr(
								mempool_get(
								    tinfo
									->socket_id),
								PKT_TYPE_SYNC_ACK,
								&options
								     .s_master_cspec,
								0, &tx_buf,
								&pkt_data) !=
							    0) {
								rte_exit(
								    EXIT_FAILURE,
								    "failed to alloc pkt hdr\n");
							}

							tx_burst_all(
							    options.portid,
							    tinfo->txqid,
							    &tx_buf, 1);

							expected =
							    STATE_SYNC_ACK;
							// we've done our job,
							// set off the threads
							if (!options.s_state
								 .compare_exchange_strong(
								     expected,
								     STATE_RUNNING)) {
								rte_exit(
								    EXIT_FAILURE,
								    "state unexpectedly changed\n");
							}

							ntr(NTR_DEP_USER1,
							    NTR_LEVEL_INFO,
							    "proto_loop <thread %d>: sent SYNC_ACK to cat\n",
							    tinfo->id);
						}
					} else {
						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_DEBUG,
						    "proto_loop <thread %d>: ignoring invalid packet %p type %d.\n",
						    tinfo->id,
						    (void *)rx_bufs[i], type);
					}
				} else {
					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
					    "proto_loop <thread %d>: ignoring invalid packet %p.\n",
					    tinfo->id, (void *)rx_bufs[i]);
					//dump_pkt(rx_bufs[i]);
				}

				rte_pktmbuf_free(rx_bufs[i]);
			}
		}
	}

	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
	    "proto_loop <thread %d>: exiting loop...\n", tinfo->id);
}

static void
pkt_loop(struct thread_info *tinfo)
{
	struct rte_mbuf *tx_bufs[BURST_SIZE];
	struct rte_mbuf *rx_bufs[BURST_SIZE];
	std::vector<struct epoch_info *> recved_epochs;
	std::map<unsigned int, struct epoch_info *> sent_epochs;
	uint64_t cur_epoch = 0;
	uint64_t next_ts;
	uint64_t last_recv_ts = 0;
	struct conn_spec srv_cspec;
	rdport_generator src_port_gen(MIN_RANDOM_PORT);
	rdport_generator dst_port_gen(MIN_RANDOM_PORT);

	srv_cspec.src = &options.s_host_spec;
	srv_cspec.dst = &options.server_spec;

	next_ts = topo_uptime_ns();

	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "pkt_loop <thread %d>: entering\n",
	    tinfo->id);

	while (options.s_state.load() == STATE_RUNNING) {
		uint64_t now = topo_uptime_ns();
		// always pop incoming packets
		const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
		    tinfo->rxqid, rx_bufs, BURST_SIZE);

		if (nb_rx > 0) {
			for (int i = 0; i < nb_rx; i++) {
				struct pkt_hdr *each = check_valid_packet(
				    rx_bufs[i], &options.s_host_spec.mac_addr);

				if (each == nullptr) {
					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
					    "pkt_loop <thread %d>: ignoring invalid packet %p.\n",
					    tinfo->id, (void *)rx_bufs[i]);
					rte_pktmbuf_free(rx_bufs[i]);
					continue;
				}

				uint16_t type = rte_be_to_cpu_16(each->type);
				NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, each,
				    "locore_main <thread %d>: ", tinfo->id);
				struct pkt_payload_epoch *pld_epoch;
				struct epoch_info *einfo;
				uint32_t epoch;
				uint32_t id;
				struct thread_info *other_t;
				int int_expected = STATE_RUNNING;
				switch (type) {
				case PKT_TYPE_LOAD_RESP:
					pld_epoch = (struct pkt_payload_epoch *)
							each->payload;
					epoch = rte_be_to_cpu_32(
					    pld_epoch->epoch);
					id = epoch_get_id(epoch);

					// printf("Load resp size : %d\n",
					// rx_bufs[i]->data_len);

					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
					    "pkt_loop <thread %d>: packet %p epoch 0x%x id %d.\n",
					    tinfo->id, (void *)rx_bufs[i],
					    epoch, id);

					if (id >= options.s_num_threads) {
						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_WARNING,
						    "pkt_loop <thread %d>: packet %p invalid id %d.\n",
						    tinfo->id,
						    (void *)rx_bufs[i], id);
						break;
					}

					einfo = new struct epoch_info;
					einfo->epoch = epoch;
					einfo->ts = now;

					other_t = options.s_thr_info.at(id);
					other_t->mtx.lock();
					other_t->recved_epochs.push_back(einfo);
					other_t->mtx.unlock();

					break;
				case PKT_TYPE_FIN:
					if (rte_is_same_ether_addr(
						&each->eth_hdr.src_addr,
						&options.s_master_spec
						     .mac_addr)) {
						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_DEBUG,
						    "pkt_loop <thread %d>: recved FIN from cat.\n",
						    tinfo->id);
						// master told us to stop!
						if (!options.s_state
							 .compare_exchange_strong(
							     int_expected,
							     STATE_FIN)) {
							ntr(NTR_DEP_USER1,
							    NTR_LEVEL_WARNING,
							    "pkt_loop <thread %d>: failed to cmpxchg state.\n",
							    tinfo->id);
						}

						uint32_t qps;
						uint32_t total_recv;
						uint32_t total_loss;

						calc_stats(now, &qps,
						    &total_recv, &total_loss);

						struct pkt_hdr *pkt_hdr;
						if (alloc_pkt_hdr(
							mempool_get(
							    tinfo->socket_id),
							PKT_TYPE_FIN_ACK,
							&options.s_master_cspec,
							0, &tx_bufs[0],
							&pkt_hdr) != 0) {
							rte_exit(EXIT_FAILURE,
							    "failed to allocate pkt hdr\n");
						}

						auto pld_qps =
						    (struct pkt_payload_qps *)
							pkt_hdr->payload;
						pld_qps->qps = rte_cpu_to_be_32(
						    qps);
						pld_qps->recved_pkts =
						    rte_cpu_to_be_32(
							total_recv);
						pld_qps->lost_pkts =
						    rte_cpu_to_be_32(
							total_loss);

						tx_burst_all(options.portid,
						    tinfo->txqid, &tx_bufs[0],
						    1);

						options.s_state.store(
						    STATE_FIN);

						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_DEBUG,
						    "pkt_loop <thread %d>: sent FIN_ACK to cat. QPS = %d.\n",
						    tinfo->id, qps);
					} else {
						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_WARNING,
						    "pkt_loop <thread %d>: invalid FIN packet from a different cat.\n",
						    tinfo->id);
					}
					break;
				default:
					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
					    "pkt_loop: ignoring packet %p with unknown type %d.\n",
					    (void *)rx_bufs[i], type);
				}

				rte_pktmbuf_free(rx_bufs[i]);
			}
		}

		// dequeue receved epochs
		struct epoch_info *einfo;
		tinfo->mtx.lock();
		while (!tinfo->recved_epochs.empty()) {
			// only dequeue, process later
			einfo = tinfo->recved_epochs.front();
			tinfo->recved_epochs.pop_front();

			// XXX: might call into the allocator
			// otherwise we need to have an array and do batching
			// => complex code and don't think it's worth it
			recved_epochs.push_back(einfo);
		}
		tinfo->mtx.unlock();

		if (!recved_epochs.empty())
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "pkt_loop <thread %d>: dequeued %lu received epochs\n",
			    tinfo->id, recved_epochs.size());

		// process epochs
		while (!recved_epochs.empty()) {
			einfo = recved_epochs.back();
			recved_epochs.pop_back();

			auto it = sent_epochs.find(einfo->epoch);
			if (it != sent_epochs.end()) {
				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
				    "pkt_loop <thread %d>: received epoch 0x%x\n",
				    tinfo->id, epoch_get_epoch(einfo->epoch));

				if (einfo->ts > last_recv_ts) {
					last_recv_ts = einfo->ts;
				}
				delete it->second;
				sent_epochs.erase(it);
				tinfo->recved_pkts.fetch_add(1);
			} else {
				// we recved an epoch we never sent
				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
				    "pkt_loop <thread %d>: received epoch 0x%x but never sent it. Packet loss?\n",
				    tinfo->id, einfo->epoch);
			}
			delete einfo;
		}

		// handle packet loss
		for (auto it = sent_epochs.begin(); it != sent_epochs.end();) {
			einfo = it->second;
			if (now - einfo->ts >
			    options.pkt_loss_delay_ms * MS2NS) {
				// timed out
				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
				    "pkt_loop <thread %d>: epoch 0x%x is lost after not receiving for too long\n",
				    tinfo->id, einfo->epoch);

				delete it->second;
				it = sent_epochs.erase(it);
				tinfo->lost_pkts.fetch_add(1);
			} else {
				++it;
			}
		}

		// check to send the next packet
		uint32_t total_send = 0;
		while (now >= next_ts && sent_epochs.size() < options.depth &&
		    total_send < BURST_SIZE) {
			struct pkt_payload_load *pld_load;
			struct pkt_hdr *pkt_data;
			next_ts += (int)(tinfo->ia_gen->generate() * S2NS);

			// change dst port for every packet for RSS
			srv_cspec.dst_port = dst_port_gen.next();
			srv_cspec.src_port = src_port_gen.next();
			if (alloc_pkt_hdr(mempool_get(tinfo->socket_id),
				PKT_TYPE_LOAD, &srv_cspec, options.pkt_pad_sz,
				&tx_bufs[total_send], &pkt_data) != 0) {
				rte_exit(EXIT_FAILURE,
				    "failed to allocate pkt hdr\n");
			}

			pld_load = (struct pkt_payload_load *)pkt_data->payload;
			pld_load->type = rte_cpu_to_be_32(options.workload_type);
			pld_load->arg0 = rte_cpu_to_be_32((uint32_t)tinfo->load_gen0->generate());
			pld_load->arg1 = rte_cpu_to_be_32((uint32_t)tinfo->load_gen1->generate());
			unsigned int epoch = epoch_mk(tinfo->id, cur_epoch);
			pld_load->epoch = rte_cpu_to_be_32(epoch);
			cur_epoch++;

			einfo = new struct epoch_info;
			einfo->epoch = epoch;
			einfo->ts = now;
			sent_epochs.insert({ epoch, einfo });

			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "pkt_loop <thread %d>: sending packet %p with epoch 0x%x\n",
			    tinfo->id, (void *)tx_bufs[total_send], epoch);

			total_send++;
		}

		tx_burst_all(options.portid, tinfo->txqid, tx_bufs, total_send);

		// check rage quit only when we have sent a packet
		if (last_recv_ts == 0) {
			last_recv_ts = topo_uptime_ns();
		}
		if (topo_uptime_ns() >
		    options.rage_quit_time * MS2NS + last_recv_ts) {
			rte_exit(EXIT_FAILURE,
			    "rat: thread %d waiting too long for resp. I F QUIT!\n",
			    tinfo->id);
		}
	}

	// clean up
	for (auto it = sent_epochs.begin(); it != sent_epochs.end();) {
		delete it->second;
		++it;
	}
	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
	    "pkt_loop <thread %d>: exiting loop...\n", tinfo->id);
}

static int
locore_main(void *tif)
{
	auto tinfo = (struct thread_info *)tif;
	uint32_t core_id = rte_lcore_id();

	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "locore_main <thread %d>: running on core %d rxqid %d txqid %d...\n", tinfo->id,
	    core_id, tinfo->rxqid, tinfo->txqid);

	if (rte_eth_dev_socket_id(options.portid) > 0 &&
	    rte_eth_dev_socket_id(options.portid) != (int)rte_socket_id()) {
		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
		    "locore_main <thread %d>: WARNING, port %d is on remote NUMA node to "
		    "polling thread.\n\tPerformance will "
		    "not be optimal.\n",
		    tinfo->id, options.portid);
	}

	if (options.slave_mode == 1) {
		// perform rat protocol
		proto_loop(tinfo);
	}

	// wait for the primary thread sending SYNC_ACK
	while (options.s_state.load() != STATE_RUNNING) {
	}
	// store the current timestamp
	options.s_ts_begin.store(topo_uptime_ns());
	pkt_loop(tinfo);

	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main <thread %d>: exited\n",
	    tinfo->id);

	return 0;
}

/*
 * Log the effective configuration at INFO level.
 *
 * The conversion specifiers match the argument types: unsigned fields use
 * %u ("%ul" would print the value followed by a stray literal 'l').
 */
static void
dump_options()
{
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "Configuration:\n"
	    "    verbosity = +%d\n"
	    "    run time = %u\n"
	    "    num threads = %u\n"
	    "    rage quit time = %u\n"
	    "    slave mode = %d\n"
	    "    interarrival dist = %s\n"
	    "    workload type = %u\n"
	    "    workload arg0 = %s\n"
	    "    workload arg1 = %s\n"
	    "    qps = %u\n"
	    "    host IP = 0x%x\n"
	    "    depth = %u\n"
	    "    packet loss time threshold = %u\n"
	    "    jumbo frame = %d\n"
	    "    packet pad size = %d\n"
	    "    portid = %d\n",
	    ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.run_time,
	    options.s_num_threads, options.rage_quit_time, options.slave_mode,
	    options.ia_gen, options.workload_type, options.load_gen[0],
	    options.load_gen[1], options.target_qps, options.s_host_spec.ip,
	    options.depth, options.pkt_loss_delay_ms,
	    options.jumbo_frame_enabled, options.pkt_pad_sz, options.portid);
}

// Write the command line help text to stdout.
static void
usage()
{
	static const char help_text[] =
	    "Usage:\n"
	    "    -v(vv): verbose mode\n"
	    "    -h: display the information\n"
	    "    -t: run time\n"
	    "    -s: server net spec\n"
	    "    -S: slave(rat) mode\n"
	    "    -A: affinity mask\n"
	    "    -i: inter-arrival time distribution\n"
	    "    -w: workload type\n"
	    "	 -w (repeated): workload arg0 distribution\n"
	    "	 -w (repeated): workload arg1 distribution\n"
	    "    -r: rage quit time (in ms)\n"
	    "    -q: target QPS\n"
	    "    -H: host net spec\n"
	    "    -D: max number of packets in flight\n"
	    "    -l: packet loss time threshold\n"
	    "    -J: enable jumbo frame\n"
	    "    -P: pad load packets to this size\n"
	    "    -p: portid\n";

	// the text contains no conversion specifiers, so printing it through
	// "%s" emits exactly the same bytes
	fprintf(stdout, "%s", help_text);
}

/*
 * Program entry point.
 *
 * Order of operations: initialize the EAL, parse the application's own
 * options into the global `options`, initialize libtopo/libnms, configure
 * the port and memory pools, create one thread_info per selected core,
 * launch the workers, wait until the run ends (run_time ticks in non-slave
 * mode, STATE_FIN set by a worker in slave mode), join the workers, log the
 * final stats and release resources.
 */
int
main(int argc, char *argv[])
{
	struct thread_info *tinfo;
	bool has_host_spec = false;

	ntr_init();

	// init dpdk
	int ret = rte_eal_init(argc, argv);
	if (ret < 0) {
		rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n");
	}

	// skip past the arguments rte_eal_init() reported as consumed so that
	// getopt below only sees the application's own options
	argc -= ret;
	argv += ret;

	// set warning level
	ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
	{
		int c;
		// number of -w options seen so far: the first is the workload
		// type, the following ones are the workload argument generators
		int num_of_ws = 0;
		// parse arguments
		while ((c = getopt(argc, argv,
			    "vht:s:SA:i:w:r:q:H:D:l:JP:p:")) != -1) {
			switch (c) {
			case 'v':
				// each -v raises the log level by one
				ntr_set_level(NTR_DEP_USER1,
				    ntr_get_level(NTR_DEP_USER1) + 1);
				break;
			case 'h':
				usage();
				rte_exit(EXIT_SUCCESS, "\n");
			case 't':
				options.run_time = strtol(optarg, nullptr, 10);
				break;
			case 's':
				if (str_to_netspec(optarg,
					&options.server_spec) != 0) {
					rte_exit(EXIT_FAILURE,
					    "invalid server net spec\n");
				}
				break;
			case 'S':
				options.slave_mode = 1;
				options.s_state =
				    STATE_SYNC; // set state to wait for SYNC
				break;
			case 'A':
				// one worker thread per cpu in the list
				cpulist_to_cpuset(optarg, &options.cpu_set);
				options.s_num_threads = CPU_COUNT(
				    &options.cpu_set);
				if (options.s_num_threads == 0) {
					rte_exit(EXIT_FAILURE,
					    "invalid cpu mask %s\n", optarg);
				}
				break;
			case 'i':
				// copy at most size-1 bytes; the last byte of
				// the buffer keeps its initial NUL
				strncpy(options.ia_gen, optarg,
				    sizeof(options.ia_gen) - 1);
				break;
			case 'w':
				if (num_of_ws == 0) {
					options.workload_type = strtol(optarg, NULL, 10);
					if (options.workload_type >= LOAD_TYPE_MAX) {
						rte_exit(EXIT_FAILURE,
					    	"invalid workload type %s\n", optarg);
					}
				} else if (num_of_ws <= WORKLOAD_MAX_ARGS) {
					// 2nd/3rd -w: generator for arg0/arg1;
					// further -w options are ignored
					strncpy(options.load_gen[num_of_ws - 1], optarg, 255);
				}

				num_of_ws++;
				break;
			case 'r':
				options.rage_quit_time = strtol(optarg, nullptr,
				    10);
				break;
			case 'q':
				options.target_qps = strtol(optarg, nullptr,
				    10);
				break;
			case 'H':
				has_host_spec = true;
				if (str_to_netspec(optarg,
					&options.s_host_spec) != 0) {
					rte_exit(EXIT_FAILURE,
					    "invalid host net spec.\n");
				}
				break;
			case 'D':
				options.depth = strtol(optarg, nullptr, 10);
				// 0 means "no limit"
				if (options.depth == 0) {
					options.depth = UINT32_MAX;
				}
				break;
			case 'l':
				options.pkt_loss_delay_ms = strtol(optarg,
				    nullptr, 10);
				// 0 means "no limit"
				if (options.pkt_loss_delay_ms == 0) {
					options.pkt_loss_delay_ms = UINT32_MAX;
				}
				break;
			case 'J':
				options.jumbo_frame_enabled = true;
				options.port_mtu = MAX_JUMBO_MTU;
				break;
			case 'P':
				options.pkt_pad_sz = strtol(optarg, nullptr,
				    10);
				break;
			case 'p':
				options.portid = strtol(optarg, nullptr, 10);
				break;
			default:
				usage();
				rte_exit(EXIT_FAILURE, "unknown argument: %c\n",
				    c);
			}
		}
	}

	// a non-zero pad size must fit into a packet of the configured mtu
	if (options.pkt_pad_sz != 0 &&
	    options.pkt_pad_sz > mtu_to_pkt_size(options.port_mtu)) {
		rte_exit(EXIT_FAILURE, "pkt_pad_sz is too large for mtu %d\n",
		    options.port_mtu);
	}

	// -H is mandatory
	if (!has_host_spec) {
		rte_exit(EXIT_FAILURE, "Must specify host IP.\n");
	}

	// init libtopo
	if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
	    0) {
		rte_exit(EXIT_FAILURE, "libtopo init failed!\n");
	}

	if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
	    0) {
		rte_exit(EXIT_FAILURE, "libnms init failed!\n");
	}

	dump_options();

	// configure memory and port
	struct port_conf pconf;
	struct device_conf dconf;
	struct mem_conf mconf;
	portconf_get(options.portid, &pconf);
	if (!pconf.timesync) {
		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
		    "main: timesync disabled. hw timestamp unavailable.\n ");
	}
	// device settings: mtu/cores/port from the options, offload and RSS
	// settings from the port configuration queried above
	dconf.mtu = options.port_mtu;
	CPU_COPY(&options.cpu_set, &dconf.core_affinity);
	dconf.portid = options.portid;
	dconf.rss_hf = pconf.rss_hf;
	dconf.rx_offloads = pconf.rxoffload;
	dconf.tx_offloads = pconf.txoffload;
	dconf.timesync = pconf.timesync;

	dconf.rx_fn = nullptr;
	dconf.rx_user = nullptr;
	dconf.rx_ring_sz = 2048;
	dconf.tx_fn = nullptr;
	dconf.tx_user = nullptr;
	dconf.tx_ring_sz = 2048;

	// memory pool settings; the data room is sized so that a jumbo frame
	// fits regardless of whether -J was given
	mconf.cache_size = 512;
	mconf.priv_size = 0;
	mconf.num_elements = (dconf.rx_ring_sz + dconf.tx_ring_sz) *
	    rte_lcore_count() / rte_socket_count();
	mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_JUMBO_MTU -
	    MAX_STANDARD_MTU;
	mconf.max_pools = -1;

	dpdk_init(&dconf, &mconf);

	// the host spec's mac address always comes from the port itself
	if (rte_eth_macaddr_get(options.portid,
		&options.s_host_spec.mac_addr) != 0) {
		rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
		    options.portid);
	}

	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n",
	    options.portid, options.s_host_spec.mac_addr.addr_bytes[0],
	    options.s_host_spec.mac_addr.addr_bytes[1],
	    options.s_host_spec.mac_addr.addr_bytes[2],
	    options.s_host_spec.mac_addr.addr_bytes[3],
	    options.s_host_spec.mac_addr.addr_bytes[4],
	    options.s_host_spec.mac_addr.addr_bytes[5]);

	// Create one thread_info per cpu in the set. CPU_FFS is 1-based and
	// returns 0 once the set is empty; each handled cpu is cleared from
	// options.cpu_set, so the set is empty after this loop.
	unsigned int cpuset_idx = CPU_FFS(&options.cpu_set);
	unsigned int tid = 0;
	while (cpuset_idx != 0) {
		unsigned int lcore_id = cpuset_idx - 1;
		tinfo = new thread_info;
		tinfo->ia_gen = createGenerator(options.ia_gen);
		tinfo->load_gen0 = createGenerator(options.load_gen[0]);
		tinfo->load_gen1 = createGenerator(options.load_gen[1]);
		if (tinfo->ia_gen == nullptr || tinfo->load_gen0 == nullptr || tinfo->load_gen1 == nullptr) {
			rte_exit(EXIT_FAILURE,
			    "invalid ia_gen or ld_gen string\n");
		}
		// every thread generates an equal share of the target QPS
		tinfo->ia_gen->set_lambda((double)options.target_qps /
		    (double)(options.s_num_threads));
		tinfo->id = tid;
		tinfo->lcore_id = lcore_id;
		tinfo->socket_id = rte_lcore_to_socket_id(lcore_id);
		// thread n uses rx/tx queue n
		tinfo->rxqid = tid;
		tinfo->txqid = tid;
		options.s_thr_info.push_back(tinfo);

		tid++;
		CPU_CLR(lcore_id, &options.cpu_set);
		cpuset_idx = CPU_FFS(&options.cpu_set);
	}

	sleep(INIT_DELAY);

	// start locore_main on every worker's lcore
	for (unsigned int i = 0; i < options.s_num_threads; i++) {
		tinfo = options.s_thr_info.at(i);
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
		    "main: launching thread %d on locore %d\n", tinfo->id,
		    tinfo->lcore_id);
		if (rte_eal_remote_launch(locore_main,
			(void *)options.s_thr_info.at(i),
			tinfo->lcore_id) != 0) {
			rte_exit(EXIT_FAILURE,
			    "failed to launch function on locore %d\n",
			    tinfo->lcore_id);
		}
	}

	// poor man's timer
	uint32_t second = 0;
	// this loop exit is signaled by SYNC_FIN in slave mode and by itself in
	// non slave mode
	// (in slave mode the loop body does nothing, i.e. this thread
	// busy-polls s_state)
	while (options.s_state.load() != STATE_FIN) {
		if (options.slave_mode != 1) {
			if (second >= options.run_time) {
				options.s_state.store(STATE_FIN);
				break;
			}
			usleep(1 * S2US);
			second++;
		}
	}

	// join all workers
	for (unsigned int i = 0; i < options.s_num_threads; i++) {
		tinfo = options.s_thr_info.at(i);
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
		    "main: waiting for locore %d...\n", tinfo->lcore_id);
		if (rte_eal_wait_lcore(tinfo->lcore_id) != 0) {
			rte_exit(EXIT_FAILURE, "failed to wait for locore %d\n",
			    tinfo->lcore_id);
		}
	}

	// final stats, measured up to now
	uint32_t qps;
	uint32_t total_recv;
	uint32_t total_loss;
	calc_stats(topo_uptime_ns(), &qps, &total_recv, &total_loss);
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "qps = %d, recv = %d, loss = %d\n",
	    qps, total_recv, total_loss);

	// release the per-thread generators and thread_info objects
	for (auto each : options.s_thr_info) {
		delete each->load_gen0;
		delete each->load_gen1;
		delete each->ia_gen;
		delete each;
	}

	// clean up
	dpdk_cleanup(&dconf);

	return 0;
}