#include <sys/endian.h>
#include <sys/errno.h>
#include <sys/signal.h>
#include <sys/types.h>
#include <fcntl.h>
#include <getopt.h>
#include <pthread.h>
#include <pthread_np.h>
#include <threads.h>
#include <unistd.h>
#include <aio.h>
#include <getopt.h>
#include <sys/ioctl.h>
#include <sys/disk.h>

#include <cerrno>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <chrono>
#include <list>
#include <set>

#include "gen.hh"
#include "ntr.h"
#include "defs.hh"
#include "nm.hh"
#include "storage/io_gen.hh"

/*
 * Current reading of std::chrono::high_resolution_clock, expressed as
 * nanoseconds since that clock's epoch.
 */
static inline uint64_t get_cur_ts_nano()
{
    using namespace std::chrono;

    const auto since_epoch = high_resolution_clock::now().time_since_epoch();
    return duration_cast<nanoseconds>(since_epoch).count();
}

/*
 * Benchmark configuration: size limits for the string-valued options,
 * followed by the options_t struct that holds every command-line tunable.
 */
/* Byte capacities of the fixed-size string buffers that hold option values. */
static constexpr unsigned long MAX_SPEC_LEN{32UL};
static constexpr unsigned long MAX_DEV_NAME_LEN{32UL};
static constexpr unsigned long MAX_OUTPUT_FILE_LEN{256UL};
/*
 * All tunables of one benchmark run. The initializers below are the defaults
 * used when the corresponding command-line flag is not given.
 */
struct options_t {
	// value handed to nm_init() by main()
	int verbosity = NTR_LEVEL_DEFAULT;
	// number of worker threads; main() derives it from cpumask when -a is given
	int num_threads = 1;
	// -a: hexadecimal CPU mask, consumed via cmask_get_next_cpu() to place workers
	unsigned long cpumask = 1;
	// -P: IO pattern spec; parse_pattern() turns it into addr_mode and read_pct
	char pattern_spec[MAX_SPEC_LEN] = "R,100";
	// -I: inter-arrival distribution spec, passed to createGenerator()
	char ia_spec[MAX_SPEC_LEN] = "fixed";
	
	// -t: total run time, counted in ticks of the main loop (presumably seconds)
	unsigned int time = 5;
	// -w: warmup time in the same unit; requests started earlier are not reported
	unsigned int warmup = 2;
	// -Q: number of io_request objects each worker preallocates
	unsigned int queue_depth = 1;
	// -D: path handed to open() to reach the device under test
	char dev_name[MAX_DEV_NAME_LEN] = "Malloc0";
	// -k: driver name; within this file it is only printed by dump_options()
	char driver_name[MAX_DEV_NAME_LEN] = "bdev";
	// filled in by parse_pattern() from pattern_spec
	unsigned int read_pct = 0;
	// filled in by parse_pattern() from pattern_spec
	io_generator_address_mode addr_mode = IOGEN_ADDR_UNIFORM_RANDOM;

	// -o: file that receives one latency value per recorded request
	char output_file[MAX_OUTPUT_FILE_LEN] = "output.txt";

	// -b: size in bytes of every IO request (used as aio_nbytes)
	unsigned long req_size = 4096;
	// -q: total requests per second; split evenly across workers for set_lambda()
	unsigned long rps = 0;
};


std::atomic<int> worker_thread_init_cnt(0);
std::atomic<int> worker_thread_stop_cnt(0);
std::atomic<int> worker_start(0);
std::atomic<int> worker_stop(0);
static struct options_t options;

/*
 * Latency sample of one successfully completed IO. Both fields hold values
 * produced by get_cur_ts_nano().
 */
struct io_record {
	/* timestamp taken right before the request was submitted */
	uint64_t start_ts;
	/* timestamp of the worker loop iteration that observed the completion */
	uint64_t end_ts;
};

/*
 * One reusable IO slot of a worker thread. Each worker preallocates
 * options.queue_depth of these and moves them between its free list and its
 * in-progress list.
 */
struct io_request {
	/* get_cur_ts_nano() value captured when the request was last issued */
	uint64_t start_ts;
	/* operation chosen by the io_generator for the current submission */
	io_generator_opcode op;
	/* destination that completed reads are copied into from dma_buf */
	char * user_buf;
	/* buffer registered as aio_buf, i.e. the one the kernel reads/writes */
	char * dma_buf;
	/* POSIX AIO control block used for submission and completion polling */
	struct aiocb aio;
};

/*
 * Per-worker state. birb_main() creates and fills one instance per worker and
 * passes it as the thread argument; the worker reports its results through it.
 * The creator zero-fills the struct with memset(), so it must stay trivially
 * copyable.
 */
struct thread_context {
	/* sequential worker index, starting at 0 */
	unsigned int tid;
	/* CPU core the worker is pinned to */
	unsigned int coreid;
	/* value of nm_get_node_from_core(coreid); passed to nm_malloc()/nm_free() */
	unsigned int sockid;
	/* handle of the underlying pthread */
	pthread_t sys_thread;
	/* descriptor of the device under test, shared by all workers */
	int disk_fd;

	/* byte offset on the device where this worker's private region starts */
	unsigned long start_region_offset;
	/* size in bytes of this worker's private region */
	unsigned long start_region_length;

	/* modified by worker threads */
	/* one heap-allocated record per successfully completed IO */
	std::list<io_record *> *io_records;
	/* statistics (ns) of the time between two consecutive worker loop iterations */
	uint64_t overhead_avg;
	uint32_t overhead_cnt;
	uint64_t overhead_max;
	uint64_t overhead_min;
};

/*
 * Log the effective configuration held in the global `options` at INFO level.
 *
 * Every conversion specifier matches the type of its argument: the unsigned
 * fields are printed with %u/%lu and the addressing-mode enum is converted to
 * int explicitly before being printed with %d.
 */
static void dump_options()
{
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: Options:\n"
					"    dev name: %s\n"
					"    driver name: %s\n"
					"    worker threads: 0x%lx\n"
					"    number of threads: %d\n"
					"    IO request size: %lu\n"
					"    IO requests per second: %lu\n"
					"    IO pattern: %s\n"
					"    IO queue depth: %u\n"
					"    IO addressing mode: %d\n"
					"    read percent: %u\n"
					"    inter-arrival dist: %s\n"
					"    run time: %u\n"
					"    warmup time: %u\n"
					"    output file: %s\n",
					options.dev_name,
					options.driver_name,
					options.cpumask,
					options.num_threads,
					options.req_size,
					options.rps,
					options.pattern_spec,
					options.queue_depth,
					static_cast<int>(options.addr_mode),
					options.read_pct,
					options.ia_spec,
					options.time,
					options.warmup,
					options.output_file
	);
}

/*
 * Print the command-line help to stdout. The list mirrors the flags handled
 * by the option switch in main(), including -h.
 */
static void usage()
{
	fprintf(stdout, 
		" -h: show this help\n"
		" -V(VV): verbose mode\n"
		" -D: dev name\n"
		" -k: driver to use (default bdev)\n"
		" -a: worker threads spec (0x3 = spawn 2 threads on core 1 & 2)\n"
		" -b: IO request size\n"
		" -q: IO requests per second\n"
		" -P: IO request pattern\n"
		" -Q: IO request queue depth\n"
		" -I: inter-arrival time distribution\n"
		" -t: total run time\n"
		" -w: warm up time\n"
		" -o: latency response output file\n");
}

static void * 
worker_thread_main(void * arg)
{
	int rc = 0;
 
	auto *ctx = (struct thread_context *)arg;
	std::list<struct io_request *> free_ios;
	std::list<struct io_request *> prog_ios;

	Generator * ia_gen = nullptr;
	io_generator * io_gen = nullptr;

	struct io_generator_ctx io_ctx;
	uint64_t next_ts;
	uint64_t a_offset;
	uint64_t last_loop_ts = 0;

	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: init...\n", ctx->tid);

	// create io request objects
	for (unsigned int i = 0; i < options.queue_depth; i++) {
		auto buf = (char *)nm_malloc(ctx->sockid, options.req_size);
		auto user_buf = (char *)nm_malloc(ctx->sockid, options.req_size);

		if (buf == nullptr || user_buf == nullptr) {
			ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate buffers!\n", ctx->tid);
			rc = ENOMEM;
			goto cleanup;
		}

		auto io_req = new struct io_request;
		io_req->dma_buf = buf;
		io_req->user_buf = user_buf;
		io_req->aio.aio_fildes = ctx->disk_fd;
		io_req->aio.aio_nbytes = options.req_size;
		io_req->aio.aio_buf = buf;
		io_req->aio.aio_sigevent.sigev_notify = SIGEV_NONE;
		io_req->aio.aio_reqprio = 0;

		free_ios.push_back(io_req);
	}

	// init thread local states
	ia_gen = createGenerator(options.ia_spec);
	if (ia_gen == nullptr) {
		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate ia generator!\n", ctx->tid);
		rc = EINVAL;
		goto cleanup;
	}
	ia_gen->set_lambda((double)options.rps / (double)(options.num_threads));

	io_gen = new io_generator(options.req_size, ctx->start_region_length, options.read_pct, options.addr_mode);
	if (io_gen == nullptr) {
		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate ia generator!\n", ctx->tid);
		rc = EINVAL;
		goto cleanup;
	}

	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: init complete.\n", ctx->tid);

	worker_thread_init_cnt.fetch_add(1);

	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: waiting for start...\n", ctx->tid);

	while (worker_start.load() == 0) {}

	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: started...\n", ctx->tid);
	
	/* random delay 0-100 us */
	usleep(nm_get_uptime_ns() % 100);

	next_ts = get_cur_ts_nano();
	
	while (true) {
		uint64_t cur_ts = get_cur_ts_nano();
		if (last_loop_ts > 0) {
			uint64_t overhead = cur_ts - last_loop_ts;
			if (ctx->overhead_max < overhead) {
				ctx->overhead_max = overhead;
			}

			if (ctx->overhead_min > overhead) {
				ctx->overhead_min = overhead;
			}

			ctx->overhead_avg = ctx->overhead_avg * ctx->overhead_cnt + overhead;
			ctx->overhead_cnt++;
			ctx->overhead_avg /= ctx->overhead_cnt;
		}
		last_loop_ts = cur_ts;

		// process io completion
		auto itr = prog_ios.begin();
		while (itr != prog_ios.end()) {
			int err;
			struct io_request * ioreq = *itr;
			if ((err = aio_error(&ioreq->aio)) != EINPROGRESS) {
				if (err == 0) {
					auto rec = new struct io_record;
					rec->start_ts = ioreq->start_ts;
					rec->end_ts = cur_ts;

					ctx->io_records->push_back(rec);
					if (ioreq->op == IOGEN_READ) {
						memcpy(ioreq->user_buf, ioreq->dma_buf, options.req_size);
					}
					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d <worker_io_complete>: completed io request type %d\n", ctx->tid, ioreq->op);
					
				} else {
					ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "thread %d: aio failed with %d...\n", ctx->tid, err);
				}

				if (aio_return(&ioreq->aio) == -1) {
					ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: aio_return failed with %d...\n", ctx->tid, errno);
					exit(errno);
				}

				/* cleanup */
				itr = prog_ios.erase(itr);
				free_ios.push_back(ioreq);
			} else {
				++itr;
			}
		}

		if (worker_stop.load() == 1) {
			if (free_ios.size() >= options.queue_depth) {
				break;
			}
		} else {
			if (!free_ios.empty()) {
				auto io_req = free_ios.front();

				cur_ts = get_cur_ts_nano();

				if (cur_ts >= next_ts) {
					io_gen->issue(&io_ctx, io_req->dma_buf);

					a_offset = io_ctx.offset + ctx->start_region_offset;

					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: issuing IO type %d at offset 0x%lx size 0x%lx...\n", ctx->tid, io_ctx.op, a_offset, io_ctx.size);

					io_req->start_ts = cur_ts;
					io_req->op = io_ctx.op;
					io_req->aio.aio_offset = a_offset;

					if(io_ctx.op == IOGEN_READ) {
						rc = aio_read(&io_req->aio);
					} else {
						rc = aio_write(&io_req->aio);
					}

					if (rc != 0) {
						ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "thread %d: failed to issue io %d, retrying...\n", ctx->tid, errno);
					} else {
						free_ios.pop_front();
						prog_ios.push_back(io_req);
						next_ts = next_ts + ia_gen->generate() * S2NS;
					}
				}
			}
		}
	}

cleanup:
	while (!free_ios.empty()) {
		auto req = free_ios.front();
		free_ios.pop_front();
		nm_free(ctx->sockid, req->dma_buf);
		nm_free(ctx->sockid, req->user_buf);
	}

	if (ia_gen != nullptr) {
		delete ia_gen;
	}

	if (io_gen != nullptr) {
		delete io_gen;
	}

	worker_thread_stop_cnt.fetch_add(1);

	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: stopped...\n", ctx->tid);

	return nullptr;
}


/*
 * Parse an IO pattern spec of the form "<mode>,<read percent>".
 *
 * pattern   NUL-terminated spec; it is only read, never modified, so the
 *           caller can still print it in full afterwards.
 * read_pct  receives the decimal number found in the second field, capped
 *           at 100.
 * addr_mode receives IOGEN_ADDR_MONOTONIC_INCREASING when the first field is
 *           exactly "M", IOGEN_ADDR_UNIFORM_RANDOM for any other first field.
 *
 * Fields are separated by ','; empty fields are skipped. An output whose
 * field is missing from the spec is left untouched, so the caller's preset
 * value stays in effect.
 */
static void
parse_pattern(char * pattern, unsigned int * read_pct, io_generator_address_mode * addr_mode)
{
	if (pattern == nullptr) {
		return;
	}

	/* first field: skip leading separators, then measure up to the next one */
	const char * mode = pattern + strspn(pattern, ",");
	const size_t mode_len = strcspn(mode, ",");
	if (mode_len == 0) {
		return;
	}

	if (mode_len == 1 && mode[0] == 'M') {
		*addr_mode = IOGEN_ADDR_MONOTONIC_INCREASING;
	} else {
		*addr_mode = IOGEN_ADDR_UNIFORM_RANDOM;
	}

	/* second field: strtoull() stops by itself at the next ',' or at NUL */
	const char * pct = mode + mode_len;
	pct += strspn(pct, ",");
	if (*pct == '\0') {
		return;
	}

	const unsigned long long parsed = strtoull(pct, nullptr, 10);
	*read_pct = parsed > 100 ? 100u : static_cast<unsigned int>(parsed);
}

static void
birb_main()
{
	int rc = 0;
	std::list<struct thread_context *> worker_threads;
	std::ofstream output_file;

	unsigned long record_cutoff_time = 0;
	unsigned long current_s = 0;
	unsigned int total_reqs = 0;
	unsigned int tid = 0;
	unsigned long per_thread_cap = 0;
	int cur_core;
	int disk_fd;
	off_t disk_size;
	u_int disk_sec_size;

	/* initialize driver */
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: initializing device driver for device %s\n", options.dev_name);
	disk_fd = open(options.dev_name, O_RDWR | O_DIRECT);
	if (disk_fd == -1) {
		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to open device - %d\n", errno);
		exit(errno);
	}

	rc = ioctl(disk_fd, DIOCGMEDIASIZE, &disk_size);
	if (rc == -1) {
		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to obtain disk size - %d\n", errno);
		exit(errno);
	}

	rc = ioctl(disk_fd, DIOCGSECTORSIZE, &disk_sec_size);
	if (rc == -1) {
		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to obtain disk sector size - %d\n", errno);
		exit(errno);
	}

	per_thread_cap = disk_size / options.num_threads;
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: initialized device with capacity %zu bytes ~= %zu MB, sector %u bytes\n", disk_size, disk_size / 1024 / 1024, disk_sec_size);

	parse_pattern(options.pattern_spec, &options.read_pct, &options.addr_mode);
	dump_options();
	
	output_file.open(options.output_file, std::ofstream::out);
	if (!output_file) {
		ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to open output file %s\n", options.output_file);
		rc = EINVAL;
		goto end;
	}

	cur_core = cmask_get_next_cpu(&options.cpumask);
	while(cur_core != NEXT_CPU_NULL) {
		auto * ctx = new struct thread_context;
		memset(ctx, 0, sizeof(struct thread_context));

		if (ctx == NULL) {
			ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to alloc thread ctx.\n");
			exit(ENOMEM);
		}

		ctx->tid = tid++;
	
		ctx->sockid = nm_get_node_from_core(cur_core);
		ctx->coreid = cur_core;
		ctx->io_records = new std::list<struct io_record *>();
		ctx->start_region_length = per_thread_cap;
		ctx->start_region_offset = per_thread_cap * ctx->tid;
		ctx->disk_fd = disk_fd;

		// create sys thread
		pthread_attr_t attr;
		cpuset_t scpuset;
		CPU_ZERO(&scpuset);
		CPU_SET(cur_core, &scpuset);
		pthread_attr_init(&attr);
		pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &scpuset);
		rc = pthread_create(&ctx->sys_thread, &attr, worker_thread_main, ctx);
		if (rc != 0) {
			ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to create sys thread: %d\n", rc);
			rc = EINVAL;
			goto end;
		}
		worker_threads.push_back(ctx);
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: created worker thread %d on core %d socket %d offset 0x%lx length %ld\n", ctx->tid, cur_core, ctx->sockid, 
																																	ctx->start_region_offset, 
																																	ctx->start_region_length);

		cur_core = cmask_get_next_cpu(&options.cpumask);
	}

	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: waiting for worker thread init...\n");
	while(worker_thread_init_cnt.load() < options.num_threads) {
	}

	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: starting worker threads...\n");
	worker_start.store(1);

	/* main event loop */
	while(current_s < options.time) {
		if (current_s >= options.warmup && record_cutoff_time == 0) {
			record_cutoff_time = get_cur_ts_nano();
		}
		usleep(1 * S2US);
		current_s++;
	}

	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: stopping worker threads...\n");
	worker_stop.store(1);

	while(worker_thread_stop_cnt.load() < options.num_threads) {
	}

	// keep stats
	for (struct thread_context * tctx : worker_threads) {
		uint64_t last_ts = 0;
		uint64_t processed = 0;
		for (struct io_record * r : *tctx->io_records) {
			if (r->start_ts >= record_cutoff_time) {
				if (r->end_ts > last_ts) {
					last_ts = r->end_ts;
				}

				processed++;
				output_file << r->end_ts - r->start_ts << std::endl;
				total_reqs++;
			}
		}

		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: thread %d processed requests: %lu, last request %lu. Overhead - avg %lu min %lu max %lu\n", 
											tctx->tid, processed, last_ts, tctx->overhead_avg, tctx->overhead_min, tctx->overhead_max);
	}

	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: total requests: %u, bytes per second: %lu\n", 
						total_reqs, total_reqs * options.req_size / (options.time - options.warmup));

end:
	if (disk_fd != -1) {
		close(disk_fd);
	}

	output_file.close();

	for (struct thread_context * tctx : worker_threads) {
		for (struct io_record * r : *tctx->io_records) {
			delete r;
		}
		delete tctx->io_records;
		delete tctx;
	}

	return;
}

/*
 * Program entry point: parse the command line into the global `options`,
 * initialize the nm layer and run the benchmark.
 *
 * Returns 0 after the benchmark ran and EINVAL when -a selects no CPU.
 * -h prints the help and exits with 0; an unknown flag prints the help and
 * exits with EINVAL.
 */
int
main(int argc, char **argv)
{
	ntr_init();
	ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_INFO);

	/*
	 * Copy a string option into its fixed-size buffer. strncpy() does not
	 * terminate the destination when the source fills it completely, so
	 * the last byte is always forced to NUL; over-long values are truncated.
	 */
	auto copy_opt = [](char * dst, const char * src, unsigned long cap) {
		strncpy(dst, src, cap - 1);
		dst[cap - 1] = '\0';
	};

	int c;
	/* 'h' must be listed here, otherwise getopt() reports it as unknown */
	while (( c = getopt(argc, argv, "hVD:k:a:b:q:Q:P:I:t:w:o:")) != -1)
	{
		switch (c) {
			case 'V':
				/* each -V raises the log level by one step */
				/* NOTE(review): options.verbosity is not changed by -V; confirm nm_init() should get the default */
				ntr_set_level(NTR_DEP_USER1,
					ntr_get_level(NTR_DEP_USER1) + 1);
				break;
			case 'D':
				copy_opt(options.dev_name, optarg, MAX_DEV_NAME_LEN);
				break;
			case 'k':
				copy_opt(options.driver_name, optarg, MAX_DEV_NAME_LEN);
				break;
			case 'a':
				/* the mask is hexadecimal; the thread count follows from it */
				options.cpumask = strtoull(optarg, nullptr, 16);
				options.num_threads = cmask_get_num_cpus(
					options.cpumask);

				if (options.num_threads == 0) {
					fprintf(stderr,
						"must run at least one thread\n");
					return EINVAL;
				}
				break;
			case 'b':
				options.req_size = strtoull(
					optarg, nullptr, 10);
				break;
			case 'q':
				options.rps = strtoull(
					optarg, nullptr, 10);
				break;
			case 'Q':
				options.queue_depth = strtoull(
					optarg, nullptr, 10);
				break;
			case 'P':
				copy_opt(options.pattern_spec, optarg, MAX_SPEC_LEN);
				break;
			case 'I':
				copy_opt(options.ia_spec, optarg, MAX_SPEC_LEN);
				break;
			case 't':
				options.time = strtoull(
					optarg, nullptr, 10);
				break;
			case 'w':
				options.warmup = strtoull(
					optarg, nullptr, 10);
				break;
			case 'o':
				copy_opt(options.output_file, optarg, MAX_OUTPUT_FILE_LEN);
				break;
			case 'h':
				usage();
				exit(0);
			default:
				usage();
				exit(EINVAL);
		}
	}

	nm_init(options.verbosity);
	birb_main();

	return 0;
}