nvme/perf: add software-based latency tracking

The latency tracking is done with ranges of bucket arrays.
The bucket for any given I/O is determined solely by its TSC
delta - any translation to microseconds is only done after
the test is finished and statistics are printed.

Each range has a number of buckets determined by a
NUM_BUCKETS_PER_RANGE value which is currently set to 128.
The buckets in ranges 0 and 1 each map to one specific TSC
delta.  The buckets in subsequent ranges each map to twice
as many TSC deltas as buckets in the previous range:

Range 0:  1 TSC each - 128 buckets cover deltas    0 to  127
Range 1:  1 TSC each - 128 buckets cover deltas  128 to  255
Range 2:  2 TSC each - 128 buckets cover deltas  256 to  511
Range 3:  4 TSC each - 128 buckets cover deltas  512 to 1023
Range 4:  8 TSC each - 128 buckets cover deltas 1024 to 2047
Range 5: 16 TSC each - 128 buckets cover deltas 2048 to 4095
etc.
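
As an illustration of this mapping, below is a minimal standalone sketch
(not part of the patch itself): it recomputes the range, bucket index,
bucket width, and the bucket's upper bound in microseconds for a few
example TSC deltas. The helper name map_tsc_delta, the 2.3 GHz TSC rate,
and the sample deltas are assumptions chosen for this example;
__builtin_clzll assumes GCC/Clang, as the patch itself does.

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define BUCKET_SHIFT          7                         /* 128 buckets per range */
#define NUM_BUCKETS_PER_RANGE (1ULL << BUCKET_SHIFT)
#define BUCKET_MASK           (NUM_BUCKETS_PER_RANGE - 1)

/* Map a TSC delta to its (range, bucket) pair and print the bucket's
 * width and upper bound, converted to microseconds at tsc_rate.
 */
static void
map_tsc_delta(uint64_t tsc, uint64_t tsc_rate)
{
	uint32_t clz, range, shift, index;
	uint64_t width, range_base, upper_tsc;

	assert(tsc != 0);

	/* Range = number of significant bits beyond BUCKET_SHIFT, floored at 0. */
	clz = __builtin_clzll(tsc);
	range = (clz <= 64 - BUCKET_SHIFT) ? (64 - BUCKET_SHIFT) - clz : 0;

	/* Ranges 0 and 1 use 1-TSC buckets; range r >= 2 uses 2^(r-1)-TSC buckets. */
	shift = (range > 0) ? range - 1 : 0;
	width = 1ULL << shift;
	index = (tsc >> shift) & BUCKET_MASK;

	/* Smallest delta covered by this range, then this bucket's upper bound. */
	range_base = (range > 0) ? 1ULL << (range + BUCKET_SHIFT - 1) : 0;
	upper_tsc = range_base + ((uint64_t)(index + 1) << shift);

	printf("delta %6" PRIu64 " TSC -> range %2u bucket %3u "
	       "(width %3" PRIu64 " TSC, upper bound %8.3f us)\n",
	       tsc, range, index, width,
	       (double)upper_tsc * 1000 * 1000 / tsc_rate);
}

int
main(void)
{
	const uint64_t tsc_rate = 2300000000ULL;	/* assumed 2.3 GHz TSC */

	map_tsc_delta(100, tsc_rate);	/* range 0: 1 TSC per bucket */
	map_tsc_delta(200, tsc_rate);	/* range 1: 1 TSC per bucket */
	map_tsc_delta(23000, tsc_rate);	/* ~10 us: range 8, 128 TSC (~55 ns) buckets */
	return 0;
}

The upper-bound arithmetic here mirrors what get_us_from_bucket() in the
patch computes when the per-bucket counts are printed.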

While here, change some variable names and usage
messages to differentiate between the existing latency
tracking via vendor-specific NVMe log pages on Intel
NVMe SSDs, and the newly added latency tracking done
in software.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I299f1c1f6dbfa7ea0e73085f7a685e71fc687a2b
Jim Harris 2017-05-15 15:57:26 -07:00
parent a591161cb2
commit 8de75f8107
2 changed files with 144 additions and 9 deletions


@@ -81,6 +81,40 @@ struct ns_entry {
 	char name[1024];
 };
 
+/*
+ * Latency tracking is done with ranges of bucket arrays. The bucket
+ * for any given I/O is determined solely by the TSC delta - any
+ * translation to microseconds is only done after the test is finished
+ * and statistics are printed.
+ *
+ * Each range has a number of buckets determined by NUM_BUCKETS_PER_RANGE
+ * which is 128. The buckets in ranges 0 and 1 each map to one specific
+ * TSC delta. The buckets in subsequent ranges each map to twice as many
+ * TSC deltas as buckets in the range before it:
+ *
+ * Range 0:  1 TSC each - 128 buckets cover 0 to 127 (2^7-1)
+ * Range 1:  1 TSC each - 128 buckets cover 128 to 255 (2^8-1)
+ * Range 2:  2 TSC each - 128 buckets cover 256 to 511 (2^9-1)
+ * Range 3:  4 TSC each - 128 buckets cover 512 to 1023 (2^10-1)
+ * Range 4:  8 TSC each - 128 buckets cover 1024 to 2047 (2^11-1)
+ * Range 5: 16 TSC each - 128 buckets cover 2048 to 4095 (2^12-1)
+ * ...
+ * Range 55: 2^54 TSC each - 128 buckets cover 2^61 to 2^62-1
+ * Range 56: 2^55 TSC each - 128 buckets cover 2^62 to 2^63-1
+ * Range 57: 2^56 TSC each - 128 buckets cover 2^63 to 2^64-1
+ *
+ * On a 2.3GHz processor, this strategy results in 50ns buckets in the
+ * 7-14us range (sweet spot for Intel Optane SSD latency testing).
+ *
+ * Buckets can be made more granular by increasing BUCKET_SHIFT. This
+ * comes at the cost of additional storage per namespace context to
+ * store the bucket data.
+ */
+#define BUCKET_SHIFT 7
+#define NUM_BUCKETS_PER_RANGE (1ULL << BUCKET_SHIFT)
+#define BUCKET_MASK (NUM_BUCKETS_PER_RANGE - 1)
+#define NUM_BUCKET_RANGES (64 - BUCKET_SHIFT + 1)
+
 struct ns_worker_ctx {
 	struct ns_entry *entry;
 	uint64_t io_completed;
@@ -105,6 +139,8 @@ struct ns_worker_ctx {
 	} u;
 
 	struct ns_worker_ctx *next;
+
+	uint64_t bucket[NUM_BUCKET_RANGES][NUM_BUCKETS_PER_RANGE];
 };
 
 struct perf_task {
@@ -124,7 +160,8 @@ struct worker_thread {
 static int g_outstanding_commands;
 
-static bool g_latency_tracking_enable = false;
+static bool g_latency_ssd_tracking_enable = false;
+static int g_latency_sw_tracking_level = 0;
 
 static struct rte_mempool *task_pool;
@@ -160,6 +197,63 @@ static int g_aio_optind; /* Index of first AIO filename in argv */
 static void
 task_complete(struct perf_task *task);
 
+static uint32_t
+get_bucket_range(uint64_t tsc)
+{
+	uint32_t clz, range;
+
+	assert(tsc != 0);
+
+	clz = __builtin_clzll(tsc);
+
+	if (clz <= NUM_BUCKET_RANGES - 1) {
+		range = (NUM_BUCKET_RANGES - 1) - clz;
+	} else {
+		range = 0;
+	}
+
+	return range;
+}
+
+static uint32_t
+get_bucket_index(uint64_t tsc, uint32_t range)
+{
+	uint32_t shift;
+
+	if (range == 0) {
+		shift = 0;
+	} else {
+		shift = range - 1;
+	}
+
+	return (tsc >> shift) & BUCKET_MASK;
+}
+
+static double
+get_us_from_bucket(uint32_t range, uint32_t index)
+{
+	uint64_t tsc;
+
+	index += 1;
+	if (range > 0) {
+		tsc = 1ULL << (range + BUCKET_SHIFT - 1);
+		tsc += (uint64_t)index << (range - 1);
+	} else {
+		tsc = index;
+	}
+
+	return (double)tsc * 1000 * 1000 / g_tsc_rate;
+}
+
+static void
+track_latency(struct ns_worker_ctx *ns_ctx, uint64_t tsc)
+{
+	uint32_t range = get_bucket_range(tsc);
+	uint32_t index = get_bucket_index(tsc, range);
+
+	ns_ctx->bucket[range][index]++;
+}
+
 static void
 register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns)
 {
@@ -277,7 +371,7 @@ register_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
 	entry->next = g_controllers;
 	g_controllers = entry;
 
-	if (g_latency_tracking_enable &&
+	if (g_latency_ssd_tracking_enable &&
 	    spdk_nvme_ctrlr_is_feature_supported(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING))
 		set_latency_tracking_feature(ctrlr, true);
@@ -494,7 +588,9 @@ task_complete(task)
 	if (ns_ctx->max_tsc < tsc_diff) {
 		ns_ctx->max_tsc = tsc_diff;
 	}
+	if (g_latency_sw_tracking_level > 0) {
+		track_latency(ns_ctx, tsc_diff);
+	}
 
 	rte_mempool_put(task_pool, task);
 
 	/*
@@ -655,7 +751,8 @@ static void usage(char *program_name)
 	printf("\t[-w io pattern type, must be one of\n");
 	printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n");
 	printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n");
-	printf("\t[-l enable latency tracking, default: disabled]\n");
+	printf("\t[-L enable latency tracking via sw, default: disabled]\n");
+	printf("\t[-l enable latency tracking via ssd (if supported), default: disabled]\n");
 	printf("\t[-t time in seconds]\n");
 	printf("\t[-c core mask for I/O submission/completion.]\n");
 	printf("\t\t(default: 1)]\n");
@@ -731,6 +828,41 @@ print_performance(void)
 	       sum_ave_latency / ns_count, sum_min_latency / ns_count,
 	       sum_max_latency / ns_count);
 	printf("\n");
+
+	if (g_latency_sw_tracking_level == 0) {
+		return;
+	}
+
+	worker = g_workers;
+	while (worker) {
+		ns_ctx = worker->ns_ctx;
+		while (ns_ctx) {
+			uint64_t i, j, so_far = 0;
+			float so_far_pct = 0;
+			double last_bucket, bucket = 0;
+
+			printf("Latency data for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore);
+			printf("=========================================================================\n");
+			printf("       Range in us     Cumulative    IO count\n");
+
+			for (i = 0; i < NUM_BUCKET_RANGES; i++) {
+				for (j = 0; j < NUM_BUCKETS_PER_RANGE; j++) {
+					so_far += ns_ctx->bucket[i][j];
+					so_far_pct = (float)so_far * 100 / total_io_completed;
+					last_bucket = bucket;
+					bucket = get_us_from_bucket(i, j);
+					if (ns_ctx->bucket[i][j] == 0) {
+						continue;
+					}
+					printf("%9.3f - %9.3f: %9.4f%% (%9ju)\n",
+					       last_bucket, bucket, so_far_pct, ns_ctx->bucket[i][j]);
+				}
+			}
+			printf("\n");
+			ns_ctx = ns_ctx->next;
+		}
+		worker = worker->next;
+	}
 }
 
 static void
@@ -804,7 +936,7 @@ static void
 print_stats(void)
 {
 	print_performance();
-	if (g_latency_tracking_enable) {
+	if (g_latency_ssd_tracking_enable) {
 		if (g_rw_percentage != 0) {
 			print_latency_statistics("Read", SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY);
 		}
@@ -866,7 +998,7 @@ parse_args(int argc, char **argv)
 	g_core_mask = NULL;
 	g_max_completions = 0;
 
-	while ((op = getopt(argc, argv, "c:d:i:lm:q:r:s:t:w:M:")) != -1) {
+	while ((op = getopt(argc, argv, "c:d:i:lm:q:r:s:t:w:LM:")) != -1) {
 		switch (op) {
 		case 'c':
 			g_core_mask = optarg;
@@ -878,7 +1010,7 @@
 			g_shm_id = atoi(optarg);
 			break;
 		case 'l':
-			g_latency_tracking_enable = true;
+			g_latency_ssd_tracking_enable = true;
 			break;
 		case 'm':
 			g_max_completions = atoi(optarg);
@@ -901,6 +1033,9 @@
 		case 'w':
 			workload_type = optarg;
 			break;
+		case 'L':
+			g_latency_sw_tracking_level++;
+			break;
 		case 'M':
 			g_rw_percentage = atoi(optarg);
 			mix_specified = true;
@@ -1126,7 +1261,7 @@ unregister_controllers(void)
 	while (entry) {
 		struct ctrlr_entry *next = entry->next;
 		spdk_free(entry->latency_page);
-		if (g_latency_tracking_enable &&
+		if (g_latency_ssd_tracking_enable &&
 		    spdk_nvme_ctrlr_is_feature_supported(entry->ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING))
 			set_latency_tracking_feature(entry->ctrlr, false);
 		spdk_nvme_detach(entry->ctrlr);


@@ -30,7 +30,7 @@ done
 timing_exit identify
 
 timing_enter perf
-$rootdir/examples/nvme/perf/perf -q 128 -w read -s 12288 -t 1
+$rootdir/examples/nvme/perf/perf -q 128 -w read -s 12288 -t 1 -L
 timing_exit perf
 
 timing_enter reserve