diff --git a/examples/ioat/Makefile b/examples/ioat/Makefile
index 17c41aac28..cc105f3197 100644
--- a/examples/ioat/Makefile
+++ b/examples/ioat/Makefile
@@ -34,7 +34,7 @@ SPDK_ROOT_DIR := $(CURDIR)/../..
 include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
 
-DIRS-y += perf verify
+DIRS-y += perf verify kperf
 
 .PHONY: all clean $(DIRS-y)
diff --git a/examples/ioat/kperf/.gitignore b/examples/ioat/kperf/.gitignore
new file mode 100644
index 0000000000..bc25a0bd3e
--- /dev/null
+++ b/examples/ioat/kperf/.gitignore
@@ -0,0 +1 @@
+ioat_kperf
diff --git a/examples/ioat/kperf/Makefile b/examples/ioat/kperf/Makefile
new file mode 100644
index 0000000000..a4cfdfa6f2
--- /dev/null
+++ b/examples/ioat/kperf/Makefile
@@ -0,0 +1,53 @@
+#
+# BSD LICENSE
+#
+# Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(CURDIR)/../../..
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+APP = ioat_kperf
+
+C_SRCS := ioat_kperf.c
+
+CFLAGS += -I.
+
+LIBS += -lrt
+
+all: $(APP)
+
+$(APP): $(OBJS)
+	$(LINK_C)
+
+clean:
+	$(CLEAN_C) $(APP)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.deps.mk
diff --git a/examples/ioat/kperf/README b/examples/ioat/kperf/README
new file mode 100644
index 0000000000..61bb8e153d
--- /dev/null
+++ b/examples/ioat/kperf/README
@@ -0,0 +1,42 @@
+IOAT Kernel Driver Test Tool
+============================
+
+This tool measures the performance of the IOAT kernel driver so that it
+can be compared with the SPDK user space IOAT driver. It consists of two
+components: a kernel test module and a user space application. The kernel
+test module allocates one kernel thread per DMA channel; the threads are
+not pinned to specific CPU cores, but each one is created on the same
+NUMA socket as its DMA channel. The user space application controls the
+kernel test module through a debugfs interface.
+
+Building & Usage
+================
+
+1. Compile and load the kernel test module first.
+
+   modprobe -v ioatdma
+   cd kmod && make && insmod dmaperf.ko
+
+2. Run the test application.
+
+   Parameters:
+   [-h usage]
+   [-n number of DMA channels]
+   [-q queue depth, per DMA channel]
+   [-s transfer size per descriptor, given as 2^n bytes]
+   [-t total data to transfer per DMA channel, given as 2^n bytes]
+
+   For example: ./ioat_kperf -n 4 -q 128 -s 12 -t 32
+   requests 4 channels at queue depth 128, with 2^12 = 4096 byte
+   descriptors and 2^32 bytes = 4 GB of data per channel:
+
+   Total 4 Channels, Queue_Depth 128, Transfer Size 4096 Bytes, Total Transfer Size 4 GB
+   Running I/O . . . .
+   Channel 0 Performance Data 1414 MB/s
+   Channel 1 Performance Data 1413 MB/s
+   Channel 2 Performance Data 1413 MB/s
+   Channel 3 Performance Data 1415 MB/s
+
+OS Support
+==========
+We have tested several Linux distributions; currently Fedora 21/22 with
+kernel version >= 3.17 is supported.
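+
+Manual Operation
+================
+The test application is only a thin wrapper around the kernel module's
+debugfs files, so the module can also be driven by hand. A minimal
+sketch, assuming debugfs is mounted at /sys/kernel/debug:
+
+   cd /sys/kernel/debug/dmaperf/dmaperf
+   echo 4 > threads            # number of DMA channels
+   echo 1 > run                # start the test (a write toggles the run state)
+   cat status                  # reads "running" until the channels go idle
+   cat thread_0/copied         # bytes copied by channel 0
+   cat thread_0/elapsed_time   # run time in microseconds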
diff --git a/examples/ioat/kperf/ioat_kperf.c b/examples/ioat/kperf/ioat_kperf.c
new file mode 100644
index 0000000000..335adf41f5
--- /dev/null
+++ b/examples/ioat/kperf/ioat_kperf.c
@@ -0,0 +1,320 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <assert.h>
+
+static int
+check_modules(char *driver_name)
+{
+	FILE *fd;
+	const char *proc_modules = "/proc/modules";
+	char buffer[256];
+
+	fd = fopen(proc_modules, "r");
+	if (!fd)
+		return -1;
+
+	while (fgets(buffer, sizeof(buffer), fd)) {
+		if (strstr(buffer, driver_name) != NULL) {
+			fclose(fd);
+			return 0;
+		}
+	}
+	fclose(fd);
+
+	return -1;
+}
+
+static int
+get_u32_from_file(const char *sysfs_file, uint32_t *value)
+{
+	FILE *f;
+	char buf[BUFSIZ];
+
+	f = fopen(sysfs_file, "r");
+	if (f == NULL) {
+		return -1;
+	}
+
+	if (fgets(buf, sizeof(buf), f) != NULL) {
+		*value = strtoul(buf, NULL, 10);
+	}
+
+	fclose(f);
+
+	return 0;
+}
+
+static int
+get_str_from_file(const char *sysfs_file, char *buf, int len)
+{
+	FILE *f;
+
+	f = fopen(sysfs_file, "r");
+	if (f == NULL) {
+		return -1;
+	}
+
+	if (fgets(buf, len, f) != NULL) {
+		fclose(f);
+		return 0;
+	}
+
+	fclose(f);
+	return -1;
+}
+
+static int
+put_u32_to_file(const char *sysfs_file, uint32_t value)
+{
+	FILE *f;
+	int n;
+	char buf[BUFSIZ];
+
+	f = fopen(sysfs_file, "w");
+	if (f == NULL) {
+		return -1;
+	}
+
+	n = snprintf(buf, sizeof(buf), "%u", value);
+	if ((n < 0) || (n >= (int)sizeof(buf))) {
+		fclose(f);
+		return -1;
+	}
+
+	if (fwrite(buf, n, 1, f) == 0) {
+		fclose(f);
+		return -1;
+	}
+
+	fclose(f);
+	return 0;
+}
+
+static int
+get_u64_from_file(const char *sysfs_file, uint64_t *value)
+{
+	FILE *f;
+	char buf[BUFSIZ];
+
+	f = fopen(sysfs_file, "r");
+	if (f == NULL) {
+		return -1;
+	}
+
+	if (fgets(buf, sizeof(buf), f) != NULL) {
+		*value = strtoull(buf, NULL, 10);
+	}
+
+	fclose(f);
+
+	return 0;
+}
+
+static void
+usage(char *program_name)
+{
+	printf("%s options\n", program_name);
+	printf("\t[-h usage]\n");
+	printf("\t[-n number of DMA channels]\n");
+	printf("\t[-q queue depth, per DMA channel]\n");
+	printf("\t[-s transfer size per descriptor, 2^n bytes]\n");
+	printf("\t[-t total data to transfer per DMA channel, 2^n bytes]\n");
+}
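+
+/*
+ * All knobs live under /sys/kernel/debug/dmaperf/dmaperf: main() writes
+ * the configuration (threads, queue_depth, transfer_size_order,
+ * total_size_order), writes "run" to start the channels, polls "status"
+ * until every channel is idle, then reads each thread_N directory's
+ * "copied" (bytes) and "elapsed_time" (microseconds) to report MB/s.
+ */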
+int main(int argc, char *argv[])
+{
+	int op;
+	int rc;
+	char buf[BUFSIZ];
+	uint32_t i, threads = 0;
+	uint32_t ring_size, queue_depth = 0;
+	uint32_t transfer_size, order = 0;
+	uint64_t total_size, copied = 0;
+	uint64_t elapsed_time = 0;
+	char channel[1024];
+
+	if (check_modules("ioatdma")) {
+		fprintf(stderr, "ioatdma driver not loaded,"
+			" run `modprobe -v ioatdma` first\n");
+		return -1;
+	}
+	if (check_modules("dmaperf")) {
+		fprintf(stderr, "Kernel Ioat test driver not loaded,"
+			" run `insmod dmaperf.ko` in the kmod directory\n");
+		return -1;
+	}
+
+	rc = get_u32_from_file("/sys/module/ioatdma/parameters/ioat_ring_alloc_order",
+			       &order);
+	if (rc < 0) {
+		fprintf(stderr, "Cannot get default ioat queue depth\n");
+		return -1;
+	}
+	ring_size = 1UL << order;
+
+	while ((op = getopt(argc, argv, "hn:q:s:t:")) != -1) {
+		switch (op) {
+		case 'n':
+			threads = atoi(optarg);
+			rc = put_u32_to_file("/sys/kernel/debug/dmaperf/dmaperf/threads", threads);
+			if (rc < 0) {
+				fprintf(stderr, "Cannot set dma channels\n");
+				return -1;
+			}
+			break;
+		case 'q':
+			queue_depth = atoi(optarg);
+			if (queue_depth > ring_size) {
+				fprintf(stderr, "Max Ioat DMA ring size %u\n", ring_size);
+				return -1;
+			}
+			rc = put_u32_to_file("/sys/kernel/debug/dmaperf/dmaperf/queue_depth", queue_depth);
+			if (rc < 0) {
+				fprintf(stderr, "Cannot set queue depth\n");
+				return -1;
+			}
+			break;
+		case 's':
+			order = atoi(optarg);
+			rc = put_u32_to_file("/sys/kernel/debug/dmaperf/dmaperf/transfer_size_order", order);
+			if (rc < 0) {
+				fprintf(stderr, "Cannot set descriptor transfer size order\n");
+				return -1;
+			}
+			break;
+		case 't':
+			order = atoi(optarg);
+			rc = put_u32_to_file("/sys/kernel/debug/dmaperf/dmaperf/total_size_order", order);
+			if (rc < 0) {
+				fprintf(stderr, "Cannot set channel total transfer size order\n");
+				return -1;
+			}
+			break;
+		case 'h':
+			usage(argv[0]);
+			exit(0);
+		default:
+			usage(argv[0]);
+			exit(1);
+		}
+	}
+
+	/* get driver configuration */
+	rc = get_u32_from_file("/sys/kernel/debug/dmaperf/dmaperf/transfer_size_order",
+			       &order);
+	if (rc < 0) {
+		fprintf(stderr, "Cannot get channel descriptor transfer size\n");
+		return -1;
+	}
+	transfer_size = 1UL << order;
+
+	rc = get_u32_from_file("/sys/kernel/debug/dmaperf/dmaperf/total_size_order",
+			       &order);
+	if (rc < 0) {
+		fprintf(stderr, "Cannot get channel total transfer size\n");
+		return -1;
+	}
+	total_size = 1ULL << order;
+
+	rc = get_u32_from_file("/sys/kernel/debug/dmaperf/dmaperf/threads",
+			       &threads);
+	if (rc < 0) {
+		fprintf(stderr, "Cannot get dma channel threads\n");
+		return -1;
+	}
+
+	rc = get_u32_from_file("/sys/kernel/debug/dmaperf/dmaperf/queue_depth",
+			       &queue_depth);
+	if (rc < 0) {
+		fprintf(stderr, "Cannot get queue depth\n");
+		return -1;
+	}
+
+	fprintf(stdout,
+		"Total %u Channels, Queue_Depth %u, Transfer Size %u Bytes, Total Transfer Size %"PRIu64" GB\n",
+		threads, queue_depth, transfer_size, total_size >> 30ULL);
+
+	/* run the channels */
+	rc = put_u32_to_file("/sys/kernel/debug/dmaperf/dmaperf/run", 1);
+	if (rc < 0) {
+		fprintf(stderr, "Cannot run the channels\n");
+		return -1;
+	}
+
+	fprintf(stdout, "Running I/O ");
+	fflush(stdout);
+	/* wait for all channels to become idle */
+	while (!get_str_from_file("/sys/kernel/debug/dmaperf/dmaperf/status", buf, BUFSIZ)) {
+		if (strstr(buf, "idle") != NULL) {
+			fprintf(stdout, "\n");
+			fflush(stdout);
+			sleep(1);
+			break;
+		}
+		fprintf(stdout, ". ");
+		fflush(stdout);
+		sleep(1);
+	}
+
+	/* collect per-channel performance data */
+	for (i = 0; i < threads; i++) {
+		/* total data transfer length for the DMA channel, in bytes */
+		sprintf(channel, "/sys/kernel/debug/dmaperf/dmaperf/thread_%u/copied", i);
+		rc = get_u64_from_file(channel, &copied);
+		if (rc < 0) {
+			fprintf(stderr, "Cannot get channel copied bytes\n");
+			return -1;
+		}
+		/* time in microseconds for the total data transfer */
+		sprintf(channel, "/sys/kernel/debug/dmaperf/dmaperf/thread_%u/elapsed_time", i);
+		rc = get_u64_from_file(channel, &elapsed_time);
+		if (rc < 0) {
+			fprintf(stderr, "Cannot get channel elapsed time\n");
+			return -1;
+		}
+		assert(elapsed_time != 0);
+		/* bytes / microseconds ~= MB/s */
+		fprintf(stdout, "Channel %u Performance Data %"PRIu64" MB/s\n",
+			i, copied / elapsed_time);
+	}
+
+	return 0;
+}
diff --git a/examples/ioat/kperf/kmod/Makefile b/examples/ioat/kperf/kmod/Makefile
new file mode 100644
index 0000000000..3f03823233
--- /dev/null
+++ b/examples/ioat/kperf/kmod/Makefile
@@ -0,0 +1,42 @@
+#
+# BSD LICENSE
+#
+# Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+obj-m := dmaperf.o
+dmaperf-y := dma_perf.o
+
+KDIR := /lib/modules/$(shell uname -r)/build
+
+all:
+	$(MAKE) -C $(KDIR) M=$(shell pwd) modules
+clean:
+	$(MAKE) -C $(KDIR) M=$(shell pwd) clean
diff --git a/examples/ioat/kperf/kmod/dma_perf.c b/examples/ioat/kperf/kmod/dma_perf.c
new file mode 100644
index 0000000000..12937405cd
--- /dev/null
+++ b/examples/ioat/kperf/kmod/dma_perf.c
@@ -0,0 +1,669 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * PCIe DMA Perf Linux driver
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/ktime.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/dmaengine.h>
+#include <linux/dma-mapping.h>
+#include <linux/nodemask.h>
+
+#define DRIVER_NAME		"dma_perf"
+#define DRIVER_DESCRIPTION	"PCIe DMA Performance Measurement Tool"
+
+#define DRIVER_LICENSE		"Dual BSD/GPL"
+#define DRIVER_VERSION		"1.0"
+#define DRIVER_AUTHOR		"Dave Jiang <dave.jiang@intel.com>"
+
+#define MAX_THREADS		32
+#define MAX_TEST_SIZE		(1024 * 1024) /* 1M */
+#define DMA_CHANNELS_PER_NODE	8
+
+MODULE_LICENSE(DRIVER_LICENSE);
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_AUTHOR("Changpeng Liu <changpeng.liu@intel.com>");
+MODULE_DESCRIPTION(DRIVER_DESCRIPTION);
+
+static struct dentry *perf_debugfs_dir;
+static struct perf_ctx *g_perf = NULL;
+
+static unsigned int seg_order = 12; /* 4K */
+static unsigned int queue_depth = 256;
+static unsigned int run_order = 32; /* 4G */
+
+struct perf_mw {
+	size_t buf_size;
+	void *virt_addr;
+};
+
+struct perf_ctx;
+
+struct pthr_ctx {
+	struct dentry *debugfs_thr_dir;
+	struct dentry *debugfs_copied;
+	struct dentry *debugfs_elapsed_time;
+	struct device *dev;
+	int node;
+	wait_queue_head_t wq;
+	struct perf_mw mw;
+	struct task_struct *thread;
+	struct perf_ctx *perf;
+	atomic_t dma_sync;
+	struct dma_chan *dma_chan;
+	int dma_up;
+	int dma_down;
+	int dma_prep_err;
+	u64 copied;
+	u64 elapsed_time;
+};
+
+struct perf_ctx {
+	spinlock_t db_lock;
+	struct dentry *debugfs_node_dir;
+	struct dentry *debugfs_run;
+	struct dentry *debugfs_threads;
+	struct dentry *debugfs_queue_depth;
+	struct dentry *debugfs_transfer_size_order;
+	struct dentry *debugfs_total_size_order;
+	struct dentry *debugfs_status;
+	u8 numa_nodes;
+	u8 perf_threads;
+	bool run;
+	struct pthr_ctx pthr_ctx[MAX_THREADS];
+	atomic_t tsync;
+};
+
+static void perf_free_mw(struct pthr_ctx *pctx);
+static int perf_set_mw(struct pthr_ctx *pctx, size_t size);
+
+static void perf_copy_callback(void *data)
+{
+	struct pthr_ctx *pctx = data;
+
+	atomic_dec(&pctx->dma_sync);
+	pctx->dma_down++;
+
+	wake_up(&pctx->wq);
+}
+
+static ssize_t perf_copy(struct pthr_ctx *pctx, char *dst,
+			 char *src, size_t size)
+{
+	struct dma_async_tx_descriptor *txd;
+	struct dma_chan *chan = pctx->dma_chan;
+	struct dma_device *device;
+	struct dmaengine_unmap_data *unmap;
+	dma_cookie_t cookie;
+	size_t src_off, dst_off;
+	int retries = 0;
+
+	if (!chan) {
+		pr_err("DMA engine does not exist\n");
+		return -EINVAL;
+	}
+
+	device = chan->device;
+	src_off = (size_t)src & ~PAGE_MASK;
+	dst_off = (size_t)dst & ~PAGE_MASK;
+
+	if (!is_dma_copy_aligned(device, src_off, dst_off, size))
+		return -ENODEV;
+
+	unmap = dmaengine_get_unmap_data(device->dev, 2, GFP_NOWAIT);
+	if (!unmap)
+		return -ENOMEM;
+
+	unmap->len = size;
+	unmap->addr[0] = dma_map_page(device->dev, virt_to_page(src),
+				      src_off, size, DMA_TO_DEVICE);
+	if (dma_mapping_error(device->dev, unmap->addr[0]))
+		goto err_get_unmap;
+
+	unmap->to_cnt = 1;
+
+	unmap->addr[1] = dma_map_page(device->dev, virt_to_page(dst),
+				      dst_off, size, DMA_FROM_DEVICE);
+	if (dma_mapping_error(device->dev, unmap->addr[1]))
+		goto err_get_unmap;
+	unmap->from_cnt = 1;
+
+dma_prep_retry:
+	txd = device->device_prep_dma_memcpy(chan, unmap->addr[1],
+					     unmap->addr[0],
+					     size, DMA_PREP_INTERRUPT);
+	if (!txd) {
+		if (retries++ > 20) {
+			pctx->dma_prep_err++;
+			goto err_get_unmap;
+		} else {
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(50);
+			goto dma_prep_retry;
+		}
+	}
+
+	txd->callback = perf_copy_callback;
+	txd->callback_param = pctx;
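+	/*
+	 * Tie the unmap data to the descriptor so the DMA mappings are
+	 * released automatically once the engine completes this copy;
+	 * perf_copy_callback() then drops dma_sync and wakes the submitter.
+	 */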
+	dma_set_unmap(txd, unmap);
+
+	cookie = dmaengine_submit(txd);
+	if (dma_submit_error(cookie))
+		goto err_set_unmap;
+
+	/* drop our reference; the descriptor holds its own until completion */
+	dmaengine_unmap_put(unmap);
+
+	atomic_inc(&pctx->dma_sync);
+
+	pctx->dma_up++;
+	dma_async_issue_pending(chan);
+
+	return size;
+
+err_set_unmap:
+	dmaengine_unmap_put(unmap);
+err_get_unmap:
+	dmaengine_unmap_put(unmap);
+	return 0;
+}
+
+static int perf_move_data(struct pthr_ctx *pctx, char *dst, char *src,
+			  u64 buf_size, u64 win_size, u64 total)
+{
+	int chunks, total_chunks, i;
+	int copied_chunks = 0;
+	u64 result;
+	char *tmp = dst;
+	u64 perf, diff_us;
+	ktime_t kstart, kstop, kdiff;
+
+	chunks = win_size / buf_size;
+	total_chunks = total / buf_size;
+
+	pr_info("%s: chunks: %d total_chunks: %d\n", current->comm, chunks, total_chunks);
+
+	kstart = ktime_get();
+
+	for (i = 0; i < total_chunks; i++) {
+		/* throttle submissions to the configured queue depth */
+		wait_event_interruptible(pctx->wq, atomic_read(&pctx->dma_sync) < queue_depth);
+
+		result = perf_copy(pctx, tmp, src, buf_size);
+		pctx->copied += result;
+		copied_chunks++;
+		if (copied_chunks == chunks) {
+			/* wrap around to the start of the destination window */
+			tmp = dst;
+			copied_chunks = 0;
+		} else {
+			tmp += buf_size;
+		}
+	}
+
+	pr_info("%s: All DMA descriptors submitted\n", current->comm);
+
+	/* FIXME: need a timeout here eventually */
+	while (atomic_read(&pctx->dma_sync) != 0)
+		msleep(1);
+
+	pr_info("%s: dma_up: %d dma_down: %d dma_prep_err: %d\n",
+		current->comm, pctx->dma_up, pctx->dma_down,
+		pctx->dma_prep_err);
+
+	kstop = ktime_get();
+	kdiff = ktime_sub(kstop, kstart);
+	diff_us = ktime_to_us(kdiff);
+
+	pr_info("%s: copied %llu bytes\n", current->comm, pctx->copied);
+
+	pr_info("%s: lasted %llu usecs\n", current->comm, diff_us);
+
+	perf = pctx->copied / diff_us;
+
+	pr_info("%s: MBytes/s: %llu\n", current->comm, perf);
+
+	pctx->elapsed_time = diff_us;
+
+	return 0;
+}
+
+static bool perf_dma_filter_fn(struct dma_chan *chan, void *node)
+{
+	return dev_to_node(&chan->dev->device) == (int)(unsigned long)node;
+}
+
+static int dma_perf_thread(void *data)
+{
+	struct pthr_ctx *pctx = data;
+	struct perf_ctx *perf = pctx->perf;
+	struct perf_mw *mw = &pctx->mw;
+	char *dst;
+	u64 win_size, buf_size, total;
+	void *src;
+	int rc, node;
+	struct dma_chan *dma_chan = NULL;
+
+	pr_info("kthread %s starting...\n", current->comm);
+
+	node = pctx->node;
+
+	if (!pctx->dma_chan) {
+		dma_cap_mask_t dma_mask;
+
+		dma_cap_zero(dma_mask);
+		dma_cap_set(DMA_MEMCPY, dma_mask);
+		/* request a memcpy-capable channel on this thread's NUMA node */
+		dma_chan = dma_request_channel(dma_mask, perf_dma_filter_fn,
+					       (void *)(unsigned long)node);
+		if (!dma_chan) {
+			pr_warn("%s: cannot acquire DMA channel, quitting\n",
+				current->comm);
+			return -ENODEV;
+		}
+		pctx->dma_chan = dma_chan;
+		pctx->dev = dma_chan->device->dev;
+	}
+
+	src = kmalloc_node(MAX_TEST_SIZE, GFP_KERNEL, node);
+	if (!src) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	rc = perf_set_mw(pctx, MAX_TEST_SIZE);
+	if (rc < 0) {
+		pr_err("%s: set mw failed\n", current->comm);
+		rc = -ENXIO;
+		goto err;
+	}
+
+	win_size = mw->buf_size;
+	buf_size = 1ULL << seg_order;
+	total = 1ULL << run_order;
+
+	if (buf_size > MAX_TEST_SIZE)
+		buf_size = MAX_TEST_SIZE;
+
+	dst = (char *)mw->virt_addr;
+
+	/* barrier: wait until every test thread is ready to start */
+	atomic_inc(&perf->tsync);
+	while (atomic_read(&perf->tsync) != perf->perf_threads)
+		schedule();
+
+	rc = perf_move_data(pctx, dst, src, buf_size, win_size, total);
+
+	atomic_dec(&perf->tsync);
+
+	if (rc < 0) {
+		pr_err("%s: failed\n", current->comm);
+		rc = -ENXIO;
+		goto err;
+	}
+
+	kfree(src);
+	return 0;
+
+err:
+	kfree(src);
+
+	if (dma_chan) {
+		dma_release_channel(dma_chan);
+		pctx->dma_chan = NULL;
+	}
+
+	return rc;
+}
+
+static void perf_free_mw(struct pthr_ctx *pctx)
+{
+	struct perf_mw *mw = &pctx->mw;
+
+	if (!mw->virt_addr)
+		return;
+
+	kfree(mw->virt_addr);
+	mw->buf_size = 0;
+	mw->virt_addr = NULL;
+}
+
+static int perf_set_mw(struct pthr_ctx *pctx, size_t size)
+{
+	struct perf_mw *mw = &pctx->mw;
+
+	if (!size)
+		return -EINVAL;
+
+	mw->buf_size = size;
+	mw->virt_addr = kmalloc_node(size, GFP_KERNEL, pctx->node);
+	if (!mw->virt_addr) {
+		mw->buf_size = 0;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static ssize_t debugfs_run_read(struct file *filp, char __user *ubuf,
+				size_t count, loff_t *offp)
+{
+	struct perf_ctx *perf = filp->private_data;
+	char *buf;
+	ssize_t ret, out_offset;
+
+	if (!perf)
+		return 0;
+
+	buf = kmalloc(64, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	out_offset = snprintf(buf, 64, "%d\n", perf->run);
+	ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
+	kfree(buf);
+
+	return ret;
+}
+
+static ssize_t debugfs_run_write(struct file *filp, const char __user *ubuf,
+				 size_t count, loff_t *offp)
+{
+	struct perf_ctx *perf = filp->private_data;
+	int node, i;
+
+	if (perf->perf_threads == 0)
+		return 0;
+
+	if (atomic_read(&perf->tsync) == 0)
+		perf->run = false;
+
+	if (perf->run == true) {
+		/* let's stop the threads */
+		perf->run = false;
+		for (i = 0; i < MAX_THREADS; i++) {
+			if (perf->pthr_ctx[i].thread) {
+				kthread_stop(perf->pthr_ctx[i].thread);
+				perf->pthr_ctx[i].thread = NULL;
+			} else
+				break;
+		}
+	} else {
+		perf->run = true;
+
+		if (perf->perf_threads > MAX_THREADS) {
+			perf->perf_threads = MAX_THREADS;
+			pr_info("Reset total threads to: %u\n", MAX_THREADS);
+		}
+
+		/* no greater than 1M */
+		if (seg_order > 20) {
+			seg_order = 20;
+			pr_info("Fix seg_order to %u\n", seg_order);
+		}
+
+		if (run_order < seg_order) {
+			run_order = seg_order;
+			pr_info("Fix run_order to %u\n", run_order);
+		}
+
+		/* launch kernel threads */
+		for (i = 0; i < perf->perf_threads; i++) {
+			struct pthr_ctx *pctx;
+
+			pctx = &perf->pthr_ctx[i];
+			atomic_set(&pctx->dma_sync, 0);
+			pctx->perf = perf;
+			pctx->elapsed_time = 0;
+			pctx->copied = 0;
+
+			init_waitqueue_head(&pctx->wq);
+
+			/* NUMA socket node */
+			pctx->node = i / DMA_CHANNELS_PER_NODE;
+			node = pctx->node;
+
+			pctx->thread =
+				kthread_create_on_node(dma_perf_thread,
+						       (void *)pctx,
+						       node, "dma_perf %d", i);
+			if (!IS_ERR(pctx->thread)) {
+				wake_up_process(pctx->thread);
+			} else {
+				/* kthread_create_on_node() returns ERR_PTR, not NULL */
+				pctx->thread = NULL;
+				perf->run = false;
+				/* stop any threads that already started */
+				for (i = 0; i < MAX_THREADS; i++) {
+					if (perf->pthr_ctx[i].thread) {
+						kthread_stop(perf->pthr_ctx[i].thread);
+						perf->pthr_ctx[i].thread = NULL;
+					} else
+						break;
+				}
+			}
+
+			if (perf->run == false)
+				return -ENXIO;
+		}
+	}
+
+	return count;
+}
+
+static const struct file_operations dma_perf_debugfs_run = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = debugfs_run_read,
+	.write = debugfs_run_write,
+};
+
+static ssize_t debugfs_status_read(struct file *filp, char __user *ubuf,
+				   size_t count, loff_t *offp)
+{
+	struct perf_ctx *perf = filp->private_data;
+	char *buf;
+	ssize_t ret, out_offset;
+
+	if (!perf)
+		return 0;
+
+	buf = kmalloc(64, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	out_offset = snprintf(buf, 64, "%s\n",
+			      atomic_read(&perf->tsync) ? "running" : "idle");
+	ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
+	kfree(buf);
+
+	return ret;
+}
+
+static const struct file_operations dma_perf_debugfs_status = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = debugfs_status_read,
+};
+
+/*
+ * debugfs layout created below (relative to the debugfs mount point):
+ *
+ *   dmaperf/dmaperf/run                    write to start/stop a test
+ *   dmaperf/dmaperf/status                 "running" or "idle"
+ *   dmaperf/dmaperf/threads                number of DMA channels to use
+ *   dmaperf/dmaperf/queue_depth            outstanding descriptors per channel
+ *   dmaperf/dmaperf/transfer_size_order    descriptor size, 2^n bytes
+ *   dmaperf/dmaperf/total_size_order       per-channel total, 2^n bytes
+ *   dmaperf/dmaperf/thread_N/copied        bytes copied by channel N
+ *   dmaperf/dmaperf/thread_N/elapsed_time  channel N run time in microseconds
+ */
+static int perf_debugfs_setup(struct perf_ctx *perf)
+{
+	int i;
+	char temp_name[64];
+
+	if (!perf_debugfs_dir)
+		return -ENODEV;
+
+	perf->debugfs_node_dir = debugfs_create_dir("dmaperf",
+						    perf_debugfs_dir);
+	if (!perf->debugfs_node_dir)
+		return -ENODEV;
+
+	perf->debugfs_run = debugfs_create_file("run", S_IRUSR | S_IWUSR,
+						perf->debugfs_node_dir, perf,
+						&dma_perf_debugfs_run);
+	if (!perf->debugfs_run)
+		return -ENODEV;
+
+	perf->debugfs_status = debugfs_create_file("status", S_IRUSR,
+						   perf->debugfs_node_dir, perf,
+						   &dma_perf_debugfs_status);
+	if (!perf->debugfs_status)
+		return -ENODEV;
+
+	perf->debugfs_threads = debugfs_create_u8("threads", S_IRUSR | S_IWUSR,
+						  perf->debugfs_node_dir,
+						  &perf->perf_threads);
+	if (!perf->debugfs_threads)
+		return -ENODEV;
+
+	perf->debugfs_queue_depth = debugfs_create_u32("queue_depth", S_IRUSR | S_IWUSR,
+						       perf->debugfs_node_dir,
+						       &queue_depth);
+	if (!perf->debugfs_queue_depth)
+		return -ENODEV;
+
+	perf->debugfs_transfer_size_order = debugfs_create_u32("transfer_size_order", S_IRUSR | S_IWUSR,
+							       perf->debugfs_node_dir,
+							       &seg_order);
+	if (!perf->debugfs_transfer_size_order)
+		return -ENODEV;
+
+	perf->debugfs_total_size_order = debugfs_create_u32("total_size_order", S_IRUSR | S_IWUSR,
+							    perf->debugfs_node_dir,
+							    &run_order);
+	if (!perf->debugfs_total_size_order)
+		return -ENODEV;
+
+	for (i = 0; i < MAX_THREADS; i++) {
+		struct pthr_ctx *pctx = &perf->pthr_ctx[i];
+
+		sprintf(temp_name, "thread_%d", i);
+		pctx->debugfs_thr_dir = debugfs_create_dir(temp_name, perf->debugfs_node_dir);
+		if (!pctx->debugfs_thr_dir)
+			return -ENODEV;
+
+		pctx->debugfs_copied = debugfs_create_u64("copied", S_IRUSR,
+							  pctx->debugfs_thr_dir,
+							  &pctx->copied);
+		if (!pctx->debugfs_copied)
+			return -ENODEV;
+
+		pctx->debugfs_elapsed_time = debugfs_create_u64("elapsed_time", S_IRUSR,
+								pctx->debugfs_thr_dir,
+								&pctx->elapsed_time);
+		if (!pctx->debugfs_elapsed_time)
+			return -ENODEV;
+	}
+
+	return 0;
+}
+
+static int perf_probe(void)
+{
+	struct perf_ctx *perf;
+	int rc = 0;
+
+	perf = kzalloc_node(sizeof(*perf), GFP_KERNEL, 0);
+	if (!perf) {
+		rc = -ENOMEM;
+		goto err_perf;
+	}
+
+	perf->numa_nodes = num_online_nodes();
+	perf->perf_threads = 1;
+	atomic_set(&perf->tsync, 0);
+	perf->run = false;
+	spin_lock_init(&perf->db_lock);
+
+	if (debugfs_initialized() && !perf_debugfs_dir) {
+		perf_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
+		if (!perf_debugfs_dir) {
+			rc = -ENODEV;
+			goto err_ctx;
+		}
+
+		rc = perf_debugfs_setup(perf);
+		if (rc)
+			goto err_ctx;
+	}
+
+	g_perf = perf;
+	return 0;
+
+err_ctx:
+	kfree(perf);
+err_perf:
+	return rc;
+}
+
+static void perf_remove(void)
+{
+	int i;
+	struct perf_ctx *perf = g_perf;
+
+	if (perf_debugfs_dir) {
+		debugfs_remove_recursive(perf_debugfs_dir);
+		perf_debugfs_dir = NULL;
+	}
+
+	for (i = 0; i < MAX_THREADS; i++) {
+		struct pthr_ctx *pctx = &perf->pthr_ctx[i];
+
+		if (pctx->dma_chan)
+			dma_release_channel(pctx->dma_chan);
+		perf_free_mw(pctx);
+	}
+
+	kfree(perf);
+}
+
+static int __init perf_init_module(void)
+{
+	pr_info("DMA Performance Test Init\n");
+	return perf_probe();
+}
+module_init(perf_init_module);
+
+static void __exit perf_exit_module(void)
+{
+	pr_info("DMA Performance Test Exit\n");
+	perf_remove();
+}
+module_exit(perf_exit_module);