vhost: integrate dmadev in asynchronous data-path

Since dmadev was introduced in 21.11, this patch integrates dmadev in the
asynchronous data path, to avoid the overhead of the vhost DMA abstraction
layer and to simplify application logic.

Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Sunil Pai G <sunil.pai.g@intel.com>
Tested-by: Yvonne Yang <yvonnex.yang@intel.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Authored by Jiayu Hu on 2022-02-09 07:51:45 -05:00; committed by Maxime Coquelin
parent 94005e4640
commit 53d3f4778c
14 changed files with 699 additions and 595 deletions


@ -105,13 +105,13 @@ The following is an overview of some key Vhost API functions:
- ``RTE_VHOST_USER_ASYNC_COPY``
Asynchronous data path will be enabled when this flag is set. Async data
path allows applications to register async copy devices (typically
hardware DMA channels) to the vhost queues. Vhost leverages the copy
device registered to free CPU from memory copy operations. A set of
async data path APIs are defined for DPDK applications to make use of
the async capability. Only packets enqueued/dequeued by async APIs are
processed through the async data path.
Asynchronous data path will be enabled when this flag is set. The async
data path allows applications to enable DMA acceleration for vhost
queues. Vhost leverages the registered DMA channels to free the CPU
from memory copy operations in the data path. A set of async data path
APIs is defined for DPDK applications to make use of the async
capability. Only packets enqueued/dequeued by async APIs are processed
through the async data path.
Currently this feature is only implemented on split ring enqueue data
path.
@ -218,52 +218,30 @@ The following is an overview of some key Vhost API functions:
Enable or disable zero copy feature of the vhost crypto backend.
* ``rte_vhost_async_channel_register(vid, queue_id, config, ops)``
* ``rte_vhost_async_dma_configure(dma_id, vchan_id)``
Register an async copy device channel for a vhost queue after vring
is enabled. Following device ``config`` must be specified together
with the registration:
Tell vhost which DMA vChannel it is going to use. This function needs
to be called before registering the async data path for a vring.
* ``features``
* ``rte_vhost_async_channel_register(vid, queue_id)``
This field is used to specify async copy device features.
Register async DMA acceleration for a vhost queue after vring is enabled.
``RTE_VHOST_ASYNC_INORDER`` represents the async copy device can
guarantee the order of copy completion is the same as the order
of copy submission.
* ``rte_vhost_async_channel_register_thread_unsafe(vid, queue_id)``
Currently, only ``RTE_VHOST_ASYNC_INORDER`` capable device is
supported by vhost.
Applications must provide following ``ops`` callbacks for vhost lib to
work with the async copy devices:
* ``transfer_data(vid, queue_id, descs, opaque_data, count)``
vhost invokes this function to submit copy data to the async devices.
For non-async_inorder capable devices, ``opaque_data`` could be used
for identifying the completed packets.
* ``check_completed_copies(vid, queue_id, opaque_data, max_packets)``
vhost invokes this function to get the copy data completed by async
devices.
* ``rte_vhost_async_channel_register_thread_unsafe(vid, queue_id, config, ops)``
Register an async copy device channel for a vhost queue without
performing any locking.
Register async DMA acceleration for a vhost queue without performing
any locking.
This function is only safe to call in vhost callback functions
(i.e., struct rte_vhost_device_ops).
* ``rte_vhost_async_channel_unregister(vid, queue_id)``
Unregister the async copy device channel from a vhost queue.
Unregister the async DMA acceleration from a vhost queue.
Unregistration will fail if the vhost queue has in-flight
packets that are not completed.
Unregister async copy devices in vring_state_changed() may
Unregister async DMA acceleration in vring_state_changed() may
fail, as this API tries to acquire the spinlock of vhost
queue. The recommended way is to unregister async copy
devices for all vhost queues in destroy_device(), when a
@ -271,24 +249,19 @@ The following is an overview of some key Vhost API functions:
* ``rte_vhost_async_channel_unregister_thread_unsafe(vid, queue_id)``
Unregister the async copy device channel for a vhost queue without
performing any locking.
Unregister async DMA acceleration for a vhost queue without performing
any locking.
This function is only safe to call in vhost callback functions
(i.e., struct rte_vhost_device_ops).
* ``rte_vhost_submit_enqueue_burst(vid, queue_id, pkts, count, comp_pkts, comp_count)``
* ``rte_vhost_submit_enqueue_burst(vid, queue_id, pkts, count, dma_id, vchan_id)``
Submit an enqueue request to transmit ``count`` packets from host to guest
by async data path. Successfully enqueued packets can be transfer completed
or being occupied by DMA engines; transfer completed packets are returned in
``comp_pkts``, but others are not guaranteed to finish, when this API
call returns.
by the async data path. Applications must not free packets submitted for
enqueue until the packets are completed.
Applications must not free the packets submitted for enqueue until the
packets are completed.
* ``rte_vhost_poll_enqueue_completed(vid, queue_id, pkts, count)``
* ``rte_vhost_poll_enqueue_completed(vid, queue_id, pkts, count, dma_id, vchan_id)``
Poll enqueue completion status from async data path. Completed packets
are returned to applications through ``pkts``.
@ -298,7 +271,7 @@ The following is an overview of some key Vhost API functions:
This function returns the amount of in-flight packets for the vhost
queue using async acceleration.
* ``rte_vhost_clear_queue_thread_unsafe(vid, queue_id, **pkts, count)``
* ``rte_vhost_clear_queue_thread_unsafe(vid, queue_id, **pkts, count, dma_id, vchan_id)``
Clear in-flight packets which were submitted to the DMA engine in the vhost
async data path. Completed packets are returned to applications through ``pkts``.
@ -443,6 +416,29 @@ Finally, a set of device ops is defined for device specific operations:
Called to get the notify area info of the queue.
Vhost asynchronous data path
----------------------------
Vhost asynchronous data path leverages DMA devices to offload memory
copies from the CPU and it is implemented in an asynchronous way. It
enables applications, like OVS, to save CPU cycles and hide memory copy
overhead, thus achieving higher throughput.
Vhost doesn't manage DMA devices; applications, like OVS, need to
manage and configure them. Applications need to tell vhost which
DMA device to use in every data path function call. This design gives
applications the flexibility to dynamically use DMA channels in
different function modules, not limited to vhost.
In addition, vhost supports M:N mapping between vrings and DMA virtual
channels. Specifically, one vring can use multiple different DMA channels
and one DMA channel can be shared by multiple vrings at the same time.
The reason for enabling one vring to use multiple DMA channels is that
more than one data plane thread may enqueue packets to the same vring,
each with its own DMA virtual channel. Besides, since the number of DMA
devices is limited, sharing DMA channels among vrings is necessary for
scaling.
Recommended IOVA mode in async datapath
---------------------------------------


@ -5,7 +5,7 @@
APP = vhost-switch
# all source are stored in SRCS-y
SRCS-y := main.c virtio_net.c ioat.c
SRCS-y := main.c virtio_net.c
PKGCONF ?= pkg-config


@ -1,218 +0,0 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2020 Intel Corporation
*/
#include <sys/uio.h>
#ifdef RTE_RAW_IOAT
#include <rte_rawdev.h>
#include <rte_ioat_rawdev.h>
#include "ioat.h"
#include "main.h"
struct dma_for_vhost dma_bind[MAX_VHOST_DEVICE];
struct packet_tracker {
unsigned short size_track[MAX_ENQUEUED_SIZE];
unsigned short next_read;
unsigned short next_write;
unsigned short last_remain;
unsigned short ioat_space;
};
struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
int
open_ioat(const char *value)
{
struct dma_for_vhost *dma_info = dma_bind;
char *input = strndup(value, strlen(value) + 1);
char *addrs = input;
char *ptrs[2];
char *start, *end, *substr;
int64_t vid, vring_id;
struct rte_ioat_rawdev_config config;
struct rte_rawdev_info info = { .dev_private = &config };
char name[32];
int dev_id;
int ret = 0;
uint16_t i = 0;
char *dma_arg[MAX_VHOST_DEVICE];
int args_nr;
while (isblank(*addrs))
addrs++;
if (*addrs == '\0') {
ret = -1;
goto out;
}
/* process DMA devices within bracket. */
addrs++;
substr = strtok(addrs, ";]");
if (!substr) {
ret = -1;
goto out;
}
args_nr = rte_strsplit(substr, strlen(substr),
dma_arg, MAX_VHOST_DEVICE, ',');
if (args_nr <= 0) {
ret = -1;
goto out;
}
while (i < args_nr) {
char *arg_temp = dma_arg[i];
uint8_t sub_nr;
sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
if (sub_nr != 2) {
ret = -1;
goto out;
}
start = strstr(ptrs[0], "txd");
if (start == NULL) {
ret = -1;
goto out;
}
start += 3;
vid = strtol(start, &end, 0);
if (end == start) {
ret = -1;
goto out;
}
vring_id = 0 + VIRTIO_RXQ;
if (rte_pci_addr_parse(ptrs[1],
&(dma_info + vid)->dmas[vring_id].addr) < 0) {
ret = -1;
goto out;
}
rte_pci_device_name(&(dma_info + vid)->dmas[vring_id].addr,
name, sizeof(name));
dev_id = rte_rawdev_get_dev_id(name);
if (dev_id == (uint16_t)(-ENODEV) ||
dev_id == (uint16_t)(-EINVAL)) {
ret = -1;
goto out;
}
if (rte_rawdev_info_get(dev_id, &info, sizeof(config)) < 0 ||
strstr(info.driver_name, "ioat") == NULL) {
ret = -1;
goto out;
}
(dma_info + vid)->dmas[vring_id].dev_id = dev_id;
(dma_info + vid)->dmas[vring_id].is_valid = true;
config.ring_size = IOAT_RING_SIZE;
config.hdls_disable = true;
if (rte_rawdev_configure(dev_id, &info, sizeof(config)) < 0) {
ret = -1;
goto out;
}
rte_rawdev_start(dev_id);
cb_tracker[dev_id].ioat_space = IOAT_RING_SIZE - 1;
dma_info->nr++;
i++;
}
out:
free(input);
return ret;
}
int32_t
ioat_transfer_data_cb(int vid, uint16_t queue_id,
struct rte_vhost_iov_iter *iov_iter,
struct rte_vhost_async_status *opaque_data, uint16_t count)
{
uint32_t i_iter;
uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2 + VIRTIO_RXQ].dev_id;
struct rte_vhost_iov_iter *iter = NULL;
unsigned long i_seg;
unsigned short mask = MAX_ENQUEUED_SIZE - 1;
unsigned short write = cb_tracker[dev_id].next_write;
if (!opaque_data) {
for (i_iter = 0; i_iter < count; i_iter++) {
iter = iov_iter + i_iter;
i_seg = 0;
if (cb_tracker[dev_id].ioat_space < iter->nr_segs)
break;
while (i_seg < iter->nr_segs) {
rte_ioat_enqueue_copy(dev_id,
(uintptr_t)(iter->iov[i_seg].src_addr),
(uintptr_t)(iter->iov[i_seg].dst_addr),
iter->iov[i_seg].len,
0,
0);
i_seg++;
}
write &= mask;
cb_tracker[dev_id].size_track[write] = iter->nr_segs;
cb_tracker[dev_id].ioat_space -= iter->nr_segs;
write++;
}
} else {
/* Opaque data is not supported */
return -1;
}
/* ring the doorbell */
rte_ioat_perform_ops(dev_id);
cb_tracker[dev_id].next_write = write;
return i_iter;
}
int32_t
ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
struct rte_vhost_async_status *opaque_data,
uint16_t max_packets)
{
if (!opaque_data) {
uintptr_t dump[255];
int n_seg;
unsigned short read, write;
unsigned short nb_packet = 0;
unsigned short mask = MAX_ENQUEUED_SIZE - 1;
unsigned short i;
uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2
+ VIRTIO_RXQ].dev_id;
n_seg = rte_ioat_completed_ops(dev_id, 255, NULL, NULL, dump, dump);
if (n_seg < 0) {
RTE_LOG(ERR,
VHOST_DATA,
"fail to poll completed buf on IOAT device %u",
dev_id);
return 0;
}
if (n_seg == 0)
return 0;
cb_tracker[dev_id].ioat_space += n_seg;
n_seg += cb_tracker[dev_id].last_remain;
read = cb_tracker[dev_id].next_read;
write = cb_tracker[dev_id].next_write;
for (i = 0; i < max_packets; i++) {
read &= mask;
if (read == write)
break;
if (n_seg >= cb_tracker[dev_id].size_track[read]) {
n_seg -= cb_tracker[dev_id].size_track[read];
read++;
nb_packet++;
} else {
break;
}
}
cb_tracker[dev_id].next_read = read;
cb_tracker[dev_id].last_remain = n_seg;
return nb_packet;
}
/* Opaque data is not supported */
return -1;
}
#endif /* RTE_RAW_IOAT */


@ -1,63 +0,0 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2020 Intel Corporation
*/
#ifndef _IOAT_H_
#define _IOAT_H_
#include <rte_vhost.h>
#include <rte_pci.h>
#include <rte_vhost_async.h>
#define MAX_VHOST_DEVICE 1024
#define IOAT_RING_SIZE 4096
#define MAX_ENQUEUED_SIZE 4096
struct dma_info {
struct rte_pci_addr addr;
uint16_t dev_id;
bool is_valid;
};
struct dma_for_vhost {
struct dma_info dmas[RTE_MAX_QUEUES_PER_PORT * 2];
uint16_t nr;
};
#ifdef RTE_RAW_IOAT
int open_ioat(const char *value);
int32_t
ioat_transfer_data_cb(int vid, uint16_t queue_id,
struct rte_vhost_iov_iter *iov_iter,
struct rte_vhost_async_status *opaque_data, uint16_t count);
int32_t
ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
struct rte_vhost_async_status *opaque_data,
uint16_t max_packets);
#else
static int open_ioat(const char *value __rte_unused)
{
return -1;
}
static int32_t
ioat_transfer_data_cb(int vid __rte_unused, uint16_t queue_id __rte_unused,
struct rte_vhost_iov_iter *iov_iter __rte_unused,
struct rte_vhost_async_status *opaque_data __rte_unused,
uint16_t count __rte_unused)
{
return -1;
}
static int32_t
ioat_check_completed_copies_cb(int vid __rte_unused,
uint16_t queue_id __rte_unused,
struct rte_vhost_async_status *opaque_data __rte_unused,
uint16_t max_packets __rte_unused)
{
return -1;
}
#endif
#endif /* _IOAT_H_ */


@ -24,8 +24,9 @@
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>
#include <rte_dmadev.h>
#include <rte_vhost_async.h>
#include "ioat.h"
#include "main.h"
#ifndef MAX_QUEUES
@ -56,6 +57,13 @@
#define RTE_TEST_TX_DESC_DEFAULT 512
#define INVALID_PORT_ID 0xFF
#define INVALID_DMA_ID -1
#define DMA_RING_SIZE 4096
struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
static int dma_count;
/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;
@ -94,10 +102,6 @@ static int client_mode;
static int builtin_net_driver;
static int async_vhost_driver;
static char *dma_type;
/* Specify timeout (in useconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
@ -191,18 +195,150 @@ struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
* Every data core maintains a TX buffer for every vhost device,
* which is used for batch pkts enqueue for higher performance.
*/
struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
#define MBUF_TABLE_DRAIN_TSC ((rte_get_tsc_hz() + US_PER_S - 1) \
/ US_PER_S * BURST_TX_DRAIN_US)
static inline bool
is_dma_configured(int16_t dev_id)
{
int i;
for (i = 0; i < dma_count; i++)
if (dmas_id[i] == dev_id)
return true;
return false;
}
static inline int
open_dma(const char *value)
{
if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
return open_ioat(value);
struct dma_for_vhost *dma_info = dma_bind;
char *input = strndup(value, strlen(value) + 1);
char *addrs = input;
char *ptrs[2];
char *start, *end, *substr;
int64_t vid;
return -1;
struct rte_dma_info info;
struct rte_dma_conf dev_config = { .nb_vchans = 1 };
struct rte_dma_vchan_conf qconf = {
.direction = RTE_DMA_DIR_MEM_TO_MEM,
.nb_desc = DMA_RING_SIZE
};
int dev_id;
int ret = 0;
uint16_t i = 0;
char *dma_arg[RTE_MAX_VHOST_DEVICE];
int args_nr;
while (isblank(*addrs))
addrs++;
if (*addrs == '\0') {
ret = -1;
goto out;
}
/* process DMA devices within bracket. */
addrs++;
substr = strtok(addrs, ";]");
if (!substr) {
ret = -1;
goto out;
}
args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
if (args_nr <= 0) {
ret = -1;
goto out;
}
while (i < args_nr) {
char *arg_temp = dma_arg[i];
uint8_t sub_nr;
sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
if (sub_nr != 2) {
ret = -1;
goto out;
}
start = strstr(ptrs[0], "txd");
if (start == NULL) {
ret = -1;
goto out;
}
start += 3;
vid = strtol(start, &end, 0);
if (end == start) {
ret = -1;
goto out;
}
dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
if (dev_id < 0) {
RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
ret = -1;
goto out;
}
/* DMA device is already configured, so skip */
if (is_dma_configured(dev_id))
goto done;
if (rte_dma_info_get(dev_id, &info) != 0) {
RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
ret = -1;
goto out;
}
if (info.max_vchans < 1) {
RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
ret = -1;
goto out;
}
if (rte_dma_configure(dev_id, &dev_config) != 0) {
RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
ret = -1;
goto out;
}
/* Check the max desc supported by DMA device */
rte_dma_info_get(dev_id, &info);
if (info.nb_vchans != 1) {
RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
dev_id);
ret = -1;
goto out;
}
qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up DMA %d.\n", dev_id);
ret = -1;
goto out;
}
if (rte_dma_start(dev_id) != 0) {
RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
ret = -1;
goto out;
}
dmas_id[dma_count++] = dev_id;
done:
(dma_info + vid)->dmas[VIRTIO_RXQ].dev_id = dev_id;
i++;
}
out:
free(input);
return ret;
}
/*
@ -500,8 +636,6 @@ enum {
OPT_CLIENT_NUM,
#define OPT_BUILTIN_NET_DRIVER "builtin-net-driver"
OPT_BUILTIN_NET_DRIVER_NUM,
#define OPT_DMA_TYPE "dma-type"
OPT_DMA_TYPE_NUM,
#define OPT_DMAS "dmas"
OPT_DMAS_NUM,
};
@ -539,8 +673,6 @@ us_vhost_parse_args(int argc, char **argv)
NULL, OPT_CLIENT_NUM},
{OPT_BUILTIN_NET_DRIVER, no_argument,
NULL, OPT_BUILTIN_NET_DRIVER_NUM},
{OPT_DMA_TYPE, required_argument,
NULL, OPT_DMA_TYPE_NUM},
{OPT_DMAS, required_argument,
NULL, OPT_DMAS_NUM},
{NULL, 0, 0, 0},
@ -661,10 +793,6 @@ us_vhost_parse_args(int argc, char **argv)
}
break;
case OPT_DMA_TYPE_NUM:
dma_type = optarg;
break;
case OPT_DMAS_NUM:
if (open_dma(optarg) == -1) {
RTE_LOG(INFO, VHOST_CONFIG,
@ -672,7 +800,6 @@ us_vhost_parse_args(int argc, char **argv)
us_vhost_usage(prgname);
return -1;
}
async_vhost_driver = 1;
break;
case OPT_CLIENT_NUM:
@ -841,9 +968,10 @@ complete_async_pkts(struct vhost_dev *vdev)
{
struct rte_mbuf *p_cpl[MAX_PKT_BURST];
uint16_t complete_count;
int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
if (complete_count) {
free_pkts(p_cpl, complete_count);
__atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
@ -877,17 +1005,18 @@ static __rte_always_inline void
drain_vhost(struct vhost_dev *vdev)
{
uint16_t ret;
uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
if (builtin_net_driver) {
ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
} else if (async_vhost_driver) {
} else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
uint16_t enqueue_fail = 0;
int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
complete_async_pkts(vdev);
ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit);
ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit, dma_id, 0);
__atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);
enqueue_fail = nr_xmit - ret;
@ -905,7 +1034,7 @@ drain_vhost(struct vhost_dev *vdev)
__ATOMIC_SEQ_CST);
}
if (!async_vhost_driver)
if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
free_pkts(m, nr_xmit);
}
@ -921,8 +1050,7 @@ drain_vhost_table(void)
if (unlikely(vdev->remove == 1))
continue;
vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
+ vdev->vid];
vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];
cur_tsc = rte_rdtsc();
if (unlikely(cur_tsc - vhost_txq->pre_tsc
@ -970,7 +1098,7 @@ virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
return 0;
}
vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
vhost_txq->m_table[vhost_txq->len++] = m;
if (enable_stats) {
@ -1211,12 +1339,13 @@ drain_eth_rx(struct vhost_dev *vdev)
if (builtin_net_driver) {
enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
pkts, rx_count);
} else if (async_vhost_driver) {
} else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
uint16_t enqueue_fail = 0;
int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
complete_async_pkts(vdev);
enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
VIRTIO_RXQ, pkts, rx_count);
VIRTIO_RXQ, pkts, rx_count, dma_id, 0);
__atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);
enqueue_fail = rx_count - enqueue_count;
@ -1235,7 +1364,7 @@ drain_eth_rx(struct vhost_dev *vdev)
__ATOMIC_SEQ_CST);
}
if (!async_vhost_driver)
if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
free_pkts(pkts, rx_count);
}
@ -1357,7 +1486,7 @@ destroy_device(int vid)
}
for (i = 0; i < RTE_MAX_LCORE; i++)
rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
if (builtin_net_driver)
vs_vhost_net_remove(vdev);
@ -1387,18 +1516,20 @@ destroy_device(int vid)
"(%d) device has been removed from data core\n",
vdev->vid);
if (async_vhost_driver) {
if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
uint16_t n_pkt = 0;
int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
struct rte_mbuf *m_cpl[vdev->pkts_inflight];
while (vdev->pkts_inflight) {
n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
m_cpl, vdev->pkts_inflight);
m_cpl, vdev->pkts_inflight, dma_id, 0);
free_pkts(m_cpl, n_pkt);
__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
}
rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
}
rte_free(vdev);
@ -1425,12 +1556,12 @@ new_device(int vid)
vdev->vid = vid;
for (i = 0; i < RTE_MAX_LCORE; i++) {
vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
= rte_zmalloc("vhost bufftable",
sizeof(struct vhost_bufftable),
RTE_CACHE_LINE_SIZE);
if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
RTE_LOG(INFO, VHOST_DATA,
"(%d) couldn't allocate memory for vhost TX\n", vid);
return -1;
@ -1468,20 +1599,13 @@ new_device(int vid)
"(%d) device has been added to data core %d\n",
vid, vdev->coreid);
if (async_vhost_driver) {
struct rte_vhost_async_config config = {0};
struct rte_vhost_async_channel_ops channel_ops;
if (dma_bind[vid].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
int ret;
if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
channel_ops.transfer_data = ioat_transfer_data_cb;
channel_ops.check_completed_copies =
ioat_check_completed_copies_cb;
config.features = RTE_VHOST_ASYNC_INORDER;
return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
config, &channel_ops);
}
ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
if (ret == 0)
dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = true;
return ret;
}
return 0;
@ -1502,14 +1626,15 @@ vring_state_changed(int vid, uint16_t queue_id, int enable)
if (queue_id != VIRTIO_RXQ)
return 0;
if (async_vhost_driver) {
if (dma_bind[vid].dmas[queue_id].async_enabled) {
if (!enable) {
uint16_t n_pkt = 0;
int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
struct rte_mbuf *m_cpl[vdev->pkts_inflight];
while (vdev->pkts_inflight) {
n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
m_cpl, vdev->pkts_inflight);
m_cpl, vdev->pkts_inflight, dma_id, 0);
free_pkts(m_cpl, n_pkt);
__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
}
@ -1657,6 +1782,24 @@ create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}
static void
reset_dma(void)
{
int i;
for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
int j;
for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
dma_bind[i].dmas[j].async_enabled = false;
}
}
for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
dmas_id[i] = INVALID_DMA_ID;
}
/*
* Main function, does initialisation and calls the per-lcore functions.
*/
@ -1679,6 +1822,9 @@ main(int argc, char *argv[])
argc -= ret;
argv += ret;
/* initialize dma structures */
reset_dma();
/* parse app arguments */
ret = us_vhost_parse_args(argc, argv);
if (ret < 0)
@ -1754,11 +1900,18 @@ main(int argc, char *argv[])
if (client_mode)
flags |= RTE_VHOST_USER_CLIENT;
for (i = 0; i < dma_count; i++) {
if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
}
}
/* Register vhost user driver to handle vhost messages. */
for (i = 0; i < nb_sockets; i++) {
char *file = socket_files + i * PATH_MAX;
if (async_vhost_driver)
if (dma_count)
flags = flags | RTE_VHOST_USER_ASYNC_COPY;
ret = rte_vhost_driver_register(file, flags);


@ -8,6 +8,7 @@
#include <sys/queue.h>
#include <rte_ether.h>
#include <rte_pci.h>
/* Macros for printing using RTE_LOG */
#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
@ -79,6 +80,16 @@ struct lcore_info {
struct vhost_dev_tailq_list vdev_list;
};
struct dma_info {
struct rte_pci_addr addr;
int16_t dev_id;
bool async_enabled;
};
struct dma_for_vhost {
struct dma_info dmas[RTE_MAX_QUEUES_PER_PORT * 2];
};
/* we implement non-extra virtio net features */
#define VIRTIO_NET_FEATURES 0


@ -12,13 +12,9 @@ if not is_linux
endif
deps += 'vhost'
deps += 'dmadev'
allow_experimental_apis = true
sources = files(
'main.c',
'virtio_net.c',
)
if dpdk_conf.has('RTE_RAW_IOAT')
deps += 'raw_ioat'
sources += files('ioat.c')
endif


@ -36,4 +36,4 @@ headers = files(
driver_sdk_headers = files(
'vdpa_driver.h',
)
deps += ['ethdev', 'cryptodev', 'hash', 'pci']
deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev']


@ -115,6 +115,8 @@ extern "C" {
#define VHOST_USER_F_PROTOCOL_FEATURES 30
#endif
#define RTE_MAX_VHOST_DEVICE 1024
struct rte_vdpa_device;
/**


@ -5,93 +5,10 @@
#ifndef _RTE_VHOST_ASYNC_H_
#define _RTE_VHOST_ASYNC_H_
#include "rte_vhost.h"
#include <stdint.h>
/**
* iovec
*/
struct rte_vhost_iovec {
void *src_addr;
void *dst_addr;
size_t len;
};
/**
* iovec iterator
*/
struct rte_vhost_iov_iter {
/** pointer to the iovec array */
struct rte_vhost_iovec *iov;
/** number of iovec in this iterator */
unsigned long nr_segs;
};
/**
* dma transfer status
*/
struct rte_vhost_async_status {
/** An array of application specific data for source memory */
uintptr_t *src_opaque_data;
/** An array of application specific data for destination memory */
uintptr_t *dst_opaque_data;
};
/**
* dma operation callbacks to be implemented by applications
*/
struct rte_vhost_async_channel_ops {
/**
* instruct async engines to perform copies for a batch of packets
*
* @param vid
* id of vhost device to perform data copies
* @param queue_id
* queue id to perform data copies
* @param iov_iter
* an array of IOV iterators
* @param opaque_data
* opaque data pair sending to DMA engine
* @param count
* number of elements in the "descs" array
* @return
* number of IOV iterators processed, negative value means error
*/
int32_t (*transfer_data)(int vid, uint16_t queue_id,
struct rte_vhost_iov_iter *iov_iter,
struct rte_vhost_async_status *opaque_data,
uint16_t count);
/**
* check copy-completed packets from the async engine
* @param vid
* id of vhost device to check copy completion
* @param queue_id
* queue id to check copy completion
* @param opaque_data
* buffer to receive the opaque data pair from DMA engine
* @param max_packets
* max number of packets could be completed
* @return
* number of async descs completed, negative value means error
*/
int32_t (*check_completed_copies)(int vid, uint16_t queue_id,
struct rte_vhost_async_status *opaque_data,
uint16_t max_packets);
};
/**
* async channel features
*/
enum {
RTE_VHOST_ASYNC_INORDER = 1U << 0,
};
/**
* async channel configuration
*/
struct rte_vhost_async_config {
uint32_t features;
uint32_t rsvd[2];
};
#include <rte_compat.h>
#include <rte_mbuf.h>
/**
* Register an async channel for a vhost queue
@ -100,17 +17,11 @@ struct rte_vhost_async_config {
* vhost device id async channel to be attached to
* @param queue_id
* vhost queue id async channel to be attached to
* @param config
* Async channel configuration structure
* @param ops
* Async channel operation callbacks
* @return
* 0 on success, -1 on failures
*/
__rte_experimental
int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
struct rte_vhost_async_config config,
struct rte_vhost_async_channel_ops *ops);
int rte_vhost_async_channel_register(int vid, uint16_t queue_id);
/**
* Unregister an async channel for a vhost queue
@ -136,17 +47,11 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id);
* vhost device id async channel to be attached to
* @param queue_id
* vhost queue id async channel to be attached to
* @param config
* Async channel configuration
* @param ops
* Async channel operation callbacks
* @return
* 0 on success, -1 on failures
*/
__rte_experimental
int rte_vhost_async_channel_register_thread_unsafe(int vid, uint16_t queue_id,
struct rte_vhost_async_config config,
struct rte_vhost_async_channel_ops *ops);
int rte_vhost_async_channel_register_thread_unsafe(int vid, uint16_t queue_id);
/**
* Unregister an async channel for a vhost queue without performing any
@ -179,12 +84,17 @@ int rte_vhost_async_channel_unregister_thread_unsafe(int vid,
* array of packets to be enqueued
* @param count
* packets num to be enqueued
* @param dma_id
* the identifier of DMA device
* @param vchan_id
* the identifier of virtual DMA channel
* @return
* num of packets enqueued
*/
__rte_experimental
uint16_t rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id,
struct rte_mbuf **pkts, uint16_t count);
struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
uint16_t vchan_id);
/**
* This function checks async completion status for a specific vhost
@ -199,12 +109,17 @@ uint16_t rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id,
* blank array to get return packet pointer
* @param count
* size of the packet array
* @param dma_id
* the identifier of DMA device
* @param vchan_id
* the identifier of virtual DMA channel
* @return
* num of packets returned
*/
__rte_experimental
uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
struct rte_mbuf **pkts, uint16_t count);
struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
uint16_t vchan_id);
/**
* This function returns the amount of in-flight packets for the vhost
@ -235,11 +150,37 @@ int rte_vhost_async_get_inflight(int vid, uint16_t queue_id);
* Blank array to get return packet pointer
* @param count
* Size of the packet array
* @param dma_id
* the identifier of DMA device
* @param vchan_id
* the identifier of virtual DMA channel
* @return
* Number of packets returned
*/
__rte_experimental
uint16_t rte_vhost_clear_queue_thread_unsafe(int vid, uint16_t queue_id,
struct rte_mbuf **pkts, uint16_t count);
struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
uint16_t vchan_id);
/**
* The DMA vChannels used in the asynchronous data path must be configured
* first, so this function needs to be called before enabling DMA
* acceleration for a vring. If this function fails, the given DMA vChannel
* cannot be used in the asynchronous data path.
*
* DMA devices used in the data path must be among the devices given to this
* function. The application is free to use DMA devices passed to this
* function in non-vhost scenarios, but it must ensure the vhost library is
* not using the channel at the same time.
*
* @param dma_id
* the identifier of DMA device
* @param vchan_id
* the identifier of virtual DMA channel
* @return
* 0 on success, and -1 on failure
*/
__rte_experimental
int rte_vhost_async_dma_configure(int16_t dma_id, uint16_t vchan_id);
#endif /* _RTE_VHOST_ASYNC_H_ */


@ -84,6 +84,9 @@ EXPERIMENTAL {
# added in 21.11
rte_vhost_get_monitor_addr;
# added in 22.03
rte_vhost_async_dma_configure;
};
INTERNAL {


@ -25,7 +25,7 @@
#include "vhost.h"
#include "vhost_user.h"
struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
struct virtio_net *vhost_devices[RTE_MAX_VHOST_DEVICE];
pthread_mutex_t vhost_dev_lock = PTHREAD_MUTEX_INITIALIZER;
/* Called with iotlb_lock read-locked */
@ -343,6 +343,7 @@ vhost_free_async_mem(struct vhost_virtqueue *vq)
return;
rte_free(vq->async->pkts_info);
rte_free(vq->async->pkts_cmpl_flag);
rte_free(vq->async->buffers_packed);
vq->async->buffers_packed = NULL;
@ -665,12 +666,12 @@ vhost_new_device(void)
int i;
pthread_mutex_lock(&vhost_dev_lock);
for (i = 0; i < MAX_VHOST_DEVICE; i++) {
for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
if (vhost_devices[i] == NULL)
break;
}
if (i == MAX_VHOST_DEVICE) {
if (i == RTE_MAX_VHOST_DEVICE) {
VHOST_LOG_CONFIG(ERR, "failed to find a free slot for new device.\n");
pthread_mutex_unlock(&vhost_dev_lock);
return -1;
@ -1621,8 +1622,7 @@ rte_vhost_extern_callback_register(int vid,
}
static __rte_always_inline int
async_channel_register(int vid, uint16_t queue_id,
struct rte_vhost_async_channel_ops *ops)
async_channel_register(int vid, uint16_t queue_id)
{
struct virtio_net *dev = get_device(vid);
struct vhost_virtqueue *vq = dev->virtqueue[queue_id];
@ -1651,6 +1651,14 @@ async_channel_register(int vid, uint16_t queue_id,
goto out_free_async;
}
async->pkts_cmpl_flag = rte_zmalloc_socket(NULL, vq->size * sizeof(bool),
RTE_CACHE_LINE_SIZE, node);
if (!async->pkts_cmpl_flag) {
VHOST_LOG_CONFIG(ERR, "(%s) failed to allocate async pkts_cmpl_flag (qid: %d)\n",
dev->ifname, queue_id);
goto out_free_async;
}
if (vq_is_packed(dev)) {
async->buffers_packed = rte_malloc_socket(NULL,
vq->size * sizeof(struct vring_used_elem_packed),
@ -1671,9 +1679,6 @@ async_channel_register(int vid, uint16_t queue_id,
}
}
async->ops.check_completed_copies = ops->check_completed_copies;
async->ops.transfer_data = ops->transfer_data;
vq->async = async;
return 0;
@ -1686,15 +1691,13 @@ async_channel_register(int vid, uint16_t queue_id,
}
int
rte_vhost_async_channel_register(int vid, uint16_t queue_id,
struct rte_vhost_async_config config,
struct rte_vhost_async_channel_ops *ops)
rte_vhost_async_channel_register(int vid, uint16_t queue_id)
{
struct vhost_virtqueue *vq;
struct virtio_net *dev = get_device(vid);
int ret;
if (dev == NULL || ops == NULL)
if (dev == NULL)
return -1;
if (queue_id >= VHOST_MAX_VRING)
@ -1705,33 +1708,20 @@ rte_vhost_async_channel_register(int vid, uint16_t queue_id,
if (unlikely(vq == NULL || !dev->async_copy))
return -1;
if (unlikely(!(config.features & RTE_VHOST_ASYNC_INORDER))) {
VHOST_LOG_CONFIG(ERR,
"(%s) async copy is not supported on non-inorder mode (qid: %d)\n",
dev->ifname, queue_id);
return -1;
}
if (unlikely(ops->check_completed_copies == NULL ||
ops->transfer_data == NULL))
return -1;
rte_spinlock_lock(&vq->access_lock);
ret = async_channel_register(vid, queue_id, ops);
ret = async_channel_register(vid, queue_id);
rte_spinlock_unlock(&vq->access_lock);
return ret;
}
int
rte_vhost_async_channel_register_thread_unsafe(int vid, uint16_t queue_id,
struct rte_vhost_async_config config,
struct rte_vhost_async_channel_ops *ops)
rte_vhost_async_channel_register_thread_unsafe(int vid, uint16_t queue_id)
{
struct vhost_virtqueue *vq;
struct virtio_net *dev = get_device(vid);
if (dev == NULL || ops == NULL)
if (dev == NULL)
return -1;
if (queue_id >= VHOST_MAX_VRING)
@ -1742,18 +1732,7 @@ rte_vhost_async_channel_register_thread_unsafe(int vid, uint16_t queue_id,
if (unlikely(vq == NULL || !dev->async_copy))
return -1;
if (unlikely(!(config.features & RTE_VHOST_ASYNC_INORDER))) {
VHOST_LOG_CONFIG(ERR,
"(%s) async copy is not supported on non-inorder mode (qid: %d)\n",
dev->ifname, queue_id);
return -1;
}
if (unlikely(ops->check_completed_copies == NULL ||
ops->transfer_data == NULL))
return -1;
return async_channel_register(vid, queue_id, ops);
return async_channel_register(vid, queue_id);
}
int
@ -1832,6 +1811,68 @@ rte_vhost_async_channel_unregister_thread_unsafe(int vid, uint16_t queue_id)
return 0;
}
int
rte_vhost_async_dma_configure(int16_t dma_id, uint16_t vchan_id)
{
struct rte_dma_info info;
void *pkts_cmpl_flag_addr;
uint16_t max_desc;
if (!rte_dma_is_valid(dma_id)) {
VHOST_LOG_CONFIG(ERR, "DMA %d is not found.\n", dma_id);
return -1;
}
rte_dma_info_get(dma_id, &info);
if (vchan_id >= info.max_vchans) {
VHOST_LOG_CONFIG(ERR, "Invalid DMA %d vChannel %u.\n", dma_id, vchan_id);
return -1;
}
if (!dma_copy_track[dma_id].vchans) {
struct async_dma_vchan_info *vchans;
vchans = rte_zmalloc(NULL, sizeof(struct async_dma_vchan_info) * info.max_vchans,
RTE_CACHE_LINE_SIZE);
if (vchans == NULL) {
VHOST_LOG_CONFIG(ERR, "Failed to allocate vchans for DMA %d vChannel %u.\n",
dma_id, vchan_id);
return -1;
}
dma_copy_track[dma_id].vchans = vchans;
}
if (dma_copy_track[dma_id].vchans[vchan_id].pkts_cmpl_flag_addr) {
VHOST_LOG_CONFIG(INFO, "DMA %d vChannel %u already registered.\n", dma_id,
vchan_id);
return 0;
}
max_desc = info.max_desc;
if (!rte_is_power_of_2(max_desc))
max_desc = rte_align32pow2(max_desc);
pkts_cmpl_flag_addr = rte_zmalloc(NULL, sizeof(bool *) * max_desc, RTE_CACHE_LINE_SIZE);
if (!pkts_cmpl_flag_addr) {
VHOST_LOG_CONFIG(ERR, "Failed to allocate pkts_cmpl_flag_addr for DMA %d "
"vChannel %u.\n", dma_id, vchan_id);
if (dma_copy_track[dma_id].nr_vchans == 0) {
rte_free(dma_copy_track[dma_id].vchans);
dma_copy_track[dma_id].vchans = NULL;
}
return -1;
}
dma_copy_track[dma_id].vchans[vchan_id].pkts_cmpl_flag_addr = pkts_cmpl_flag_addr;
dma_copy_track[dma_id].vchans[vchan_id].ring_size = max_desc;
dma_copy_track[dma_id].vchans[vchan_id].ring_mask = max_desc - 1;
dma_copy_track[dma_id].nr_vchans++;
return 0;
}
int
rte_vhost_async_get_inflight(int vid, uint16_t queue_id)
{


@ -19,6 +19,7 @@
#include <rte_ether.h>
#include <rte_rwlock.h>
#include <rte_malloc.h>
#include <rte_dmadev.h>
#include "rte_vhost.h"
#include "rte_vdpa.h"
@ -50,6 +51,9 @@
#define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST)
#define VHOST_MAX_ASYNC_VEC 2048
#define VIRTIO_MAX_RX_PKTLEN 9728U
#define VHOST_DMA_MAX_COPY_COMPLETE ((VIRTIO_MAX_RX_PKTLEN / RTE_MBUF_DEFAULT_DATAROOM) \
* MAX_PKT_BURST)
#define PACKED_DESC_ENQUEUE_USED_FLAG(w) \
((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \
@ -119,6 +123,58 @@ struct vring_used_elem_packed {
uint32_t count;
};
/**
* iovec
*/
struct vhost_iovec {
void *src_addr;
void *dst_addr;
size_t len;
};
/**
* iovec iterator
*/
struct vhost_iov_iter {
/** pointer to the iovec array */
struct vhost_iovec *iov;
/** number of iovec in this iterator */
unsigned long nr_segs;
};
struct async_dma_vchan_info {
/* circular array to track if packet copy completes */
bool **pkts_cmpl_flag_addr;
/* max elements in 'pkts_cmpl_flag_addr' */
uint16_t ring_size;
/* ring index mask for 'pkts_cmpl_flag_addr' */
uint16_t ring_mask;
/**
* DMA virtual channel lock. Although applications can bind
* DMA virtual channels to data-plane threads, the vhost
* control-plane thread can call data-plane functions too,
* which causes DMA device contention.
*
* For example, in the VM exit case, the vhost control-plane
* thread needs to clear in-flight packets before disabling a
* vring, while another data-plane thread may be enqueuing
* packets to the same vring with the same DMA virtual channel.
* As dmadev PMD functions are lock-free, the control-plane and
* data-plane threads could operate on the same DMA virtual
* channel at the same time.
*/
rte_spinlock_t dma_lock;
};
struct async_dma_info {
struct async_dma_vchan_info *vchans;
/* number of registered virtual channels */
uint16_t nr_vchans;
};
extern struct async_dma_info dma_copy_track[RTE_DMADEV_DEFAULT_MAX];
/**
* inflight async packet information
*/
@ -129,16 +185,32 @@ struct async_inflight_info {
};
struct vhost_async {
/* operation callbacks for DMA */
struct rte_vhost_async_channel_ops ops;
struct rte_vhost_iov_iter iov_iter[VHOST_MAX_ASYNC_IT];
struct rte_vhost_iovec iovec[VHOST_MAX_ASYNC_VEC];
struct vhost_iov_iter iov_iter[VHOST_MAX_ASYNC_IT];
struct vhost_iovec iovec[VHOST_MAX_ASYNC_VEC];
uint16_t iter_idx;
uint16_t iovec_idx;
/* data transfer status */
struct async_inflight_info *pkts_info;
/**
* Packet reorder array. "true" indicates that the DMA device
* has completed all copies for the packet.
*
* Note that this array can be written by multiple threads
* simultaneously. For example, if thread0 and thread1 receive
* packets from the NIC and enqueue them to vring0 and vring1
* with their own DMA devices DMA0 and DMA1, it is possible for
* thread0 to get completed copies belonging to vring1 from
* DMA0 while thread0 is calling rte_vhost_poll
* _enqueue_completed() for vring0 and thread1 is calling
* rte_vhost_submit_enqueue_burst() for vring1. In this case,
* vq->access_lock cannot protect pkts_cmpl_flag of vring1.
*
* However, since offloading is done on a per-packet basis, each
* packet flag is only written by one thread. And a single-byte
* write is atomic, so no lock on pkts_cmpl_flag is needed.
*/
bool *pkts_cmpl_flag;
uint16_t pkts_idx;
uint16_t pkts_inflight_n;
union {
@ -568,8 +640,7 @@ extern int vhost_data_log_level;
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif
#define MAX_VHOST_DEVICE 1024
extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
extern struct virtio_net *vhost_devices[RTE_MAX_VHOST_DEVICE];
#define VHOST_BINARY_SEARCH_THRESH 256


@ -11,6 +11,7 @@
#include <rte_net.h>
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_dmadev.h>
#include <rte_vhost.h>
#include <rte_tcp.h>
#include <rte_udp.h>
@ -25,6 +26,9 @@
#define MAX_BATCH_LEN 256
/* DMA device copy operation tracking array. */
struct async_dma_info dma_copy_track[RTE_DMADEV_DEFAULT_MAX];
static __rte_always_inline bool
rxvq_is_mergeable(struct virtio_net *dev)
{
@ -43,6 +47,135 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
}
static __rte_always_inline int64_t
vhost_async_dma_transfer_one(struct virtio_net *dev, struct vhost_virtqueue *vq,
int16_t dma_id, uint16_t vchan_id, uint16_t flag_idx,
struct vhost_iov_iter *pkt)
{
struct async_dma_vchan_info *dma_info = &dma_copy_track[dma_id].vchans[vchan_id];
uint16_t ring_mask = dma_info->ring_mask;
static bool vhost_async_dma_copy_log;
struct vhost_iovec *iov = pkt->iov;
int copy_idx = 0;
uint32_t nr_segs = pkt->nr_segs;
uint16_t i;
if (rte_dma_burst_capacity(dma_id, vchan_id) < nr_segs)
return -1;
for (i = 0; i < nr_segs; i++) {
copy_idx = rte_dma_copy(dma_id, vchan_id, (rte_iova_t)iov[i].src_addr,
(rte_iova_t)iov[i].dst_addr, iov[i].len, RTE_DMA_OP_FLAG_LLC);
/**
* Since all memory is pinned and the DMA vChannel
* ring has enough space, failure should be rare.
* If it does happen, the DMA device has
* encountered serious errors; in this case,
* stop the async data path and check what has
* happened to the DMA device.
*/
if (unlikely(copy_idx < 0)) {
if (!vhost_async_dma_copy_log) {
VHOST_LOG_DATA(ERR, "(%s) DMA copy failed for channel %d:%u\n",
dev->ifname, dma_id, vchan_id);
vhost_async_dma_copy_log = true;
}
return -1;
}
}
/**
* Store the packet completion flag address only in the last
* copy's slot; the other slots are set to NULL.
*/
dma_info->pkts_cmpl_flag_addr[copy_idx & ring_mask] = &vq->async->pkts_cmpl_flag[flag_idx];
return nr_segs;
}
static __rte_always_inline uint16_t
vhost_async_dma_transfer(struct virtio_net *dev, struct vhost_virtqueue *vq,
int16_t dma_id, uint16_t vchan_id, uint16_t head_idx,
struct vhost_iov_iter *pkts, uint16_t nr_pkts)
{
struct async_dma_vchan_info *dma_info = &dma_copy_track[dma_id].vchans[vchan_id];
int64_t ret, nr_copies = 0;
uint16_t pkt_idx;
rte_spinlock_lock(&dma_info->dma_lock);
for (pkt_idx = 0; pkt_idx < nr_pkts; pkt_idx++) {
ret = vhost_async_dma_transfer_one(dev, vq, dma_id, vchan_id, head_idx,
&pkts[pkt_idx]);
if (unlikely(ret < 0))
break;
nr_copies += ret;
head_idx++;
if (head_idx >= vq->size)
head_idx -= vq->size;
}
if (likely(nr_copies > 0))
rte_dma_submit(dma_id, vchan_id);
rte_spinlock_unlock(&dma_info->dma_lock);
return pkt_idx;
}
static __rte_always_inline uint16_t
vhost_async_dma_check_completed(struct virtio_net *dev, int16_t dma_id, uint16_t vchan_id,
uint16_t max_pkts)
{
struct async_dma_vchan_info *dma_info = &dma_copy_track[dma_id].vchans[vchan_id];
uint16_t ring_mask = dma_info->ring_mask;
uint16_t last_idx = 0;
uint16_t nr_copies;
uint16_t copy_idx;
uint16_t i;
bool has_error = false;
static bool vhost_async_dma_complete_log;
rte_spinlock_lock(&dma_info->dma_lock);
/**
* Print an error log for debugging if the DMA device reports an
* error during the transfer. Errors are not handled at the vhost level.
*/
nr_copies = rte_dma_completed(dma_id, vchan_id, max_pkts, &last_idx, &has_error);
if (unlikely(!vhost_async_dma_complete_log && has_error)) {
VHOST_LOG_DATA(ERR, "(%s) DMA completion failure on channel %d:%u\n", dev->ifname,
dma_id, vchan_id);
vhost_async_dma_complete_log = true;
} else if (nr_copies == 0) {
goto out;
}
copy_idx = last_idx - nr_copies + 1;
for (i = 0; i < nr_copies; i++) {
bool *flag;
flag = dma_info->pkts_cmpl_flag_addr[copy_idx & ring_mask];
if (flag) {
/**
* Mark the packet flag as received. The flag
* could belong to another virtqueue but write
* is atomic.
*/
*flag = true;
dma_info->pkts_cmpl_flag_addr[copy_idx & ring_mask] = NULL;
}
copy_idx++;
}
out:
rte_spinlock_unlock(&dma_info->dma_lock);
return nr_copies;
}
static inline void
do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
{
@ -794,7 +927,7 @@ copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
static __rte_always_inline int
async_iter_initialize(struct virtio_net *dev, struct vhost_async *async)
{
struct rte_vhost_iov_iter *iter;
struct vhost_iov_iter *iter;
if (unlikely(async->iovec_idx >= VHOST_MAX_ASYNC_VEC)) {
VHOST_LOG_DATA(ERR, "(%s) no more async iovec available\n", dev->ifname);
@ -812,8 +945,8 @@ static __rte_always_inline int
async_iter_add_iovec(struct virtio_net *dev, struct vhost_async *async,
void *src, void *dst, size_t len)
{
struct rte_vhost_iov_iter *iter;
struct rte_vhost_iovec *iovec;
struct vhost_iov_iter *iter;
struct vhost_iovec *iovec;
if (unlikely(async->iovec_idx >= VHOST_MAX_ASYNC_VEC)) {
static bool vhost_max_async_vec_log;
@ -848,7 +981,7 @@ async_iter_finalize(struct vhost_async *async)
static __rte_always_inline void
async_iter_cancel(struct vhost_async *async)
{
struct rte_vhost_iov_iter *iter;
struct vhost_iov_iter *iter;
iter = async->iov_iter + async->iter_idx;
async->iovec_idx -= iter->nr_segs;
@ -1448,9 +1581,9 @@ store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
}
static __rte_noinline uint32_t
virtio_dev_rx_async_submit_split(struct virtio_net *dev,
struct vhost_virtqueue *vq, uint16_t queue_id,
struct rte_mbuf **pkts, uint32_t count)
virtio_dev_rx_async_submit_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count,
int16_t dma_id, uint16_t vchan_id)
{
struct buf_vector buf_vec[BUF_VECTOR_MAX];
uint32_t pkt_idx = 0;
@ -1460,7 +1593,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
struct vhost_async *async = vq->async;
struct async_inflight_info *pkts_info = async->pkts_info;
uint32_t pkt_err = 0;
int32_t n_xfer;
uint16_t n_xfer;
uint16_t slot_idx = 0;
/*
@ -1502,17 +1635,16 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
if (unlikely(pkt_idx == 0))
return 0;
n_xfer = async->ops.transfer_data(dev->vid, queue_id, async->iov_iter, 0, pkt_idx);
if (unlikely(n_xfer < 0)) {
VHOST_LOG_DATA(ERR, "(%s) %s: failed to transfer data for queue id %d.\n",
dev->ifname, __func__, queue_id);
n_xfer = 0;
}
n_xfer = vhost_async_dma_transfer(dev, vq, dma_id, vchan_id, async->pkts_idx,
async->iov_iter, pkt_idx);
pkt_err = pkt_idx - n_xfer;
if (unlikely(pkt_err)) {
uint16_t num_descs = 0;
VHOST_LOG_DATA(DEBUG, "(%s) %s: failed to transfer %u packets for queue %u.\n",
dev->ifname, __func__, pkt_err, queue_id);
/* update number of completed packets */
pkt_idx = n_xfer;
@ -1655,13 +1787,13 @@ dma_error_handler_packed(struct vhost_virtqueue *vq, uint16_t slot_idx,
}
static __rte_noinline uint32_t
virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
struct vhost_virtqueue *vq, uint16_t queue_id,
struct rte_mbuf **pkts, uint32_t count)
virtio_dev_rx_async_submit_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count,
int16_t dma_id, uint16_t vchan_id)
{
uint32_t pkt_idx = 0;
uint32_t remained = count;
int32_t n_xfer;
uint16_t n_xfer;
uint16_t num_buffers;
uint16_t num_descs;
@ -1693,19 +1825,17 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
if (unlikely(pkt_idx == 0))
return 0;
n_xfer = async->ops.transfer_data(dev->vid, queue_id, async->iov_iter, 0, pkt_idx);
if (unlikely(n_xfer < 0)) {
VHOST_LOG_DATA(ERR, "(%s) %s: failed to transfer data for queue id %d.\n",
dev->ifname, __func__, queue_id);
n_xfer = 0;
}
pkt_err = pkt_idx - n_xfer;
n_xfer = vhost_async_dma_transfer(dev, vq, dma_id, vchan_id, async->pkts_idx,
async->iov_iter, pkt_idx);
async_iter_reset(async);
if (unlikely(pkt_err))
pkt_err = pkt_idx - n_xfer;
if (unlikely(pkt_err)) {
VHOST_LOG_DATA(DEBUG, "(%s) %s: failed to transfer %u packets for queue %u.\n",
dev->ifname, __func__, pkt_err, queue_id);
dma_error_handler_packed(vq, slot_idx, pkt_err, &pkt_idx);
}
if (likely(vq->shadow_used_idx)) {
/* keep used descriptors. */
@ -1825,28 +1955,40 @@ write_back_completed_descs_packed(struct vhost_virtqueue *vq,
static __rte_always_inline uint16_t
vhost_poll_enqueue_completed(struct virtio_net *dev, uint16_t queue_id,
struct rte_mbuf **pkts, uint16_t count)
struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
uint16_t vchan_id)
{
struct vhost_virtqueue *vq = dev->virtqueue[queue_id];
struct vhost_async *async = vq->async;
struct async_inflight_info *pkts_info = async->pkts_info;
int32_t n_cpl;
uint16_t nr_cpl_pkts = 0;
uint16_t n_descs = 0, n_buffers = 0;
uint16_t start_idx, from, i;
n_cpl = async->ops.check_completed_copies(dev->vid, queue_id, 0, count);
if (unlikely(n_cpl < 0)) {
VHOST_LOG_DATA(ERR, "(%s) %s: failed to check completed copies for queue id %d.\n",
dev->ifname, __func__, queue_id);
return 0;
}
if (n_cpl == 0)
return 0;
/* Check completed copies for the given DMA vChannel */
vhost_async_dma_check_completed(dev, dma_id, vchan_id, VHOST_DMA_MAX_COPY_COMPLETE);
start_idx = async_get_first_inflight_pkt_idx(vq);
/**
* Calculate the number of packets whose copies have
* completed. Note that there may be completed packets
* even if the given DMA vChannel reports no finished
* copies, since a virtqueue may use multiple DMA
* vChannels.
*/
from = start_idx;
while (vq->async->pkts_cmpl_flag[from] && count--) {
vq->async->pkts_cmpl_flag[from] = false;
from++;
if (from >= vq->size)
from -= vq->size;
nr_cpl_pkts++;
}
for (i = 0; i < n_cpl; i++) {
if (nr_cpl_pkts == 0)
return 0;
for (i = 0; i < nr_cpl_pkts; i++) {
from = (start_idx + i) % vq->size;
/* Only used with packed ring */
n_buffers += pkts_info[from].nr_buffers;
@ -1855,7 +1997,7 @@ vhost_poll_enqueue_completed(struct virtio_net *dev, uint16_t queue_id,
pkts[i] = pkts_info[from].mbuf;
}
async->pkts_inflight_n -= n_cpl;
async->pkts_inflight_n -= nr_cpl_pkts;
if (likely(vq->enabled && vq->access_ok)) {
if (vq_is_packed(dev)) {
@ -1876,12 +2018,13 @@ vhost_poll_enqueue_completed(struct virtio_net *dev, uint16_t queue_id,
}
}
return n_cpl;
return nr_cpl_pkts;
}
uint16_t
rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
struct rte_mbuf **pkts, uint16_t count)
struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
uint16_t vchan_id)
{
struct virtio_net *dev = get_device(vid);
struct vhost_virtqueue *vq;
@ -1897,18 +2040,30 @@ rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
return 0;
}
vq = dev->virtqueue[queue_id];
if (unlikely(!vq->async)) {
VHOST_LOG_DATA(ERR, "(%s) %s: async not registered for queue id %d.\n",
dev->ifname, __func__, queue_id);
if (unlikely(!dma_copy_track[dma_id].vchans ||
!dma_copy_track[dma_id].vchans[vchan_id].pkts_cmpl_flag_addr)) {
VHOST_LOG_DATA(ERR, "(%s) %s: invalid channel %d:%u.\n", dev->ifname, __func__,
dma_id, vchan_id);
return 0;
}
rte_spinlock_lock(&vq->access_lock);
vq = dev->virtqueue[queue_id];
n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id, pkts, count);
if (!rte_spinlock_trylock(&vq->access_lock)) {
VHOST_LOG_DATA(DEBUG, "(%s) %s: virtqueue %u is busy.\n", dev->ifname, __func__,
queue_id);
return 0;
}
if (unlikely(!vq->async)) {
VHOST_LOG_DATA(ERR, "(%s) %s: async not registered for virtqueue %d.\n",
dev->ifname, __func__, queue_id);
goto out;
}
n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id, pkts, count, dma_id, vchan_id);
out:
rte_spinlock_unlock(&vq->access_lock);
return n_pkts_cpl;
@ -1916,7 +2071,8 @@ rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
uint16_t
rte_vhost_clear_queue_thread_unsafe(int vid, uint16_t queue_id,
struct rte_mbuf **pkts, uint16_t count)
struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
uint16_t vchan_id)
{
struct virtio_net *dev = get_device(vid);
struct vhost_virtqueue *vq;
@ -1940,14 +2096,21 @@ rte_vhost_clear_queue_thread_unsafe(int vid, uint16_t queue_id,
return 0;
}
n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id, pkts, count);
if (unlikely(!dma_copy_track[dma_id].vchans ||
!dma_copy_track[dma_id].vchans[vchan_id].pkts_cmpl_flag_addr)) {
VHOST_LOG_DATA(ERR, "(%s) %s: invalid channel %d:%u.\n", dev->ifname, __func__,
dma_id, vchan_id);
return 0;
}
n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id, pkts, count, dma_id, vchan_id);
return n_pkts_cpl;
}
static __rte_always_inline uint32_t
virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
struct rte_mbuf **pkts, uint32_t count)
struct rte_mbuf **pkts, uint32_t count, int16_t dma_id, uint16_t vchan_id)
{
struct vhost_virtqueue *vq;
uint32_t nb_tx = 0;
@ -1959,6 +2122,13 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
return 0;
}
if (unlikely(!dma_copy_track[dma_id].vchans ||
!dma_copy_track[dma_id].vchans[vchan_id].pkts_cmpl_flag_addr)) {
VHOST_LOG_DATA(ERR, "(%s) %s: invalid channel %d:%u.\n", dev->ifname, __func__,
dma_id, vchan_id);
return 0;
}
vq = dev->virtqueue[queue_id];
rte_spinlock_lock(&vq->access_lock);
@ -1979,10 +2149,10 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
if (vq_is_packed(dev))
nb_tx = virtio_dev_rx_async_submit_packed(dev, vq, queue_id,
pkts, count);
pkts, count, dma_id, vchan_id);
else
nb_tx = virtio_dev_rx_async_submit_split(dev, vq, queue_id,
pkts, count);
pkts, count, dma_id, vchan_id);
out:
if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
@ -1996,7 +2166,8 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
uint16_t
rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id,
struct rte_mbuf **pkts, uint16_t count)
struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
uint16_t vchan_id)
{
struct virtio_net *dev = get_device(vid);
@ -2009,7 +2180,7 @@ rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id,
return 0;
}
return virtio_dev_rx_async_submit(dev, queue_id, pkts, count);
return virtio_dev_rx_async_submit(dev, queue_id, pkts, count, dma_id, vchan_id);
}
static inline bool