Revert "env: Register external memory with DPDK"

This reverts commit aaac48880dc9f3545a19aaeb85a95cef138eb2ce.

The reverted patch caused issues with SPDK vhost mappings
when handling larger numbers of VMs.

Fixes issue #1901.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I81bd311d26037dcb9340d85abcb4ea45b20a5170
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/7424
Community-CI: Broadcom CI
Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Jim Harris 2021-04-15 15:46:03 +00:00
parent 476b66614d
commit ba9853b9ba
4 changed files with 160 additions and 134 deletions

@@ -98,19 +98,7 @@ spdk_realloc(void *buf, size_t size, size_t align)
void
spdk_free(void *buf)
{
struct spdk_mem_region *region, *tmp;
rte_free(buf);
pthread_mutex_lock(&g_spdk_mem_region_mutex);
g_spdk_mem_do_not_notify = true;
/* Perform memory unregister previously postponed in memory_hotplug_cb() */
TAILQ_FOREACH_SAFE(region, &g_spdk_mem_regions, tailq, tmp) {
spdk_mem_unregister(region->addr, region->len);
TAILQ_REMOVE(&g_spdk_mem_regions, region, tailq);
free(region);
}
g_spdk_mem_do_not_notify = false;
pthread_mutex_unlock(&g_spdk_mem_region_mutex);
}
void *

@@ -50,18 +50,6 @@
#error RTE_VERSION is too old! Minimum 19.11 is required.
#endif
extern __thread bool g_spdk_mem_do_not_notify;
extern struct spdk_mem_region_head g_spdk_mem_regions;
extern pthread_mutex_t g_spdk_mem_region_mutex;
struct spdk_mem_region {
void *addr;
size_t len;
TAILQ_ENTRY(spdk_mem_region) tailq;
};
TAILQ_HEAD(spdk_mem_region_head, spdk_mem_region);
/* x86-64 and ARM userspace virtual addresses use only the low 48 bits [0..47],
* which is enough to cover 256 TB.
*/

@@ -146,10 +146,7 @@ struct spdk_mem_map {
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
struct spdk_mem_region_head g_spdk_mem_regions = TAILQ_HEAD_INITIALIZER(g_spdk_mem_regions);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t g_spdk_mem_region_mutex = PTHREAD_MUTEX_INITIALIZER;
__thread bool g_spdk_mem_do_not_notify;
static bool g_legacy_mem;
@@ -735,19 +732,7 @@ memory_hotplug_cb(enum rte_mem_event event_type,
len -= seg->hugepage_sz;
}
} else if (event_type == RTE_MEM_EVENT_FREE) {
/* We need to postpone spdk_mem_unregister() call here to avoid
* double lock of SPDK mutex (g_spdk_mem_map_mutex) and DPDK mutex
* (memory_hotplug_lock) which may happen when one thread is calling
* spdk_free() and another one is calling vhost_session_mem_unregister() */
struct spdk_mem_region *region;
region = calloc(1, sizeof(*region));
assert(region != NULL);
region->addr = (void *)addr;
region->len = len;
pthread_mutex_lock(&g_spdk_mem_region_mutex);
TAILQ_INSERT_TAIL(&g_spdk_mem_regions, region, tailq);
pthread_mutex_unlock(&g_spdk_mem_region_mutex);
spdk_mem_unregister((void *)addr, len);
}
}
@@ -800,6 +785,126 @@ static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
static struct spdk_mem_map *g_vtophys_map;
static struct spdk_mem_map *g_phys_ref_map;
#if VFIO_ENABLED
static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
struct spdk_vfio_dma_map *dma_map;
uint64_t refcount;
int ret;
refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
assert(refcount < UINT64_MAX);
if (refcount > 0) {
spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
return 0;
}
dma_map = calloc(1, sizeof(*dma_map));
if (dma_map == NULL) {
return -ENOMEM;
}
dma_map->map.argsz = sizeof(dma_map->map);
dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
dma_map->map.vaddr = vaddr;
dma_map->map.iova = iova;
dma_map->map.size = size;
pthread_mutex_lock(&g_vfio.mutex);
if (g_vfio.device_ref == 0) {
/* VFIO requires at least one device (IOMMU group) to be added to
* a VFIO container before it is possible to perform any IOMMU
* operations on that container. This memory will be mapped once
* the first device (IOMMU group) is hotplugged.
*
* Since the vfio container is managed internally by DPDK, it is
* also possible that some device is already in that container, but
* it's not managed by SPDK - e.g. an NIC attached internally
* inside DPDK. We could map the memory straight away in such
* scenario, but there's no need to do it. DPDK devices clearly
* don't need our mappings and hence we defer the mapping
* unconditionally until the first SPDK-managed device is
* hotplugged.
*/
goto out_insert;
}
ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
if (ret) {
DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
pthread_mutex_unlock(&g_vfio.mutex);
free(dma_map);
return ret;
}
out_insert:
TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
pthread_mutex_unlock(&g_vfio.mutex);
spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
return 0;
}
static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
struct spdk_vfio_dma_map *dma_map;
uint64_t refcount;
int ret;
struct vfio_iommu_type1_dma_unmap unmap = {};
pthread_mutex_lock(&g_vfio.mutex);
TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
if (dma_map->map.iova == iova) {
break;
}
}
if (dma_map == NULL) {
DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
pthread_mutex_unlock(&g_vfio.mutex);
return -ENXIO;
}
refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
assert(refcount < UINT64_MAX);
if (refcount > 0) {
spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
}
/* We still have outstanding references, don't clear it. */
if (refcount > 1) {
pthread_mutex_unlock(&g_vfio.mutex);
return 0;
}
/** don't support partial or multiple-page unmap for now */
assert(dma_map->map.size == size);
if (g_vfio.device_ref == 0) {
/* Memory is not mapped anymore, just remove its references */
goto out_remove;
}
unmap.argsz = sizeof(unmap);
unmap.flags = 0;
unmap.iova = dma_map->map.iova;
unmap.size = dma_map->map.size;
ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
if (ret) {
DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
pthread_mutex_unlock(&g_vfio.mutex);
return ret;
}
out_remove:
TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
pthread_mutex_unlock(&g_vfio.mutex);
free(dma_map);
return 0;
}
#endif
static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
@@ -883,14 +988,6 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
{
int rc = 0, pci_phys = 0;
uint64_t paddr;
#if VFIO_ENABLED
uint32_t num_pages, i;
rte_iova_t *iovas = NULL;
rte_iova_t iova;
struct rte_dev_iterator dev_iter;
struct rte_device *dev;
const char *devstr = "bus=pci";
#endif
if ((uintptr_t)vaddr & ~MASK_256TB) {
DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
@@ -909,9 +1006,7 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
switch (action) {
case SPDK_MEM_MAP_NOTIFY_REGISTER:
if (paddr == SPDK_VTOPHYS_ERROR) {
/* This is not an address that DPDK is managing currently.
* Let's register it. */
/* This is not an address that DPDK is managing. */
#if VFIO_ENABLED
enum rte_iova_mode iova_mode;
@@ -920,38 +1015,19 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
/* We'll use the virtual address as the iova to match DPDK. */
paddr = (uint64_t)vaddr;
num_pages = len / VALUE_2MB;
iovas = calloc(num_pages, sizeof(rte_iova_t));
if (iovas == NULL) {
return -ENOMEM;
rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
if (rc) {
return -EFAULT;
}
for (i = 0; i < num_pages; i++) {
iovas[i] = paddr;
paddr += VALUE_2MB;
}
rc = rte_extmem_register(vaddr, len, iovas, num_pages, VALUE_2MB);
if (rc != 0) {
goto error;
}
/* For each device, map the memory */
RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
rte_dev_dma_map(dev, vaddr, (uint64_t)vaddr, len);
}
for (i = 0; i < num_pages; i++) {
rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, iovas[i]);
while (len > 0) {
rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
if (rc != 0) {
goto error;
return rc;
}
vaddr += VALUE_2MB;
paddr += VALUE_2MB;
len -= VALUE_2MB;
}
free(iovas);
} else
#endif
{
@@ -962,8 +1038,7 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
if (paddr == SPDK_VTOPHYS_ERROR) {
DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
rc = -EFAULT;
goto error;
return -EFAULT;
}
/* The beginning of this address range points to a PCI resource,
* so the rest must point to a PCI resource as well.
@@ -982,39 +1057,29 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
if (paddr == SPDK_VTOPHYS_ERROR) {
DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
rc = -EFAULT;
goto error;
return -EFAULT;
}
/* Since PCI paddr can break the 2MiB physical alignment skip this check for that. */
if (!pci_phys && (paddr & MASK_2MB)) {
DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
rc = -EINVAL;
goto error;
return -EINVAL;
}
#if VFIO_ENABLED
/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
* with the IOMMU using the physical address to match. */
if (spdk_iommu_is_enabled()) {
iova = paddr;
rc = rte_extmem_register(vaddr, VALUE_2MB, &iova, 1, VALUE_2MB);
if (rc != 0) {
goto error;
}
/* For each device, map the memory */
RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
rte_dev_dma_map(dev, vaddr, paddr, len);
rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
if (rc) {
DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
return -EFAULT;
}
}
#endif
rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
if (rc != 0) {
goto error;
return rc;
}
vaddr += VALUE_2MB;
@@ -1043,8 +1108,11 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
break;
case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if VFIO_ENABLED
if (g_spdk_mem_do_not_notify == false) {
/* If vfio is enabled, we need to unmap the range from the IOMMU */
if (paddr == SPDK_VTOPHYS_ERROR) {
/*
* This is not an address that DPDK is managing. If vfio is enabled,
* we need to unmap the range from the IOMMU
*/
if (spdk_iommu_is_enabled()) {
uint64_t buffer_len = len;
uint8_t *va = vaddr;
@@ -1063,33 +1131,29 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
va, len, paddr, buffer_len);
return -EINVAL;
}
/* For each device, map the memory */
RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
rte_dev_dma_unmap(dev, vaddr, (uint64_t)vaddr, len);
}
rc = rte_extmem_unregister(vaddr, len);
if (rc != 0) {
return rc;
rc = vtophys_iommu_unmap_dma(paddr, len);
if (rc) {
DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
return -EFAULT;
}
} else if (iova_mode == RTE_IOVA_PA) {
/* Get paddr for each 2MB chunk in this address range */
paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
while (buffer_len > 0) {
paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
DEBUG_PRINT("could not get phys addr for %p\n", va);
return -EFAULT;
}
if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
DEBUG_PRINT("could not get phys addr for %p\n", va);
return -EFAULT;
}
/* For each device, map the memory */
RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
rte_dev_dma_unmap(dev, vaddr, (uint64_t)vaddr, len);
}
rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
if (rc) {
DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
return -EFAULT;
}
rc = rte_extmem_unregister(vaddr, len);
if (rc != 0) {
return rc;
va += VALUE_2MB;
buffer_len -= VALUE_2MB;
}
}
}
@@ -1110,13 +1174,6 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
SPDK_UNREACHABLE();
}
return rc;
error:
#if VFIO_ENABLED
free(iovas);
#endif
return rc;
}

@@ -65,13 +65,6 @@ DEFINE_STUB(rte_dev_event_callback_unregister, int, (const char *device_name,
rte_dev_event_cb_fn cb_fn, void *cb_arg), 0);
DEFINE_STUB(rte_dev_event_monitor_start, int, (void), 0);
DEFINE_STUB(rte_dev_event_monitor_stop, int, (void), 0);
DEFINE_STUB(rte_extmem_register, int, (void *va_addr, size_t len, rte_iova_t iova_addrs[],
unsigned int n_pages, size_t page_sz), 0);
DEFINE_STUB(rte_extmem_unregister, int, (void *va_addr, size_t len), 0);
DEFINE_STUB(rte_dev_dma_map, int, (struct rte_device *dev, void *addr, uint64_t iova,
size_t len), 0);
DEFINE_STUB(rte_dev_dma_unmap, int, (struct rte_device *dev, void *addr, uint64_t iova,
size_t len), 0);
static int
test_mem_map_notify(void *cb_ctx, struct spdk_mem_map *map,