env_dpdk/memory: aggregate adjacent vfio mappings
In the past, memory in spdk could have been unregistered in
different chunks than it was registered, so to account
for that the vtophys code used to register each hugepage
(2MB chunk of memory) separately to the VFIO driver. This
really made the code generally simple.
Now that memory in spdk can only be unregistered in the same
chunks it was registered in, we no longer have to register
each hugepage to VFIO separately. We could register the
entire memory region with just a single VFIO ioctl instead,
so that's what we'll do now.
This serves as an optimization as we obviously send fewer
ioctls now, but most importantly it prevents SPDK from
reaching a VFIO registrations limit that was introduced
in Linux 5.1. [1]
The default limit is 65535, which results in SPDK being able to
make only the first 128GB of memory DMA-able. This is most
problematic for vhost where we need to register the memory
of all the VMs.
Fixes #915
[1] 492855939bdb59c6f947b0b5b44af9ad82b7e38c
("vfio/type1: Limit DMA mappings per container")
Signed-off-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/432442 (master)
(cherry picked from commit be04cfc342)
Change-Id: Ida40306b2684e20daa2fd8d12e0df2eef5a4bff1
Signed-off-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/467143
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
This commit is contained in:
parent
d6870e44b2
commit
f666018041
@ -1005,82 +1005,135 @@ spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
while (len > 0) {
|
||||
/* Get the physical address from the DPDK memsegs */
|
||||
paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
|
||||
/* Get the physical address from the DPDK memsegs */
|
||||
paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
|
||||
|
||||
switch (action) {
|
||||
case SPDK_MEM_MAP_NOTIFY_REGISTER:
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
/* This is not an address that DPDK is managing. */
|
||||
switch (action) {
|
||||
case SPDK_MEM_MAP_NOTIFY_REGISTER:
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
/* This is not an address that DPDK is managing. */
|
||||
#if SPDK_VFIO_ENABLED
|
||||
if (spdk_iommu_is_enabled()) {
|
||||
/* We'll use the virtual address as the iova. DPDK
|
||||
* currently uses physical addresses as the iovas (or counts
|
||||
* up from 0 if it can't get physical addresses), so
|
||||
* the range of user space virtual addresses and physical
|
||||
* addresses will never overlap.
|
||||
*/
|
||||
paddr = (uint64_t)vaddr;
|
||||
rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
|
||||
if (rc) {
|
||||
if (spdk_iommu_is_enabled()) {
|
||||
/* We'll use the virtual address as the iova. DPDK
|
||||
* currently uses physical addresses as the iovas (or counts
|
||||
* up from 0 if it can't get physical addresses), so
|
||||
* the range of user space virtual addresses and physical
|
||||
* addresses will never overlap.
|
||||
*/
|
||||
paddr = (uint64_t)vaddr;
|
||||
rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
|
||||
if (rc) {
|
||||
return -EFAULT;
|
||||
}
|
||||
while (len > 0) {
|
||||
rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
|
||||
if (rc != 0) {
|
||||
return rc;
|
||||
}
|
||||
vaddr += VALUE_2MB;
|
||||
paddr += VALUE_2MB;
|
||||
len -= VALUE_2MB;
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
/* Get the physical address from /proc/self/pagemap. */
|
||||
paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
/* Get the physical address from PCI devices */
|
||||
paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
|
||||
return -EFAULT;
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
/* Get the physical address from /proc/self/pagemap. */
|
||||
paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
/* Get the physical address from PCI devices */
|
||||
paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
|
||||
return -EFAULT;
|
||||
}
|
||||
pci_phys = 1;
|
||||
}
|
||||
/* The beginning of this address range points to a PCI resource,
|
||||
* so the rest must point to a PCI resource as well.
|
||||
*/
|
||||
pci_phys = 1;
|
||||
}
|
||||
}
|
||||
/* Since PCI paddr can break the 2MiB physical alignment skip this check for that. */
|
||||
if (!pci_phys && (paddr & MASK_2MB)) {
|
||||
DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
|
||||
break;
|
||||
case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
|
||||
#if SPDK_VFIO_ENABLED
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
/*
|
||||
* This is not an address that DPDK is managing. If vfio is enabled,
|
||||
* we need to unmap the range from the IOMMU
|
||||
*/
|
||||
if (spdk_iommu_is_enabled()) {
|
||||
uint64_t buffer_len = VALUE_2MB;
|
||||
paddr = spdk_mem_map_translate(map, (uint64_t)vaddr, &buffer_len);
|
||||
if (buffer_len != VALUE_2MB) {
|
||||
/* Get paddr for each 2MB chunk in this address range */
|
||||
while (len > 0) {
|
||||
/* Get the physical address from /proc/self/pagemap. */
|
||||
if (pci_phys) {
|
||||
paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
|
||||
} else {
|
||||
paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
|
||||
}
|
||||
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
/* Since PCI paddr can break the 2MiB physical alignment skip this check for that. */
|
||||
if (!pci_phys && (paddr & MASK_2MB)) {
|
||||
DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
|
||||
return -EINVAL;
|
||||
}
|
||||
rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
|
||||
if (rc) {
|
||||
return -EFAULT;
|
||||
|
||||
rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
|
||||
if (rc != 0) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
vaddr += VALUE_2MB;
|
||||
len -= VALUE_2MB;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
|
||||
break;
|
||||
default:
|
||||
SPDK_UNREACHABLE();
|
||||
} else {
|
||||
/* This is an address managed by DPDK. Just setup the translations. */
|
||||
while (len > 0) {
|
||||
paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
|
||||
if (rc != 0) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
vaddr += VALUE_2MB;
|
||||
len -= VALUE_2MB;
|
||||
}
|
||||
}
|
||||
|
||||
if (rc != 0) {
|
||||
return rc;
|
||||
break;
|
||||
case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
|
||||
#if SPDK_VFIO_ENABLED
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
/*
|
||||
* This is not an address that DPDK is managing. If vfio is enabled,
|
||||
* we need to unmap the range from the IOMMU
|
||||
*/
|
||||
if (spdk_iommu_is_enabled()) {
|
||||
uint64_t buffer_len = len;
|
||||
paddr = spdk_mem_map_translate(map, (uint64_t)vaddr, &buffer_len);
|
||||
if (buffer_len != len) {
|
||||
return -EINVAL;
|
||||
}
|
||||
rc = vtophys_iommu_unmap_dma(paddr, len);
|
||||
if (rc) {
|
||||
return -EFAULT;
|
||||
}
|
||||
}
|
||||
}
|
||||
vaddr += VALUE_2MB;
|
||||
len -= VALUE_2MB;
|
||||
#endif
|
||||
while (len > 0) {
|
||||
rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
|
||||
if (rc != 0) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
vaddr += VALUE_2MB;
|
||||
len -= VALUE_2MB;
|
||||
}
|
||||
|
||||
break;
|
||||
default:
|
||||
SPDK_UNREACHABLE();
|
||||
}
|
||||
|
||||
return rc;
|
||||
|
Loading…
Reference in New Issue
Block a user