env_dpdk/memory: aggregate adjacent vfio mappings
In the past, memory in spdk could have been unregistered in different chunks than it was registered, so to account for that the vtophys code used to register each hugepage (2MB chunk of memory) separately to the VFIO driver. This really made the code generally simple. Now that memory in spdk can only be unregistered in the same chunks it was registered in, we no longer have to register each hugepage to VFIO separately. We could register the entire memory region with just a single VFIO ioctl instead, so that's what we'll do now. This serves as an optimization as we obviously send fewer ioctls now, but most importantly it prevents SPDK from reaching a VFIO registration limit that was introduced in Linux 5.1. [1] The default limit is 65535, which results in SPDK being able to make only the first 128GB of memory DMA-able. This is most problematic for vhost where we need to register the memory of all the VMs. Fixes #915 [1] 492855939bdb59c6f947b0b5b44af9ad82b7e38c ("vfio/type1: Limit DMA mappings per container") Change-Id: Ida40306b2684e20daa2fd8d12e0df2eef5a4bff1 Signed-off-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com> Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/432442 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Ben Walker <benjamin.walker@intel.com> Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
This commit is contained in:
parent
43f4e3932a
commit
be04cfc342
@ -1005,82 +1005,135 @@ spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
while (len > 0) {
|
||||
/* Get the physical address from the DPDK memsegs */
|
||||
paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
|
||||
/* Get the physical address from the DPDK memsegs */
|
||||
paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
|
||||
|
||||
switch (action) {
|
||||
case SPDK_MEM_MAP_NOTIFY_REGISTER:
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
/* This is not an address that DPDK is managing. */
|
||||
switch (action) {
|
||||
case SPDK_MEM_MAP_NOTIFY_REGISTER:
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
/* This is not an address that DPDK is managing. */
|
||||
#if SPDK_VFIO_ENABLED
|
||||
if (spdk_iommu_is_enabled()) {
|
||||
/* We'll use the virtual address as the iova. DPDK
|
||||
* currently uses physical addresses as the iovas (or counts
|
||||
* up from 0 if it can't get physical addresses), so
|
||||
* the range of user space virtual addresses and physical
|
||||
* addresses will never overlap.
|
||||
*/
|
||||
paddr = (uint64_t)vaddr;
|
||||
rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
|
||||
if (rc) {
|
||||
if (spdk_iommu_is_enabled()) {
|
||||
/* We'll use the virtual address as the iova. DPDK
|
||||
* currently uses physical addresses as the iovas (or counts
|
||||
* up from 0 if it can't get physical addresses), so
|
||||
* the range of user space virtual addresses and physical
|
||||
* addresses will never overlap.
|
||||
*/
|
||||
paddr = (uint64_t)vaddr;
|
||||
rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
|
||||
if (rc) {
|
||||
return -EFAULT;
|
||||
}
|
||||
while (len > 0) {
|
||||
rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
|
||||
if (rc != 0) {
|
||||
return rc;
|
||||
}
|
||||
vaddr += VALUE_2MB;
|
||||
paddr += VALUE_2MB;
|
||||
len -= VALUE_2MB;
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
/* Get the physical address from /proc/self/pagemap. */
|
||||
paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
/* Get the physical address from PCI devices */
|
||||
paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
|
||||
return -EFAULT;
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
/* Get the physical address from /proc/self/pagemap. */
|
||||
paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
/* Get the physical address from PCI devices */
|
||||
paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
|
||||
return -EFAULT;
|
||||
}
|
||||
pci_phys = 1;
|
||||
}
|
||||
/* The beginning of this address range points to a PCI resource,
|
||||
* so the rest must point to a PCI resource as well.
|
||||
*/
|
||||
pci_phys = 1;
|
||||
}
|
||||
}
|
||||
/* Since PCI paddr can break the 2MiB physical alignment skip this check for that. */
|
||||
if (!pci_phys && (paddr & MASK_2MB)) {
|
||||
DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
|
||||
break;
|
||||
case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
|
||||
#if SPDK_VFIO_ENABLED
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
/*
|
||||
* This is not an address that DPDK is managing. If vfio is enabled,
|
||||
* we need to unmap the range from the IOMMU
|
||||
*/
|
||||
if (spdk_iommu_is_enabled()) {
|
||||
uint64_t buffer_len = VALUE_2MB;
|
||||
paddr = spdk_mem_map_translate(map, (uint64_t)vaddr, &buffer_len);
|
||||
if (buffer_len != VALUE_2MB) {
|
||||
/* Get paddr for each 2MB chunk in this address range */
|
||||
while (len > 0) {
|
||||
/* Get the physical address from /proc/self/pagemap. */
|
||||
if (pci_phys) {
|
||||
paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
|
||||
} else {
|
||||
paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
|
||||
}
|
||||
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
/* Since PCI paddr can break the 2MiB physical alignment skip this check for that. */
|
||||
if (!pci_phys && (paddr & MASK_2MB)) {
|
||||
DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
|
||||
return -EINVAL;
|
||||
}
|
||||
rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
|
||||
if (rc) {
|
||||
return -EFAULT;
|
||||
|
||||
rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
|
||||
if (rc != 0) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
vaddr += VALUE_2MB;
|
||||
len -= VALUE_2MB;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
|
||||
break;
|
||||
default:
|
||||
SPDK_UNREACHABLE();
|
||||
} else {
|
||||
/* This is an address managed by DPDK. Just setup the translations. */
|
||||
while (len > 0) {
|
||||
paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
|
||||
if (rc != 0) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
vaddr += VALUE_2MB;
|
||||
len -= VALUE_2MB;
|
||||
}
|
||||
}
|
||||
|
||||
if (rc != 0) {
|
||||
return rc;
|
||||
break;
|
||||
case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
|
||||
#if SPDK_VFIO_ENABLED
|
||||
if (paddr == SPDK_VTOPHYS_ERROR) {
|
||||
/*
|
||||
* This is not an address that DPDK is managing. If vfio is enabled,
|
||||
* we need to unmap the range from the IOMMU
|
||||
*/
|
||||
if (spdk_iommu_is_enabled()) {
|
||||
uint64_t buffer_len = len;
|
||||
paddr = spdk_mem_map_translate(map, (uint64_t)vaddr, &buffer_len);
|
||||
if (buffer_len != len) {
|
||||
return -EINVAL;
|
||||
}
|
||||
rc = vtophys_iommu_unmap_dma(paddr, len);
|
||||
if (rc) {
|
||||
return -EFAULT;
|
||||
}
|
||||
}
|
||||
}
|
||||
vaddr += VALUE_2MB;
|
||||
len -= VALUE_2MB;
|
||||
#endif
|
||||
while (len > 0) {
|
||||
rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
|
||||
if (rc != 0) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
vaddr += VALUE_2MB;
|
||||
len -= VALUE_2MB;
|
||||
}
|
||||
|
||||
break;
|
||||
default:
|
||||
SPDK_UNREACHABLE();
|
||||
}
|
||||
|
||||
return rc;
|
||||
|
Loading…
Reference in New Issue
Block a user