amd64: make uiomove_fromphys functional for pages not mapped by the DMAP

Place the code introduced in r268660 into a separate function that can be
called from uiomove_fromphys. Instead of pre-allocating two KVA pages, use
vmem_alloc to allocate them on demand when they are needed. Since the mutex
that serialized access to the pre-allocated pages is gone, the code no longer
blocks if a page fault is taken while physical addresses from outside the
DMAP are in use.

Also introduce a safety catch in PHYS_TO_DMAP and DMAP_TO_PHYS.
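
As a quick illustration (a sketch only, mirroring the vmem_alloc/vmem_free
calls added to pmap.c below, and assuming a context where M_WAITOK sleeps are
allowed), the on-demand KVA allocation that replaces the two pre-allocated
pages looks like this:

	vm_offset_t va;
	int error;

	/* Allocate a page of KVA only when a non-DMAP address is encountered. */
	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &va);
	KASSERT(error == 0, ("vmem_alloc failed: %d", error));
	/* ... enter a transient mapping at va and copy through it ... */
	vmem_free(kernel_arena, va, PAGE_SIZE);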

Sponsored by: Citrix Systems R&D
Reviewed by:		kib
Differential Revision:	https://reviews.freebsd.org/D947

amd64/amd64/pmap.c:
 - Factor out the code that deals with non-DMAP addresses from pmap_copy_pages
   and place it in pmap_map_io_transient.
 - Change the code to use vmem_alloc instead of a set of pre-allocated
   pages.
 - Use pmap_qenter and don't pin the thread when page faults are possible.

amd64/amd64/uio_machdep.c:
 - Use pmap_map_io_transient to correctly handle physical addresses not
   covered by the DMAP.

amd64/include/pmap.h:
 - Add the prototypes for the new functions.

amd64/include/vmparam.h:
 - Add safety catches to make sure PHYS_TO_DMAP and DMAP_TO_PHYS are only
   used with addresses covered by the DMAP (see the short sketch after these
   notes).
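
As a sketch only (the physical address below is hypothetical, and the checks
fire only on INVARIANTS kernels), this is the kind of misuse the new catches
turn into a panic instead of a silently bogus pointer:

	vm_paddr_t pa = dmaplimit + PAGE_SIZE;	/* not covered by the DMAP */
	void *p = (void *)PHYS_TO_DMAP(pa);	/* KASSERT fires: "physical address
						   ... not covered by the DMAP" */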

Author:	Roger Pau Monné
Date:	2014-10-24 09:48:58 +00:00
Parent:	cd033efc14
Commit:	927dc0e02a
Notes:	svn2git (2020-12-20 02:59:44 +00:00): svn path=/head/; revision=273582

4 changed files with 142 additions and 66 deletions

amd64/amd64/pmap.c

@@ -115,6 +115,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
@@ -402,11 +403,6 @@ SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
"Count of saved TLB context on switch");
/* pmap_copy_pages() over non-DMAP */
static struct mtx cpage_lock;
static vm_offset_t cpage_a;
static vm_offset_t cpage_b;
/*
* Crashdump maps.
*/
@@ -1072,10 +1068,6 @@ pmap_init(void)
M_WAITOK | M_ZERO);
for (i = 0; i < pv_npg; i++)
TAILQ_INIT(&pv_table[i].pv_list);
mtx_init(&cpage_lock, "cpage", NULL, MTX_DEF);
cpage_a = kva_alloc(PAGE_SIZE);
cpage_b = kva_alloc(PAGE_SIZE);
}
static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
@@ -5056,66 +5048,24 @@ pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
{
void *a_cp, *b_cp;
vm_page_t m_a, m_b;
vm_paddr_t p_a, p_b;
pt_entry_t *pte;
vm_offset_t a_pg_offset, b_pg_offset;
vm_page_t pages[2];
vm_offset_t vaddr[2], a_pg_offset, b_pg_offset;
int cnt;
boolean_t pinned;
boolean_t mapped;
/*
* NB: The sequence of updating a page table followed by accesses
* to the corresponding pages used in the !DMAP case is subject to
* the situation described in the "AMD64 Architecture Programmer's
* Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
* Coherency Considerations". Therefore, issuing the INVLPG right
* after modifying the PTE bits is crucial.
*/
pinned = FALSE;
while (xfersize > 0) {
a_pg_offset = a_offset & PAGE_MASK;
m_a = ma[a_offset >> PAGE_SHIFT];
p_a = m_a->phys_addr;
pages[0] = ma[a_offset >> PAGE_SHIFT];
b_pg_offset = b_offset & PAGE_MASK;
m_b = mb[b_offset >> PAGE_SHIFT];
p_b = m_b->phys_addr;
pages[1] = mb[b_offset >> PAGE_SHIFT];
cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
cnt = min(cnt, PAGE_SIZE - b_pg_offset);
if (__predict_false(p_a < DMAP_MIN_ADDRESS ||
p_a > DMAP_MIN_ADDRESS + dmaplimit)) {
mtx_lock(&cpage_lock);
sched_pin();
pinned = TRUE;
pte = vtopte(cpage_a);
*pte = p_a | X86_PG_A | X86_PG_V |
pmap_cache_bits(kernel_pmap, m_a->md.pat_mode, 0);
invlpg(cpage_a);
a_cp = (char *)cpage_a + a_pg_offset;
} else {
a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
}
if (__predict_false(p_b < DMAP_MIN_ADDRESS ||
p_b > DMAP_MIN_ADDRESS + dmaplimit)) {
if (!pinned) {
mtx_lock(&cpage_lock);
sched_pin();
pinned = TRUE;
}
pte = vtopte(cpage_b);
*pte = p_b | X86_PG_A | X86_PG_M | X86_PG_RW |
X86_PG_V | pmap_cache_bits(kernel_pmap,
m_b->md.pat_mode, 0);
invlpg(cpage_b);
b_cp = (char *)cpage_b + b_pg_offset;
} else {
b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
}
mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE);
a_cp = (char *)vaddr[0] + a_pg_offset;
b_cp = (char *)vaddr[1] + b_pg_offset;
bcopy(a_cp, b_cp, cnt);
if (__predict_false(pinned)) {
sched_unpin();
mtx_unlock(&cpage_lock);
pinned = FALSE;
}
if (__predict_false(mapped))
pmap_unmap_io_transient(pages, vaddr, 2, FALSE);
a_offset += cnt;
b_offset += cnt;
xfersize -= cnt;
@@ -6901,6 +6851,107 @@ pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
*num = idx;
}
/**
* Get the kernel virtual address of a set of physical pages. If there are
* physical addresses not covered by the DMAP perform a transient mapping
* that will be removed when calling pmap_unmap_io_transient.
*
* \param page The pages the caller wishes to obtain the virtual
* address on the kernel memory map.
* \param vaddr On return contains the kernel virtual memory address
* of the pages passed in the page parameter.
* \param count Number of pages passed in.
* \param can_fault TRUE if the thread using the mapped pages can take
* page faults, FALSE otherwise.
*
* \returns TRUE if the caller must call pmap_unmap_io_transient when
* finished or FALSE otherwise.
*
*/
boolean_t
pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
boolean_t can_fault)
{
vm_paddr_t paddr;
boolean_t needs_mapping;
pt_entry_t *pte;
int cache_bits, error, i;
/*
* Allocate any KVA space that we need, this is done in a separate
* loop to prevent calling vmem_alloc while pinned.
*/
needs_mapping = FALSE;
for (i = 0; i < count; i++) {
paddr = VM_PAGE_TO_PHYS(page[i]);
if (__predict_false(paddr >= dmaplimit)) {
error = vmem_alloc(kernel_arena, PAGE_SIZE,
M_BESTFIT | M_WAITOK, &vaddr[i]);
KASSERT(error == 0, ("vmem_alloc failed: %d", error));
needs_mapping = TRUE;
} else {
vaddr[i] = PHYS_TO_DMAP(paddr);
}
}
/* Exit early if everything is covered by the DMAP */
if (!needs_mapping)
return (FALSE);
/*
* NB: The sequence of updating a page table followed by accesses
* to the corresponding pages used in the !DMAP case is subject to
* the situation described in the "AMD64 Architecture Programmer's
* Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
* Coherency Considerations". Therefore, issuing the INVLPG right
* after modifying the PTE bits is crucial.
*/
if (!can_fault)
sched_pin();
for (i = 0; i < count; i++) {
paddr = VM_PAGE_TO_PHYS(page[i]);
if (paddr >= dmaplimit) {
if (can_fault) {
/*
* Slow path, since we can get page faults
* while mappings are active don't pin the
* thread to the CPU and instead add a global
* mapping visible to all CPUs.
*/
pmap_qenter(vaddr[i], &page[i], 1);
} else {
pte = vtopte(vaddr[i]);
cache_bits = pmap_cache_bits(kernel_pmap,
page[i]->md.pat_mode, 0);
pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
cache_bits);
invlpg(vaddr[i]);
}
}
}
return (needs_mapping);
}
void
pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
boolean_t can_fault)
{
vm_paddr_t paddr;
int i;
if (!can_fault)
sched_unpin();
for (i = 0; i < count; i++) {
paddr = VM_PAGE_TO_PHYS(page[i]);
if (paddr >= dmaplimit) {
if (can_fault)
pmap_qremove(vaddr[i], 1);
vmem_free(kernel_arena, vaddr[i], PAGE_SIZE);
}
}
}
#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
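
As a usage sketch (not part of the commit) of the interface documented above:
a caller that can take page faults maps a single page, accesses it through
the returned KVA, and unmaps it only if a transient mapping was set up. Here
"m" and "buf" are placeholder names, and the pattern mirrors what
uiomove_fromphys does below:

	vm_page_t pages[1];
	vm_offset_t vaddr[1];
	boolean_t mapped;

	pages[0] = m;
	mapped = pmap_map_io_transient(pages, vaddr, 1, TRUE);
	/* vaddr[0] is a valid KVA for m whether or not m is covered by the DMAP. */
	bcopy((void *)vaddr[0], buf, PAGE_SIZE);
	if (__predict_false(mapped))
		pmap_unmap_io_transient(pages, vaddr, 1, TRUE);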

amd64/amd64/uio_machdep.c

@@ -61,10 +61,11 @@ uiomove_fromphys(vm_page_t ma[], vm_offset_t offset, int n, struct uio *uio)
struct thread *td = curthread;
struct iovec *iov;
void *cp;
vm_offset_t page_offset;
vm_offset_t page_offset, vaddr;
size_t cnt;
int error = 0;
int save = 0;
boolean_t mapped;
KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
("uiomove_fromphys: mode"));
@@ -72,6 +73,7 @@ uiomove_fromphys(vm_page_t ma[], vm_offset_t offset, int n, struct uio *uio)
("uiomove_fromphys proc"));
save = td->td_pflags & TDP_DEADLKTREAT;
td->td_pflags |= TDP_DEADLKTREAT;
mapped = FALSE;
while (n > 0 && uio->uio_resid) {
iov = uio->uio_iov;
cnt = iov->iov_len;
@@ -84,8 +86,11 @@ uiomove_fromphys(vm_page_t ma[], vm_offset_t offset, int n, struct uio *uio)
cnt = n;
page_offset = offset & PAGE_MASK;
cnt = min(cnt, PAGE_SIZE - page_offset);
cp = (char *)PHYS_TO_DMAP(ma[offset >> PAGE_SHIFT]->phys_addr) +
page_offset;
if (uio->uio_segflg != UIO_NOCOPY) {
mapped = pmap_map_io_transient(
&ma[offset >> PAGE_SHIFT], &vaddr, 1, TRUE);
cp = (char *)vaddr + page_offset;
}
switch (uio->uio_segflg) {
case UIO_USERSPACE:
maybe_yield();
@@ -105,6 +110,11 @@ uiomove_fromphys(vm_page_t ma[], vm_offset_t offset, int n, struct uio *uio)
case UIO_NOCOPY:
break;
}
if (__predict_false(mapped)) {
pmap_unmap_io_transient(&ma[offset >> PAGE_SHIFT],
&vaddr, 1, TRUE);
mapped = FALSE;
}
iov->iov_base = (char *)iov->iov_base + cnt;
iov->iov_len -= cnt;
uio->uio_resid -= cnt;
@@ -113,6 +123,9 @@ uiomove_fromphys(vm_page_t ma[], vm_offset_t offset, int n, struct uio *uio)
n -= cnt;
}
out:
if (__predict_false(mapped))
pmap_unmap_io_transient(&ma[offset >> PAGE_SHIFT], &vaddr, 1,
TRUE);
if (save == 0)
td->td_pflags &= ~TDP_DEADLKTREAT;
return (error);

amd64/include/pmap.h

@@ -397,6 +397,8 @@ void pmap_invalidate_cache_pages(vm_page_t *pages, int count);
void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva,
boolean_t force);
void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num);
boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t);
void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t);
#endif /* _KERNEL */
#endif /* !LOCORE */

amd64/include/vmparam.h

@@ -175,8 +175,18 @@
#define VM_MAX_ADDRESS UPT_MAX_ADDRESS
#define VM_MIN_ADDRESS (0)
#define PHYS_TO_DMAP(x) ((x) | DMAP_MIN_ADDRESS)
#define DMAP_TO_PHYS(x) ((x) & ~DMAP_MIN_ADDRESS)
#define PHYS_TO_DMAP(x) ({ \
KASSERT((x) < dmaplimit, \
("physical address %#jx not covered by the DMAP", \
(uintmax_t)x)); \
(x) | DMAP_MIN_ADDRESS; })
#define DMAP_TO_PHYS(x) ({ \
KASSERT((x) < (DMAP_MIN_ADDRESS + dmaplimit) && \
(x) >= DMAP_MIN_ADDRESS, \
("virtual address %#jx not covered by the DMAP", \
(uintmax_t)x)); \
(x) & ~DMAP_MIN_ADDRESS; })
/*
* How many physical pages per kmem arena virtual page.