Long, long ago in r27464 special-case code for mapping device-backed memory with 4MB pages was added to pmap_object_init_pt(). This code assumes that the pages of an OBJT_DEVICE object are always physically contiguous. Unfortunately, this is not always the case. For example, jhb@ informs me that the recently introduced /dev/ksyms driver creates an OBJT_DEVICE object that violates this assumption. Thus, this revision modifies pmap_object_init_pt() to abort the mapping if the OBJT_DEVICE object's pages are not physically contiguous. This revision also changes some inconsistent, if not buggy, behavior. For example, the i386 version aborts if the first 4MB virtual page that would be mapped is already valid. However, it incorrectly replaces any subsequent 4MB virtual page mappings that it encounters, potentially leaking a page table page. The amd64 version has a bug of my own creation. It potentially busies the wrong page, and always busies an insufficient number of pages, if it blocks allocating a page table page. To my knowledge, there have been no reports of these bugs, hence their persistence. I suspect that the existing restrictions that pmap_object_init_pt() placed on the OBJT_DEVICE objects that it would choose to map, for example, that the first page must be aligned on a 2 or 4MB physical boundary and that the size of the mapping must be a multiple of the large page size, were enough to avoid triggering the bug for drivers like ksyms. However, one side effect of testing the OBJT_DEVICE object's pages for physical contiguity is that a dubious difference between pmap_object_init_pt() and the standard path for mapping device pages, i.e., vm_fault(), has been eliminated. Previously, pmap_object_init_pt() would only instantiate the first PG_FICTITIOUS page being mapped because it never examined the rest. Now, however, pmap_object_init_pt() uses the new function vm_object_populate() to instantiate them all (in order to support testing their physical contiguity).
These pages need to be instantiated for the mechanism that I have prototyped for automatically maintaining the consistency of the PAT settings across multiple mappings, particularly, amd64's direct mapping, to work. (Translation: This change is also being made to support jhb@'s work on the Nvidia feature requests.) Discussed with: jhb@
svn path=/head/; revision=194209
2009-06-14 19:51:43 +00:00 · 2009-06-14 19:51:43 +00:00 · 387aabc513 · 2020-12-20 02:59:44 +00:00
commit 387aabc513
parent 00c49c7a2d
4 changed files with 133 additions and 92 deletions
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -3322,78 +3322,74 @@ void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
    vm_pindex_t pindex, vm_size_t size)
 {
-	vm_offset_t va;
+	pd_entry_t *pde;
+	vm_paddr_t pa, ptepa;
 	vm_page_t p, pdpg;

 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT(object->type == OBJT_DEVICE,
 	    ("pmap_object_init_pt: non-device object"));
-	if (((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
-		vm_page_t m[1];
-		pd_entry_t ptepa, *pde;
-
-		PMAP_LOCK(pmap);
-		pde = pmap_pde(pmap, addr);
-		if (pde != 0 && (*pde & PG_V) != 0)
-			goto out;
-		PMAP_UNLOCK(pmap);
-retry:
+	if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
+		if (!vm_object_populate(object, pindex, pindex + atop(size)))
+			return;
 		p = vm_page_lookup(object, pindex);
-		if (p != NULL) {
-			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
-				goto retry;
-		} else {
-			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
-			if (p == NULL)
-				return;
-			m[0] = p;
-
-			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
-				vm_page_lock_queues();
-				vm_page_free(p);
-				vm_page_unlock_queues();
-				return;
-			}
-
-			p = vm_page_lookup(object, pindex);
-			vm_page_wakeup(p);
-		}
+		KASSERT(p->valid == VM_PAGE_BITS_ALL,
+		    ("pmap_object_init_pt: invalid page %p", p));

+		/*
+		 * Abort the mapping if the first page is not physically
+		 * aligned to a 2MB page boundary.
+		 */
 		ptepa = VM_PAGE_TO_PHYS(p);
 		if (ptepa & (NBPDR - 1))
 			return;

-		p->valid = VM_PAGE_BITS_ALL;
+		/*
+		 * Skip the first page.  Abort the mapping if the rest of
+		 * the pages are not physically contiguous.
+		 */
+		p = TAILQ_NEXT(p, listq);
+		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
+		    pa += PAGE_SIZE) {
+			KASSERT(p->valid == VM_PAGE_BITS_ALL,
+			    ("pmap_object_init_pt: invalid page %p", p));
+			if (pa != VM_PAGE_TO_PHYS(p))
+				return;
+			p = TAILQ_NEXT(p, listq);
+		}

+		/* Map using 2MB pages. */
 		PMAP_LOCK(pmap);
-		for (va = addr; va < addr + size; va += NBPDR) {
-			while ((pdpg =
-			    pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) {
-				PMAP_UNLOCK(pmap);
-				vm_page_busy(p);
-				VM_OBJECT_UNLOCK(object);
-				VM_WAIT;
-				VM_OBJECT_LOCK(object);
-				vm_page_wakeup(p);
-				PMAP_LOCK(pmap);
+		for (pa = ptepa; pa < ptepa + size; pa += NBPDR) {
+			pdpg = pmap_allocpde(pmap, addr, M_NOWAIT);
+			if (pdpg == NULL) {
+				/*
+				 * The creation of mappings below is only an
+				 * optimization.  If a page directory page
+				 * cannot be allocated without blocking,
+				 * continue on to the next mapping rather than
+				 * blocking.
+				 */
+				addr += NBPDR;
+				continue;
 			}
 			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
-			pde = &pde[pmap_pde_index(va)];
+			pde = &pde[pmap_pde_index(addr)];
 			if ((*pde & PG_V) == 0) {
-				pde_store(pde, ptepa | PG_PS | PG_M | PG_A |
+				pde_store(pde, pa | PG_PS | PG_M | PG_A |
 				    PG_U | PG_RW | PG_V);
-				pmap->pm_stats.resident_count +=
-				    NBPDR / PAGE_SIZE;
+				pmap->pm_stats.resident_count += NBPDR /
+				    PAGE_SIZE;
+				pmap_pde_mappings++;
 			} else {
+				/* Continue on if the PDE is already valid. */
 				pdpg->wire_count--;
 				KASSERT(pdpg->wire_count > 0,
 				    ("pmap_object_init_pt: missing reference "
-				     "to page directory page, va: 0x%lx", va));
+				    "to page directory page, va: 0x%lx", addr));
 			}
-			ptepa += NBPDR;
+			addr += NBPDR;
 		}
-		pmap_invalidate_all(pmap);
-out:
 		PMAP_UNLOCK(pmap);
 	}
 }
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -3442,62 +3442,57 @@ void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
    vm_pindex_t pindex, vm_size_t size)
 {
+	pd_entry_t *pde;
+	vm_paddr_t pa, ptepa;
 	vm_page_t p;

 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT(object->type == OBJT_DEVICE,
 	    ("pmap_object_init_pt: non-device object"));
 	if (pseflag && 
-	    ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
-		int i;
-		vm_page_t m[1];
-		unsigned int ptepindex;
-		int npdes;
-		pd_entry_t ptepa;
-
-		PMAP_LOCK(pmap);
-		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
-			goto out;
-		PMAP_UNLOCK(pmap);
-retry:
+	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
+		if (!vm_object_populate(object, pindex, pindex + atop(size)))
+			return;
 		p = vm_page_lookup(object, pindex);
-		if (p != NULL) {
-			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
-				goto retry;
-		} else {
-			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
-			if (p == NULL)
-				return;
-			m[0] = p;
-
-			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
-				vm_page_lock_queues();
-				vm_page_free(p);
-				vm_page_unlock_queues();
-				return;
-			}
-
-			p = vm_page_lookup(object, pindex);
-			vm_page_wakeup(p);
-		}
+		KASSERT(p->valid == VM_PAGE_BITS_ALL,
+		    ("pmap_object_init_pt: invalid page %p", p));

+		/*
+		 * Abort the mapping if the first page is not physically
+		 * aligned to a 2/4MB page boundary.
+		 */
 		ptepa = VM_PAGE_TO_PHYS(p);
 		if (ptepa & (NBPDR - 1))
 			return;

-		p->valid = VM_PAGE_BITS_ALL;
-
-		PMAP_LOCK(pmap);
-		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
-		npdes = size >> PDRSHIFT;
-		for(i = 0; i < npdes; i++) {
-			pde_store(&pmap->pm_pdir[ptepindex],
-			    ptepa | PG_U | PG_RW | PG_V | PG_PS);
-			ptepa += NBPDR;
-			ptepindex += 1;
+		/*
+		 * Skip the first page.  Abort the mapping if the rest of
+		 * the pages are not physically contiguous.
+		 */
+		p = TAILQ_NEXT(p, listq);
+		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
+		    pa += PAGE_SIZE) {
+			KASSERT(p->valid == VM_PAGE_BITS_ALL,
+			    ("pmap_object_init_pt: invalid page %p", p));
+			if (pa != VM_PAGE_TO_PHYS(p))
+				return;
+			p = TAILQ_NEXT(p, listq);
+		}
+
+		/* Map using 2/4MB pages. */
+		PMAP_LOCK(pmap);
+		for (pa = ptepa; pa < ptepa + size; pa += NBPDR) {
+			pde = pmap_pde(pmap, addr);
+			if (*pde == 0) {
+				pde_store(pde, pa | PG_PS | PG_M | PG_A |
+				    PG_U | PG_RW | PG_V);
+				pmap->pm_stats.resident_count += NBPDR /
+				    PAGE_SIZE;
+				pmap_pde_mappings++;
+			}
+			/* Else continue on if the PDE is already valid. */
+			addr += NBPDR;
 		}
-		pmap_invalidate_all(pmap);
-out:
 		PMAP_UNLOCK(pmap);
 	}
 }
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -1930,6 +1930,55 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
 		vm_page_cache_free(object, start, end);
 }

+/*
+ *	Populate the specified range of the object with valid pages.  Returns
+ *	TRUE if the range is successfully populated and FALSE otherwise.
+ *
+ *	Note: This function should be optimized to pass a larger array of
+ *	pages to vm_pager_get_pages() before it is applied to a non-
+ *	OBJT_DEVICE object.
+ *
+ *	The object must be locked.
+ */
+boolean_t
+vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
+{
+	vm_page_t m, ma[1];
+	vm_pindex_t pindex;
+	int rv;
+
+	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	for (pindex = start; pindex < end; pindex++) {
+		m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL |
+		    VM_ALLOC_RETRY);
+		if (m->valid != VM_PAGE_BITS_ALL) {
+			ma[0] = m;
+			rv = vm_pager_get_pages(object, ma, 1, 0);
+			m = vm_page_lookup(object, pindex);
+			if (m == NULL)
+				break;
+			if (rv != VM_PAGER_OK) {
+				vm_page_lock_queues();
+				vm_page_free(m);
+				vm_page_unlock_queues();
+				break;
+			}
+		}
+		/*
+		 * Keep "m" busy because a subsequent iteration may unlock
+		 * the object.
+		 */
+	}
+	if (pindex > start) {
+		m = vm_page_lookup(object, start);
+		while (m != NULL && m->pindex < pindex) {
+			vm_page_wakeup(m);
+			m = TAILQ_NEXT(m, listq);
+		}
+	}
+	return (pindex == end);
+}
+
 /*
 *	Routine:	vm_object_coalesce
 *	Function:	Coalesces two objects backing up adjoining
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -207,6 +207,7 @@ void vm_object_set_writeable_dirty (vm_object_t);
 void vm_object_init (void);
 void vm_object_page_clean (vm_object_t, vm_pindex_t, vm_pindex_t, boolean_t);
 void vm_object_page_remove (vm_object_t, vm_pindex_t, vm_pindex_t, boolean_t);
+boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t);
 void vm_object_reference (vm_object_t);
 void vm_object_reference_locked(vm_object_t);
 void vm_object_shadow (vm_object_t *, vm_ooffset_t *, vm_size_t);