Interim fix for pmap problems I introduced with my last commit.

Remove the code to dynamically change the pv_entry limits.  Go back
to a single fixed kva reservation for pv entries, as was done before
when using the uma zone.  Also go back to never freeing pages back
to the free pool once they are no longer used, just as before.
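
In outline, the scheme reserves up front all the kva that the pv
chunks could ever need.  Condensed from the pmap_init() hunk below
(the tunable fetching and PAE bits are omitted), it looks roughly
like this:

    /* One kva block, reserved once at init time; never grown or freed. */
    pv_entry_max = roundup(pv_entry_max, _NPCPV);
    pv_maxchunks = pv_entry_max / _NPCPV;   /* most chunk slots we will ever use */
    pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
        PAGE_SIZE * pv_maxchunks);
    if (pv_chunkbase == NULL)
        panic("pmap_init: not enough kvm for pv chunks");
    pv_nextindex = 0;                       /* index of the next unused slot */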

This stops the lock order reversal due to acquiring the kernel map
lock while the pmap was locked.

This also fixes the recursive panic seen when invariants are enabled.

The problem was that allocating/freeing kva causes vm_map_entry
nodes to be allocated/freed.  That can recurse back into pmap as
the new pages are hooked up to kvm, and that is the source of all
the trouble: allocating/freeing kva indirectly allocates/frees
memory.

So, by going back to a single fixed-size kva block and an index,
we avoid both the recursion panics and the LOR.
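
With the block reserved up front, handing out a new chunk never has
to touch the kernel map: get_pv_entry() just takes the next unused
slot out of the block and wires a fresh page into it.  Roughly, and
condensed from the hunk below (the freelist reuse and the reclaim
fallbacks are omitted):

    /* Carve the next chunk slot out of the preallocated kva block
     * instead of calling kmem_alloc_nofault() under the pmap lock. */
    m = vm_page_alloc(NULL, colour, VM_ALLOC_SYSTEM |
        VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
    pc = pv_chunkbase + pv_nextindex;       /* pointer arithmetic is scaled */
    pv_nextindex++;
    pmap_qenter((vm_offset_t)pc, &m, 1);    /* no kernel_map lock taken */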

The problem is that now, with a linear block of kva, we have no
mechanism to track holes once pages are freed.  UMA has the same
problem when using a custom object for a zone and a fixed reservation
of kva.  Simple solutions like a bitmap would work, but would be
very inefficient when there are hundreds of thousands of bits in
the map.  A first-free pointer is similarly flawed because pages
can be freed at random and the pointer would keep rewinding by huge
amounts.  If we could allocate memory for tree structures or an
external freelist, that would work.  Except we cannot allocate/free
memory here, because we cannot allocate/free address space to put
it in.  Anyway, my change here reverts to the UMA behavior of not
freeing pages for now, thereby avoiding holes in the map.
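
Concretely, a chunk that becomes empty is no longer unmapped and
handed back with kmem_free(); it stays mapped and is pushed onto a
freelist that reuses the chunk's own pc_list linkage, so remembering
it costs no extra memory.  Condensed from the free_pv_entry() hunk
below:

    /* Instead of pmap_qremove() + vm_page_free() + kmem_free(),
     * park the empty chunk on the freelist.  The TAILQ linkage lives
     * inside the chunk itself, so no external allocation is needed. */
    TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
    TAILQ_INSERT_HEAD(&pv_freechunks, pc, pc_list);

get_pv_entry() then checks TAILQ_FIRST(&pv_freechunks) before carving
a new slot out of pv_chunkbase, so parked chunks are reused first.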

ups@ had a truly evil idea that I'll investigate.  It should allow
freeing unused pages again by giving us a no-cost way to track the
holes in the kva block.  But in the meantime, this should get people
booting with witness and/or invariants again.

Footnote: amd64 doesn't have this problem because of its direct map
access method, and that is where I'd done all my witness/invariants
testing.  I'd never considered that the harmless-looking
kmem_alloc/kmem_free calls would cause such a problem, and it didn't
show up in the boot test.
Peter Wemm 2006-04-28 19:05:08 +00:00
parent a31f91a061
commit 7eeda22793

@@ -216,6 +216,11 @@ static uma_zone_t pdptzone;
static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
static int shpgperproc = PMAP_SHPGPERPROC;
TAILQ_HEAD(,pv_chunk) pv_freechunks; /* Freelist of chunk pages */
struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */
int pv_maxchunks; /* How many chunks we have KVA for */
int pv_nextindex; /* Where to map the next page */
/*
* All those kernel PT submaps that BSD is so fond of
*/
@@ -491,6 +496,7 @@ void
pmap_init(void)
{
TAILQ_INIT(&pv_freechunks);
/*
* Initialize the address space (zone) for the pv entries. Set a
* high water mark so that the system can recover from excessive
@@ -499,8 +505,15 @@ pmap_init(void)
TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
pv_entry_max = roundup(pv_entry_max, _NPCPV);
pv_entry_high_water = 9 * (pv_entry_max / 10);
pv_maxchunks = pv_entry_max / _NPCPV;
pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
PAGE_SIZE * pv_maxchunks);
if (pv_chunkbase == NULL)
panic("pmap_init: not enough kvm for pv chunks");
pv_nextindex = 0;
#ifdef PAE
pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
@@ -511,35 +524,10 @@ pmap_init(void)
SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
static int
pmap_pventry_proc(SYSCTL_HANDLER_ARGS)
{
int error;
error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
if (error == 0 && req->newptr) {
shpgperproc = (pv_entry_max - cnt.v_page_count) / maxproc;
pv_entry_high_water = 9 * (pv_entry_max / 10);
}
return (error);
}
SYSCTL_PROC(_vm_pmap, OID_AUTO, pv_entry_max, CTLTYPE_INT|CTLFLAG_RW,
&pv_entry_max, 0, pmap_pventry_proc, "IU", "Max number of PV entries");
static int
pmap_shpgperproc_proc(SYSCTL_HANDLER_ARGS)
{
int error;
error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
if (error == 0 && req->newptr) {
pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
pv_entry_high_water = 9 * (pv_entry_max / 10);
}
return (error);
}
SYSCTL_PROC(_vm_pmap, OID_AUTO, shpgperproc, CTLTYPE_INT|CTLFLAG_RW,
&shpgperproc, 0, pmap_shpgperproc_proc, "IU", "Page share factor per proc");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
"Max number of PV entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
"Page share factor per proc");
/***************************************************
* Low level helper routines.....
@@ -1488,6 +1476,7 @@ SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
static int pc_chunk_spare;
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
"Current number of pv entry chunks");
@@ -1497,6 +1486,8 @@ SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
"Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
"Number of times tried to get a chunk page but failed.");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_spare, CTLFLAG_RD, &pc_chunk_spare, 0,
"Current number of spare pv entry chunks allocated");
static long pv_entry_frees, pv_entry_allocs;
static int pv_entry_spare;
@@ -1577,7 +1568,6 @@ pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
vm_page_t m;
struct pv_chunk *pc;
int idx, field, bit;
@@ -1598,15 +1588,10 @@ free_pv_entry(pmap_t pmap, pv_entry_t pv)
PV_STAT(pv_entry_spare -= _NPCPV);
PV_STAT(pc_chunk_count--);
PV_STAT(pc_chunk_frees++);
/* entire chunk is free, return it */
/* entire chunk is free, return it to freelist */
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
pmap_qremove((vm_offset_t)pc, 1);
vm_page_lock_queues();
vm_page_unwire(m, 0);
vm_page_free(m);
vm_page_unlock_queues();
kmem_free(kernel_map, (vm_offset_t)pc, PAGE_SIZE);
TAILQ_INSERT_HEAD(&pv_freechunks, pc, pc_list);
PV_STAT(pc_chunk_spare++);
}
/*
@@ -1653,50 +1638,52 @@ get_pv_entry(pmap_t pmap, int try)
return (pv);
}
}
/* No free items, allocate another chunk */
pc = (struct pv_chunk *)kmem_alloc_nofault(kernel_map, PAGE_SIZE);
if (pc == NULL) {
if (try) {
pv_entry_count--;
PV_STAT(pc_chunk_tryfail++);
return (NULL);
}
panic("get_pv_entry: out of kvm for pv entry chunk!");
}
m = vm_page_alloc(NULL, colour, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
if (m == NULL) {
if (try) {
pv_entry_count--;
PV_STAT(pc_chunk_tryfail++);
kmem_free(kernel_map, (vm_offset_t)pc, PAGE_SIZE);
return (NULL);
}
/*
* Reclaim pv entries: At first, destroy mappings to inactive
* pages. After that, if a pv chunk entry is still needed,
* destroy mappings to active pages.
*/
if (ratecheck(&lastprint, &printinterval))
printf("Approaching the limit on PV entries, consider"
"increasing sysctl vm.pmap.shpgperproc or "
"vm.pmap.pv_entry_max\n");
PV_STAT(pmap_collect_inactive++);
pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]);
m = vm_page_alloc(NULL, colour,
VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ);
/* See if we have a preallocated chunk */
pc = TAILQ_FIRST(&pv_freechunks);
if (pc) {
/* Take a preallocated one from the freelist */
TAILQ_REMOVE(&pv_freechunks, pc, pc_list);
PV_STAT(pc_chunk_spare--);
} else {
/* No free items, allocate another chunk */
m = vm_page_alloc(NULL, colour, VM_ALLOC_SYSTEM |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
if (m == NULL) {
PV_STAT(pmap_collect_active++);
pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]);
if (try) {
pv_entry_count--;
PV_STAT(pc_chunk_tryfail++);
return (NULL);
}
/*
* Reclaim pv entries: At first, destroy mappings to
* inactive pages. After that, if a pv chunk entry
* is still needed, destroy mappings to active pages.
*/
if (ratecheck(&lastprint, &printinterval))
printf("Approaching the limit on PV entries, "
"consider increasing tunables "
"vm.pmap.shpgperproc or "
"vm.pmap.pv_entry_max\n");
PV_STAT(pmap_collect_inactive++);
pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]);
m = vm_page_alloc(NULL, colour,
VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ);
if (m == NULL)
panic("get_pv_entry: increase vm.pmap.shpgperproc");
if (m == NULL) {
PV_STAT(pmap_collect_active++);
pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]);
m = vm_page_alloc(NULL, colour,
VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ);
if (m == NULL)
panic("get_pv_entry: increase vm.pmap.shpgperproc");
}
}
colour++;
pc = pv_chunkbase + pv_nextindex; /* Scaled */
pv_nextindex++;
pmap_qenter((vm_offset_t)pc, &m, 1);
}
PV_STAT(pc_chunk_count++);
PV_STAT(pc_chunk_allocs++);
colour++;
pmap_qenter((vm_offset_t)pc, &m, 1);
pc->pc_pmap = pmap;
pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */
for (field = 1; field < _NPCM; field++)
@@ -2843,13 +2830,9 @@ pmap_remove_pages(pmap_t pmap)
PV_STAT(pc_chunk_count--);
PV_STAT(pc_chunk_frees++);
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
pmap_qremove((vm_offset_t)pc, 1);
vm_page_lock_queues();
vm_page_unwire(m, 0);
vm_page_free(m);
vm_page_unlock_queues();
kmem_free(kernel_map, (vm_offset_t)pc, PAGE_SIZE);
/* Return to freelist */
TAILQ_INSERT_HEAD(&pv_freechunks, pc, pc_list);
PV_STAT(pc_chunk_spare++);
}
}
sched_unpin();