Add a deferred free mechanism for freeing swap space that does not require
an exclusive object lock.

Previously, swap space was freed on a best-effort basis when a page with
valid swap was dirtied, invalidating the swap copy.  This was done
inconsistently and required the object lock, which is not always
convenient.

Instead, track whether swap space is present.  The first dirtying of the
page is responsible for deleting the space or setting PGA_SWAP_FREE, which
triggers background scans to free the swap space.
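
A condensed sketch of the resulting hand-off (illustrative only; the
sketch_* wrappers are hypothetical, but vm_page_set_dirty(),
vm_pager_page_unswapped() and the PGA_SWAP_* flags are the interfaces
added in this change):

/*
 * Illustrative sketch (not the literal kernel code) of the deferred
 * swap-free hand-off; the sketch_* names are hypothetical wrappers.
 */
static void
sketch_swap_io_done(vm_page_t m)
{

	/* Pageout I/O completion records that a swap copy now exists. */
	vm_page_aflag_set(m, PGA_SWAP_SPACE);
}

static void
sketch_first_dirty(vm_page_t m)
{

	/*
	 * vm_page_set_dirty() returns the previous dirty bits.  On the
	 * first clean->dirty transition of a page with PGA_SWAP_SPACE it
	 * calls vm_pager_page_unswapped(), which frees the swap block
	 * immediately if the object write lock is held and otherwise
	 * sets PGA_SWAP_FREE for the pageout daemon.
	 */
	(void)vm_page_set_dirty(m);
}

static void
sketch_pageout_visit(vm_page_t m)
{

	/* The page daemon completes deferred frees with the lock held. */
	if ((m->a.flags & PGA_SWAP_FREE) != 0)
		vm_pager_page_unswapped(m);
}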

Simplify the locking in vm_fault_dirty() now that we can reliably identify
the first dirtying of the page.
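
For illustration, a sketch of the simplified tail of vm_fault_dirty()
(mirroring the vm_fault.c hunk below; the sketch_ wrapper name and reduced
argument list are hypothetical): because vm_page_set_dirty() reports
whether the page was previously clean, only the dirtying thread updates
PGA_NOSYNC and the old vm_page_lock() dance is unnecessary.

static void
sketch_fault_dirty_tail(vm_map_entry_t entry, vm_page_t m, bool need_dirty)
{

	/* Only the first clean->dirty transition adjusts PGA_NOSYNC. */
	if (need_dirty && vm_page_set_dirty(m) == 0) {
		if ((entry->eflags & MAP_ENTRY_NOSYNC) != 0)
			vm_page_aflag_set(m, PGA_NOSYNC);
		else
			vm_page_aflag_clear(m, PGA_NOSYNC);
	}
}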

Discussed with:	alc, kib, markj
Differential Revision:	https://reviews.freebsd.org/D22654
Author:	Jeff Roberson
Date:	2019-12-15 03:15:06 +00:00
Commit:	a808177864
Parent:	d966c7615f
Notes:	svn2git 2020-12-20 02:59:44 +00:00
	svn path=/head/; revision=355765

9 changed files with 182 additions and 67 deletions


@@ -1118,10 +1118,7 @@ mdstart_swap(struct md_s *sc, struct bio *bp)
 			}
 			vm_page_valid(m);
-			if (m->dirty != VM_PAGE_BITS_ALL) {
-				vm_page_dirty(m);
-				vm_pager_page_unswapped(m);
-			}
+			vm_page_set_dirty(m);
 		} else if (bp->bio_cmd == BIO_DELETE) {
 			if (len == PAGE_SIZE || vm_page_all_valid(m))
 				rv = VM_PAGER_OK;
@@ -1138,10 +1135,7 @@ mdstart_swap(struct md_s *sc, struct bio *bp)
 			/* Page is valid. */
 			if (len != PAGE_SIZE) {
 				pmap_zero_page_area(m, offs, len);
-				if (m->dirty != VM_PAGE_BITS_ALL) {
-					vm_page_dirty(m);
-					vm_pager_page_unswapped(m);
-				}
+				vm_page_set_dirty(m);
 			} else {
 				vm_pager_page_unswapped(m);
 				vm_page_free(m);


@@ -1505,9 +1505,8 @@ tmpfs_reg_resize(struct vnode *vp, off_t newsize, boolean_t ignerr)
 			}
 			if (m != NULL) {
 				pmap_zero_page_area(m, base, PAGE_SIZE - base);
-				vm_page_dirty(m);
+				vm_page_set_dirty(m);
 				vm_page_xunbusy(m);
-				vm_pager_page_unswapped(m);
 			}
 		}


@@ -198,7 +198,7 @@ uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
 	 * type object.
 	 */
 	rv = vm_page_grab_valid(&m, obj, idx,
-	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_NOBUSY);
+	    VM_ALLOC_NORMAL | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY);
 	if (rv != VM_PAGER_OK) {
 		VM_OBJECT_WUNLOCK(obj);
 		printf("uiomove_object: vm_obj %p idx %jd pager error %d\n",
@@ -207,13 +207,10 @@ uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
 	}
 	VM_OBJECT_WUNLOCK(obj);
 	error = uiomove_fromphys(&m, offset, tlen, uio);
-	if (uio->uio_rw == UIO_WRITE && error == 0) {
-		VM_OBJECT_WLOCK(obj);
-		vm_page_dirty(m);
-		vm_pager_page_unswapped(m);
-		VM_OBJECT_WUNLOCK(obj);
-	}
-	vm_page_unwire(m, PQ_ACTIVE);
+	if (uio->uio_rw == UIO_WRITE && error == 0)
+		vm_page_set_dirty(m);
+	vm_page_aflag_set(m, PGA_REFERENCED);
+	vm_page_sunbusy(m);
 	return (error);
 }
@@ -527,9 +524,8 @@ shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie)
 				pmap_zero_page_area(m, base, PAGE_SIZE - base);
 				KASSERT(vm_page_all_valid(m),
 				    ("shm_dotruncate: page %p is invalid", m));
-				vm_page_dirty(m);
+				vm_page_set_dirty(m);
 				vm_page_xunbusy(m);
-				vm_pager_page_unswapped(m);
 			}
 		}
 		delta = IDX_TO_OFF(object->size - nobjsize);


@@ -155,6 +155,9 @@ static struct sx swdev_syscall_lock;	/* serialize swap(on|off) */
 static u_long swap_reserved;
 static u_long swap_total;
 static int sysctl_page_shift(SYSCTL_HANDLER_ARGS);
+
+static SYSCTL_NODE(_vm_stats, OID_AUTO, swap, CTLFLAG_RD, 0, "VM swap stats");
+
 SYSCTL_PROC(_vm, OID_AUTO, swap_reserved, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
     &swap_reserved, 0, sysctl_page_shift, "A",
     "Amount of swap storage needed to back all allocated anonymous memory.");
@@ -173,6 +176,16 @@ static unsigned long swap_maxpages;
 SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0,
     "Maximum amount of swap supported");
 
+static counter_u64_t swap_free_deferred;
+SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_deferred,
+    CTLFLAG_RD, &swap_free_deferred,
+    "Number of pages that deferred freeing swap space");
+
+static counter_u64_t swap_free_completed;
+SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_completed,
+    CTLFLAG_RD, &swap_free_completed,
+    "Number of deferred frees completed");
+
 /* bits from overcommit */
 #define	SWAP_RESERVE_FORCE_ON		(1 << 0)
 #define	SWAP_RESERVE_RLIMIT_ON		(1 << 1)
@@ -513,6 +526,15 @@ swap_pager_init(void)
 	sx_init(&swdev_syscall_lock, "swsysc");
 }
 
+static void
+swap_pager_counters(void)
+{
+
+	swap_free_deferred = counter_u64_alloc(M_WAITOK);
+	swap_free_completed = counter_u64_alloc(M_WAITOK);
+}
+SYSINIT(swap_counters, SI_SUB_CPU, SI_ORDER_ANY, swap_pager_counters, NULL);
+
 /*
  * SWAP_PAGER_SWAP_INIT() -	swap pager initialization from pageout process
  *
@@ -1112,14 +1134,37 @@ swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
  *
  *	This routine may not sleep.
  *
- *	The object containing the page must be locked.
+ *	The object containing the page may be locked.
  */
 static void
 swap_pager_unswapped(vm_page_t m)
 {
 	struct swblk *sb;
+	vm_object_t obj;
 
-	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	/*
+	 * Handle enqueing deferred frees first.  If we do not have the
+	 * object lock we wait for the page daemon to clear the space.
+	 */
+	obj = m->object;
+	if (!VM_OBJECT_WOWNED(obj)) {
+		VM_PAGE_OBJECT_BUSY_ASSERT(m);
+		/*
+		 * The caller is responsible for synchronization but we
+		 * will harmlessly handle races.  This is typically provided
+		 * by only calling unswapped() when a page transitions from
+		 * clean to dirty.
+		 */
+		if ((m->a.flags & (PGA_SWAP_SPACE | PGA_SWAP_FREE)) ==
+		    PGA_SWAP_SPACE) {
+			vm_page_aflag_set(m, PGA_SWAP_FREE);
+			counter_u64_add(swap_free_deferred, 1);
+		}
+		return;
+	}
+	if ((m->a.flags & PGA_SWAP_FREE) != 0)
+		counter_u64_add(swap_free_completed, 1);
+	vm_page_aflag_clear(m, PGA_SWAP_FREE | PGA_SWAP_SPACE);
 
 	/*
 	 * The meta data only exists if the object is OBJT_SWAP
@@ -1436,6 +1481,7 @@ swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count,
 		VM_OBJECT_WLOCK(object);
 		for (j = 0; j < n; ++j) {
 			mreq = ma[i + j];
+			vm_page_aflag_clear(mreq, PGA_SWAP_FREE);
 			addr = swp_pager_meta_build(mreq->object, mreq->pindex,
 			    blk + j);
 			if (addr != SWAPBLK_NONE)
@@ -1560,6 +1606,9 @@ swp_pager_async_iodone(struct buf *bp)
 			wakeup(&object->handle);
 		}
 
+		/* We always have space after I/O, successful or not. */
+		vm_page_aflag_set(m, PGA_SWAP_SPACE);
+
 		if (bp->b_ioflags & BIO_ERROR) {
 			/*
 			 * If an error occurs I'd love to throw the swapblk
@@ -1581,6 +1630,7 @@ swp_pager_async_iodone(struct buf *bp)
 			 * then finish the I/O.
 			 */
 			MPASS(m->dirty == VM_PAGE_BITS_ALL);
+			/* PQ_UNSWAPPABLE? */
 			vm_page_lock(m);
 			vm_page_activate(m);
 			vm_page_unlock(m);


@@ -214,7 +214,7 @@ unlock_and_deallocate(struct faultstate *fs)
 static void
 vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot,
-    vm_prot_t fault_type, int fault_flags, bool excl)
+    vm_prot_t fault_type, int fault_flags)
 {
 	bool need_dirty;
@@ -223,7 +223,6 @@ vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot,
 	    (m->oflags & VPO_UNMANAGED) != 0)
 		return;
 
-	VM_OBJECT_ASSERT_LOCKED(m->object);
 	VM_PAGE_OBJECT_BUSY_ASSERT(m);
 
 	need_dirty = ((fault_type & VM_PROT_WRITE) != 0 &&
@@ -232,49 +231,29 @@ vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot,
 	vm_object_set_writeable_dirty(m->object);
 
-	if (!excl)
-		/*
-		 * If two callers of vm_fault_dirty() with excl ==
-		 * FALSE, one for the map entry with MAP_ENTRY_NOSYNC
-		 * flag set, other with flag clear, race, it is
-		 * possible for the no-NOSYNC thread to see m->dirty
-		 * != 0 and not clear PGA_NOSYNC.  Take vm_page lock
-		 * around manipulation of PGA_NOSYNC and
-		 * vm_page_dirty() call to avoid the race.
-		 */
-		vm_page_lock(m);
-
-	/*
-	 * If this is a NOSYNC mmap we do not want to set PGA_NOSYNC
-	 * if the page is already dirty to prevent data written with
-	 * the expectation of being synced from not being synced.
-	 * Likewise if this entry does not request NOSYNC then make
-	 * sure the page isn't marked NOSYNC.  Applications sharing
-	 * data should use the same flags to avoid ping ponging.
-	 */
-	if ((entry->eflags & MAP_ENTRY_NOSYNC) != 0) {
-		if (m->dirty == 0) {
-			vm_page_aflag_set(m, PGA_NOSYNC);
-		}
-	} else {
-		vm_page_aflag_clear(m, PGA_NOSYNC);
-	}
-
 	/*
 	 * If the fault is a write, we know that this page is being
 	 * written NOW so dirty it explicitly to save on
 	 * pmap_is_modified() calls later.
 	 *
 	 * Also, since the page is now dirty, we can possibly tell
-	 * the pager to release any swap backing the page.  Calling
-	 * the pager requires a write lock on the object.
+	 * the pager to release any swap backing the page.
 	 */
-	if (need_dirty)
-		vm_page_dirty(m);
-	if (!excl)
-		vm_page_unlock(m);
-	else if (need_dirty)
-		vm_pager_page_unswapped(m);
+	if (need_dirty && vm_page_set_dirty(m) == 0) {
+		/*
+		 * If this is a NOSYNC mmap we do not want to set PGA_NOSYNC
+		 * if the page is already dirty to prevent data written with
+		 * the expectation of being synced from not being synced.
+		 * Likewise if this entry does not request NOSYNC then make
+		 * sure the page isn't marked NOSYNC.  Applications sharing
+		 * data should use the same flags to avoid ping ponging.
+		 */
+		if ((entry->eflags & MAP_ENTRY_NOSYNC) != 0)
+			vm_page_aflag_set(m, PGA_NOSYNC);
+		else
+			vm_page_aflag_clear(m, PGA_NOSYNC);
+	}
 }
 
 /*
@@ -344,7 +323,7 @@ vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot,
 		*m_hold = m;
 		vm_page_wire(m);
 	}
-	vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, false);
+	vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags);
 	if (psind == 0 && !wired)
 		vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true);
 	VM_OBJECT_RUNLOCK(fs->first_object);
@@ -502,7 +481,7 @@ vm_fault_populate(struct faultstate *fs, vm_prot_t prot, int fault_type,
 	for (i = 0; i < npages; i++) {
 		vm_fault_populate_check_page(&m[i]);
 		vm_fault_dirty(fs->entry, &m[i], prot, fault_type,
-		    fault_flags, true);
+		    fault_flags);
 	}
 	VM_OBJECT_WUNLOCK(fs->first_object);
 	rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type |
@@ -1381,7 +1360,7 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
 		fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE;
 
 	vm_page_assert_xbusied(fs.m);
-	vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, true);
+	vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags);
 
 	/*
 	 * Page must be completely valid or it is not fit to


@@ -1584,6 +1584,10 @@ vm_page_object_remove(vm_page_t m)
 	KASSERT((m->ref_count & VPRC_OBJREF) != 0,
 	    ("page %p is missing its object ref", m));
 
+	/* Deferred free of swap space. */
+	if ((m->a.flags & PGA_SWAP_FREE) != 0)
+		vm_pager_page_unswapped(m);
+
 	mrem = vm_radix_remove(&object->rtree, m->pindex);
 	KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m));
@@ -4633,6 +4637,62 @@ vm_page_bits_clear(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t clear)
 #endif		/* PAGE_SIZE */
 }
 
+static inline vm_page_bits_t
+vm_page_bits_swap(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t newbits)
+{
+#if PAGE_SIZE == 32768
+	uint64_t old;
+
+	old = *bits;
+	while (atomic_fcmpset_64(bits, &old, newbits) == 0);
+	return (old);
+#elif PAGE_SIZE == 16384
+	uint32_t old;
+
+	old = *bits;
+	while (atomic_fcmpset_32(bits, &old, newbits) == 0);
+	return (old);
+#elif (PAGE_SIZE == 8192) && defined(atomic_fcmpset_16)
+	uint16_t old;
+
+	old = *bits;
+	while (atomic_fcmpset_16(bits, &old, newbits) == 0);
+	return (old);
+#elif (PAGE_SIZE == 4096) && defined(atomic_fcmpset_8)
+	uint8_t old;
+
+	old = *bits;
+	while (atomic_fcmpset_8(bits, &old, newbits) == 0);
+	return (old);
+#else		/* PAGE_SIZE <= 4096 */
+	uintptr_t addr;
+	uint32_t old, new, mask;
+	int shift;
+
+	addr = (uintptr_t)bits;
+	/*
+	 * Use a trick to perform a 32-bit atomic on the
+	 * containing aligned word, to not depend on the existence
+	 * of atomic_{set, swap, clear}_{8, 16}.
+	 */
+	shift = addr & (sizeof(uint32_t) - 1);
+#if BYTE_ORDER == BIG_ENDIAN
+	shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
+#else
+	shift *= NBBY;
+#endif
+	addr &= ~(sizeof(uint32_t) - 1);
+	mask = VM_PAGE_BITS_ALL << shift;
+	old = *bits;
+	do {
+		new = old & ~mask;
+		new |= newbits << shift;
+	} while (atomic_fcmpset_32((uint32_t *)addr, &old, new) == 0);
+	return (old >> shift);
+#endif		/* PAGE_SIZE */
+}
+
 /*
  *	vm_page_set_valid_range:
  *
@@ -4690,6 +4750,28 @@ vm_page_set_valid_range(vm_page_t m, int base, int size)
 	vm_page_bits_set(m, &m->valid, pagebits);
 }
 
+/*
+ * Set the page dirty bits and free the invalid swap space if
+ * present.  Returns the previous dirty bits.
+ */
+vm_page_bits_t
+vm_page_set_dirty(vm_page_t m)
+{
+	vm_page_bits_t old;
+
+	VM_PAGE_OBJECT_BUSY_ASSERT(m);
+	if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) {
+		old = m->dirty;
+		m->dirty = VM_PAGE_BITS_ALL;
+	} else
+		old = vm_page_bits_swap(m, &m->dirty, VM_PAGE_BITS_ALL);
+	if (old == 0 && (m->a.flags & PGA_SWAP_SPACE) != 0)
+		vm_pager_page_unswapped(m);
+
+	return (old);
+}
+
 /*
  * Clear the given bits from the specified page's dirty field.
  */


@@ -429,6 +429,10 @@ extern struct mtx_padalign pa_lock[];
  * PGA_REQUEUE_HEAD is a special flag for enqueuing pages near the head of
  * the inactive queue, thus bypassing LRU.  The page lock must be held to
  * set this flag, and the queue lock for the page must be held to clear it.
+ *
+ * PGA_SWAP_FREE is used to defer freeing swap space to the pageout daemon
+ * when the context that dirties the page does not have the object write lock
+ * held.
  */
 #define	PGA_WRITEABLE	0x0001		/* page may be mapped writeable */
 #define	PGA_REFERENCED	0x0002		/* page has been referenced */
@@ -438,6 +442,8 @@ extern struct mtx_padalign pa_lock[];
 #define	PGA_REQUEUE	0x0020		/* page is due to be requeued */
 #define	PGA_REQUEUE_HEAD 0x0040		/* page requeue should bypass LRU */
 #define	PGA_NOSYNC	0x0080		/* do not collect for syncer */
+#define	PGA_SWAP_FREE	0x0100		/* page with swap space was dirtied */
+#define	PGA_SWAP_SPACE	0x0200		/* page has allocated swap space */
 
 #define	PGA_QUEUE_OP_MASK	(PGA_DEQUEUE | PGA_REQUEUE | PGA_REQUEUE_HEAD)
 #define	PGA_QUEUE_STATE_MASK	(PGA_ENQUEUED | PGA_QUEUE_OP_MASK)
@@ -647,6 +653,7 @@ void vm_page_requeue(vm_page_t m);
 int vm_page_sbusied(vm_page_t m);
 vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start,
     vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options);
+vm_page_bits_t vm_page_set_dirty(vm_page_t m);
 void vm_page_set_valid_range(vm_page_t m, int base, int size);
 int vm_page_sleep_if_busy(vm_page_t m, const char *msg);
 int vm_page_sleep_if_xbusy(vm_page_t m, const char *msg);


@@ -1307,6 +1307,14 @@ vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage)
 			act_delta++;
 		}
 
+		/* Deferred free of swap space. */
+		if ((m->a.flags & PGA_SWAP_FREE) != 0 &&
+		    VM_OBJECT_TRYWLOCK(object)) {
+			if (m->object == object)
+				vm_pager_page_unswapped(m);
+			VM_OBJECT_WUNLOCK(object);
+		}
+
 		/*
 		 * Advance or decay the act_count based on recent usage.
 		 */
@@ -1542,6 +1550,10 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage,
 			goto reinsert;
 		}
 
+		/* Deferred free of swap space. */
+		if ((m->a.flags & PGA_SWAP_FREE) != 0)
+			vm_pager_page_unswapped(m);
+
 		/*
 		 * Re-check for wirings now that we hold the object lock and
 		 * have verified that the page is unbusied.  If the page is


@@ -179,9 +179,6 @@ vm_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type,
  *
  *	Destroy swap associated with the page.
  *
- *	The object containing the page must be locked.
- *	This function may not block.
- *
  *	XXX: A much better name would be "vm_pager_page_dirtied()"
  *	XXX: It is not obvious if this could be profitably used by any
  *	XXX: pagers besides the swap_pager or if it should even be a
@@ -191,7 +188,6 @@ static __inline void
 vm_pager_page_unswapped(vm_page_t m)
 {
 
-	VM_OBJECT_ASSERT_LOCKED(m->object);
 	if (pagertab[m->object->type]->pgo_pageunswapped)
 		(*pagertab[m->object->type]->pgo_pageunswapped)(m);
 }