Add a deferred free mechanism for freeing swap space that does not require
an exclusive object lock.

Previously, swap space was freed on a best-effort basis when a page with
valid swap was dirtied, invalidating the swap copy.  This was done
inconsistently and required the object lock, which is not always
convenient.

Instead, track whether swap space is present.  The first dirtying of the
page is responsible for deleting the space or setting PGA_SWAP_FREE, which
triggers background scans to free the swap space.
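
A condensed sketch of the resulting hand-off (illustrative only; the
sketch_* wrappers are hypothetical, but vm_page_set_dirty(),
vm_pager_page_unswapped() and the PGA_SWAP_* flags are the interfaces
added in this change):

/*
 * Illustrative sketch (not the literal kernel code) of the deferred
 * swap-free hand-off; the sketch_* names are hypothetical wrappers.
 */
static void
sketch_swap_io_done(vm_page_t m)
{

	/* Pageout I/O completion records that a swap copy now exists. */
	vm_page_aflag_set(m, PGA_SWAP_SPACE);
}

static void
sketch_first_dirty(vm_page_t m)
{

	/*
	 * vm_page_set_dirty() returns the previous dirty bits.  On the
	 * first clean->dirty transition of a page with PGA_SWAP_SPACE it
	 * calls vm_pager_page_unswapped(), which frees the swap block
	 * immediately if the object write lock is held and otherwise
	 * sets PGA_SWAP_FREE for the pageout daemon.
	 */
	(void)vm_page_set_dirty(m);
}

static void
sketch_pageout_visit(vm_page_t m)
{

	/* The page daemon completes deferred frees with the lock held. */
	if ((m->a.flags & PGA_SWAP_FREE) != 0)
		vm_pager_page_unswapped(m);
}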

Simplify the locking in vm_fault_dirty() now that we can reliably identify
the first dirtying of the page.
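
For illustration, a sketch of the simplified tail of vm_fault_dirty()
(mirroring the vm_fault.c hunk below; the sketch_ wrapper name and reduced
argument list are hypothetical): because vm_page_set_dirty() reports
whether the page was previously clean, only the dirtying thread updates
PGA_NOSYNC and the old vm_page_lock() dance is unnecessary.

static void
sketch_fault_dirty_tail(vm_map_entry_t entry, vm_page_t m, bool need_dirty)
{

	/* Only the first clean->dirty transition adjusts PGA_NOSYNC. */
	if (need_dirty && vm_page_set_dirty(m) == 0) {
		if ((entry->eflags & MAP_ENTRY_NOSYNC) != 0)
			vm_page_aflag_set(m, PGA_NOSYNC);
		else
			vm_page_aflag_clear(m, PGA_NOSYNC);
	}
}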

Discussed with:	alc, kib, markj
Differential Revision:	https://reviews.freebsd.org/D22654
Author:	Jeff Roberson
Date:	2019-12-15 03:15:06 +00:00
Commit:	a808177864
Parent:	d966c7615f
Notes:	svn2git 2020-12-20 02:59:44 +00:00
	svn path=/head/; revision=355765

9 changed files with 182 additions and 67 deletions


@@ -1118,10 +1118,7 @@ mdstart_swap(struct md_s *sc, struct bio *bp)
 			}
 			vm_page_valid(m);
-			if (m->dirty != VM_PAGE_BITS_ALL) {
-				vm_page_dirty(m);
-				vm_pager_page_unswapped(m);
-			}
+			vm_page_set_dirty(m);
 		} else if (bp->bio_cmd == BIO_DELETE) {
 			if (len == PAGE_SIZE || vm_page_all_valid(m))
 				rv = VM_PAGER_OK;
@@ -1138,10 +1135,7 @@ mdstart_swap(struct md_s *sc, struct bio *bp)
 			/* Page is valid. */
 			if (len != PAGE_SIZE) {
 				pmap_zero_page_area(m, offs, len);
-				if (m->dirty != VM_PAGE_BITS_ALL) {
-					vm_page_dirty(m);
-					vm_pager_page_unswapped(m);
-				}
+				vm_page_set_dirty(m);
 			} else {
 				vm_pager_page_unswapped(m);
 				vm_page_free(m);


@@ -1505,9 +1505,8 @@ tmpfs_reg_resize(struct vnode *vp, off_t newsize, boolean_t ignerr)
 			}
 			if (m != NULL) {
 				pmap_zero_page_area(m, base, PAGE_SIZE - base);
-				vm_page_dirty(m);
+				vm_page_set_dirty(m);
 				vm_page_xunbusy(m);
-				vm_pager_page_unswapped(m);
 			}
 		}


@@ -198,7 +198,7 @@ uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
 	 * type object.
 	 */
 	rv = vm_page_grab_valid(&m, obj, idx,
-	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_NOBUSY);
+	    VM_ALLOC_NORMAL | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY);
 	if (rv != VM_PAGER_OK) {
 		VM_OBJECT_WUNLOCK(obj);
 		printf("uiomove_object: vm_obj %p idx %jd pager error %d\n",
@@ -207,13 +207,10 @@ uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
 	}
 	VM_OBJECT_WUNLOCK(obj);
 	error = uiomove_fromphys(&m, offset, tlen, uio);
-	if (uio->uio_rw == UIO_WRITE && error == 0) {
-		VM_OBJECT_WLOCK(obj);
-		vm_page_dirty(m);
-		vm_pager_page_unswapped(m);
-		VM_OBJECT_WUNLOCK(obj);
-	}
-	vm_page_unwire(m, PQ_ACTIVE);
+	if (uio->uio_rw == UIO_WRITE && error == 0)
+		vm_page_set_dirty(m);
+	vm_page_aflag_set(m, PGA_REFERENCED);
+	vm_page_sunbusy(m);
 	return (error);
 }
@@ -527,9 +524,8 @@ shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie)
 				pmap_zero_page_area(m, base, PAGE_SIZE - base);
 				KASSERT(vm_page_all_valid(m),
 				    ("shm_dotruncate: page %p is invalid", m));
-				vm_page_dirty(m);
+				vm_page_set_dirty(m);
 				vm_page_xunbusy(m);
-				vm_pager_page_unswapped(m);
 			}
 		}
 		delta = IDX_TO_OFF(object->size - nobjsize);


@@ -155,6 +155,9 @@ static struct sx swdev_syscall_lock;	/* serialize swap(on|off) */
 static u_long swap_reserved;
 static u_long swap_total;
 static int sysctl_page_shift(SYSCTL_HANDLER_ARGS);
+
+static SYSCTL_NODE(_vm_stats, OID_AUTO, swap, CTLFLAG_RD, 0, "VM swap stats");
+
 SYSCTL_PROC(_vm, OID_AUTO, swap_reserved, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
     &swap_reserved, 0, sysctl_page_shift, "A",
     "Amount of swap storage needed to back all allocated anonymous memory.");
@@ -173,6 +176,16 @@ static unsigned long swap_maxpages;
 SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0,
     "Maximum amount of swap supported");
 
+static counter_u64_t swap_free_deferred;
+SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_deferred,
+    CTLFLAG_RD, &swap_free_deferred,
+    "Number of pages that deferred freeing swap space");
+
+static counter_u64_t swap_free_completed;
+SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_completed,
+    CTLFLAG_RD, &swap_free_completed,
+    "Number of deferred frees completed");
+
 /* bits from overcommit */
 #define	SWAP_RESERVE_FORCE_ON		(1 << 0)
 #define	SWAP_RESERVE_RLIMIT_ON		(1 << 1)
@@ -513,6 +526,15 @@ swap_pager_init(void)
 	sx_init(&swdev_syscall_lock, "swsysc");
 }
 
+static void
+swap_pager_counters(void)
+{
+
+	swap_free_deferred = counter_u64_alloc(M_WAITOK);
+	swap_free_completed = counter_u64_alloc(M_WAITOK);
+}
+SYSINIT(swap_counters, SI_SUB_CPU, SI_ORDER_ANY, swap_pager_counters, NULL);
+
 /*
  * SWAP_PAGER_SWAP_INIT() -	swap pager initialization from pageout process
  *
@@ -1112,14 +1134,37 @@ swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
  *
  *	This routine may not sleep.
  *
- *	The object containing the page must be locked.
+ *	The object containing the page may be locked.
  */
 static void
 swap_pager_unswapped(vm_page_t m)
 {
 	struct swblk *sb;
+	vm_object_t obj;
 
-	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	/*
+	 * Handle enqueing deferred frees first.  If we do not have the
+	 * object lock we wait for the page daemon to clear the space.
+	 */
+	obj = m->object;
+	if (!VM_OBJECT_WOWNED(obj)) {
+		VM_PAGE_OBJECT_BUSY_ASSERT(m);
+		/*
+		 * The caller is responsible for synchronization but we
+		 * will harmlessly handle races.  This is typically provided
+		 * by only calling unswapped() when a page transitions from
+		 * clean to dirty.
+		 */
+		if ((m->a.flags & (PGA_SWAP_SPACE | PGA_SWAP_FREE)) ==
+		    PGA_SWAP_SPACE) {
+			vm_page_aflag_set(m, PGA_SWAP_FREE);
+			counter_u64_add(swap_free_deferred, 1);
+		}
+		return;
+	}
+	if ((m->a.flags & PGA_SWAP_FREE) != 0)
+		counter_u64_add(swap_free_completed, 1);
+	vm_page_aflag_clear(m, PGA_SWAP_FREE | PGA_SWAP_SPACE);
 
 	/*
 	 * The meta data only exists if the object is OBJT_SWAP
@@ -1436,6 +1481,7 @@ swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count,
 		VM_OBJECT_WLOCK(object);
 		for (j = 0; j < n; ++j) {
 			mreq = ma[i + j];
+			vm_page_aflag_clear(mreq, PGA_SWAP_FREE);
 			addr = swp_pager_meta_build(mreq->object, mreq->pindex,
 			    blk + j);
 			if (addr != SWAPBLK_NONE)
@@ -1560,6 +1606,9 @@ swp_pager_async_iodone(struct buf *bp)
 			wakeup(&object->handle);
 		}
 
+		/* We always have space after I/O, successful or not. */
+		vm_page_aflag_set(m, PGA_SWAP_SPACE);
+
 		if (bp->b_ioflags & BIO_ERROR) {
 			/*
 			 * If an error occurs I'd love to throw the swapblk
@@ -1581,6 +1630,7 @@ swp_pager_async_iodone(struct buf *bp)
 			 * then finish the I/O.
 			 */
 			MPASS(m->dirty == VM_PAGE_BITS_ALL);
+			/* PQ_UNSWAPPABLE? */
 			vm_page_lock(m);
 			vm_page_activate(m);
 			vm_page_unlock(m);


@@ -214,7 +214,7 @@ unlock_and_deallocate(struct faultstate *fs)
 static void
 vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot,
-    vm_prot_t fault_type, int fault_flags, bool excl)
+    vm_prot_t fault_type, int fault_flags)
 {
 	bool need_dirty;
@@ -223,7 +223,6 @@ vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot,
 	    (m->oflags & VPO_UNMANAGED) != 0)
 		return;
 
-	VM_OBJECT_ASSERT_LOCKED(m->object);
 	VM_PAGE_OBJECT_BUSY_ASSERT(m);
 
 	need_dirty = ((fault_type & VM_PROT_WRITE) != 0 &&
@@ -232,49 +231,29 @@ vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot,
 	vm_object_set_writeable_dirty(m->object);
 
-	if (!excl)
-		/*
-		 * If two callers of vm_fault_dirty() with excl ==
-		 * FALSE, one for the map entry with MAP_ENTRY_NOSYNC
-		 * flag set, other with flag clear, race, it is
-		 * possible for the no-NOSYNC thread to see m->dirty
-		 * != 0 and not clear PGA_NOSYNC.  Take vm_page lock
-		 * around manipulation of PGA_NOSYNC and
-		 * vm_page_dirty() call to avoid the race.
-		 */
-		vm_page_lock(m);
-
-	/*
-	 * If this is a NOSYNC mmap we do not want to set PGA_NOSYNC
-	 * if the page is already dirty to prevent data written with
-	 * the expectation of being synced from not being synced.
-	 * Likewise if this entry does not request NOSYNC then make
-	 * sure the page isn't marked NOSYNC.  Applications sharing
-	 * data should use the same flags to avoid ping ponging.
-	 */
-	if ((entry->eflags & MAP_ENTRY_NOSYNC) != 0) {
-		if (m->dirty == 0) {
-			vm_page_aflag_set(m, PGA_NOSYNC);
-		}
-	} else {
-		vm_page_aflag_clear(m, PGA_NOSYNC);
-	}
-
 	/*
 	 * If the fault is a write, we know that this page is being
 	 * written NOW so dirty it explicitly to save on
 	 * pmap_is_modified() calls later.
 	 *
 	 * Also, since the page is now dirty, we can possibly tell
-	 * the pager to release any swap backing the page.  Calling
-	 * the pager requires a write lock on the object.
+	 * the pager to release any swap backing the page.
 	 */
-	if (need_dirty)
-		vm_page_dirty(m);
-	if (!excl)
-		vm_page_unlock(m);
-	else if (need_dirty)
-		vm_pager_page_unswapped(m);
+	if (need_dirty && vm_page_set_dirty(m) == 0) {
+		/*
+		 * If this is a NOSYNC mmap we do not want to set PGA_NOSYNC
+		 * if the page is already dirty to prevent data written with
+		 * the expectation of being synced from not being synced.
+		 * Likewise if this entry does not request NOSYNC then make
+		 * sure the page isn't marked NOSYNC.  Applications sharing
+		 * data should use the same flags to avoid ping ponging.
+		 */
+		if ((entry->eflags & MAP_ENTRY_NOSYNC) != 0)
+			vm_page_aflag_set(m, PGA_NOSYNC);
+		else
+			vm_page_aflag_clear(m, PGA_NOSYNC);
+	}
 }
 
 /*
@@ -344,7 +323,7 @@ vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot,
 		*m_hold = m;
 		vm_page_wire(m);
 	}
-	vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, false);
+	vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags);
 	if (psind == 0 && !wired)
 		vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true);
 	VM_OBJECT_RUNLOCK(fs->first_object);
@@ -502,7 +481,7 @@ vm_fault_populate(struct faultstate *fs, vm_prot_t prot, int fault_type,
 	for (i = 0; i < npages; i++) {
 		vm_fault_populate_check_page(&m[i]);
 		vm_fault_dirty(fs->entry, &m[i], prot, fault_type,
-		    fault_flags, true);
+		    fault_flags);
 	}
 	VM_OBJECT_WUNLOCK(fs->first_object);
 	rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type |
@@ -1381,7 +1360,7 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
 		fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE;
 
 	vm_page_assert_xbusied(fs.m);
-	vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, true);
+	vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags);
 
 	/*
 	 * Page must be completely valid or it is not fit to


@@ -1584,6 +1584,10 @@ vm_page_object_remove(vm_page_t m)
 	KASSERT((m->ref_count & VPRC_OBJREF) != 0,
 	    ("page %p is missing its object ref", m));
 
+	/* Deferred free of swap space. */
+	if ((m->a.flags & PGA_SWAP_FREE) != 0)
+		vm_pager_page_unswapped(m);
+
 	mrem = vm_radix_remove(&object->rtree, m->pindex);
 	KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m));
@@ -4633,6 +4637,62 @@ vm_page_bits_clear(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t clear)
 #endif		/* PAGE_SIZE */
 }
 
+static inline vm_page_bits_t
+vm_page_bits_swap(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t newbits)
+{
+#if PAGE_SIZE == 32768
+	uint64_t old;
+
+	old = *bits;
+	while (atomic_fcmpset_64(bits, &old, newbits) == 0);
+	return (old);
+#elif PAGE_SIZE == 16384
+	uint32_t old;
+
+	old = *bits;
+	while (atomic_fcmpset_32(bits, &old, newbits) == 0);
+	return (old);
+#elif (PAGE_SIZE == 8192) && defined(atomic_fcmpset_16)
+	uint16_t old;
+
+	old = *bits;
+	while (atomic_fcmpset_16(bits, &old, newbits) == 0);
+	return (old);
+#elif (PAGE_SIZE == 4096) && defined(atomic_fcmpset_8)
+	uint8_t old;
+
+	old = *bits;
+	while (atomic_fcmpset_8(bits, &old, newbits) == 0);
+	return (old);
+#else		/* PAGE_SIZE <= 4096 */
+	uintptr_t addr;
+	uint32_t old, new, mask;
+	int shift;
+
+	addr = (uintptr_t)bits;
+	/*
+	 * Use a trick to perform a 32-bit atomic on the
+	 * containing aligned word, to not depend on the existence
+	 * of atomic_{set, swap, clear}_{8, 16}.
+	 */
+	shift = addr & (sizeof(uint32_t) - 1);
+#if BYTE_ORDER == BIG_ENDIAN
+	shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
+#else
+	shift *= NBBY;
+#endif
+	addr &= ~(sizeof(uint32_t) - 1);
+	mask = VM_PAGE_BITS_ALL << shift;
+	old = *bits;
+	do {
+		new = old & ~mask;
+		new |= newbits << shift;
+	} while (atomic_fcmpset_32((uint32_t *)addr, &old, new) == 0);
+	return (old >> shift);
+#endif		/* PAGE_SIZE */
+}
+
 /*
  *	vm_page_set_valid_range:
  *
@@ -4690,6 +4750,28 @@ vm_page_set_valid_range(vm_page_t m, int base, int size)
 	vm_page_bits_set(m, &m->valid, pagebits);
 }
 
+/*
+ * Set the page dirty bits and free the invalid swap space if
+ * present.  Returns the previous dirty bits.
+ */
+vm_page_bits_t
+vm_page_set_dirty(vm_page_t m)
+{
+	vm_page_bits_t old;
+
+	VM_PAGE_OBJECT_BUSY_ASSERT(m);
+	if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) {
+		old = m->dirty;
+		m->dirty = VM_PAGE_BITS_ALL;
+	} else
+		old = vm_page_bits_swap(m, &m->dirty, VM_PAGE_BITS_ALL);
+	if (old == 0 && (m->a.flags & PGA_SWAP_SPACE) != 0)
+		vm_pager_page_unswapped(m);
+
+	return (old);
+}
+
 /*
  * Clear the given bits from the specified page's dirty field.
  */


@@ -429,6 +429,10 @@ extern struct mtx_padalign pa_lock[];
  * PGA_REQUEUE_HEAD is a special flag for enqueuing pages near the head of
  * the inactive queue, thus bypassing LRU.  The page lock must be held to
  * set this flag, and the queue lock for the page must be held to clear it.
+ *
+ * PGA_SWAP_FREE is used to defer freeing swap space to the pageout daemon
+ * when the context that dirties the page does not have the object write lock
+ * held.
  */
 #define	PGA_WRITEABLE	0x0001		/* page may be mapped writeable */
 #define	PGA_REFERENCED	0x0002		/* page has been referenced */
@@ -438,6 +442,8 @@ extern struct mtx_padalign pa_lock[];
 #define	PGA_REQUEUE	0x0020		/* page is due to be requeued */
 #define	PGA_REQUEUE_HEAD 0x0040		/* page requeue should bypass LRU */
 #define	PGA_NOSYNC	0x0080		/* do not collect for syncer */
+#define	PGA_SWAP_FREE	0x0100		/* page with swap space was dirtied */
+#define	PGA_SWAP_SPACE	0x0200		/* page has allocated swap space */
 
 #define	PGA_QUEUE_OP_MASK	(PGA_DEQUEUE | PGA_REQUEUE | PGA_REQUEUE_HEAD)
 #define	PGA_QUEUE_STATE_MASK	(PGA_ENQUEUED | PGA_QUEUE_OP_MASK)
@@ -647,6 +653,7 @@ void vm_page_requeue(vm_page_t m);
 int vm_page_sbusied(vm_page_t m);
 vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start,
     vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options);
+vm_page_bits_t vm_page_set_dirty(vm_page_t m);
 void vm_page_set_valid_range(vm_page_t m, int base, int size);
 int vm_page_sleep_if_busy(vm_page_t m, const char *msg);
 int vm_page_sleep_if_xbusy(vm_page_t m, const char *msg);


@@ -1307,6 +1307,14 @@ vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage)
 			act_delta++;
 		}
 
+		/* Deferred free of swap space. */
+		if ((m->a.flags & PGA_SWAP_FREE) != 0 &&
+		    VM_OBJECT_TRYWLOCK(object)) {
+			if (m->object == object)
+				vm_pager_page_unswapped(m);
+			VM_OBJECT_WUNLOCK(object);
+		}
+
 		/*
 		 * Advance or decay the act_count based on recent usage.
 		 */
@@ -1542,6 +1550,10 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage,
 			goto reinsert;
 		}
 
+		/* Deferred free of swap space. */
+		if ((m->a.flags & PGA_SWAP_FREE) != 0)
+			vm_pager_page_unswapped(m);
+
 		/*
 		 * Re-check for wirings now that we hold the object lock and
 		 * have verified that the page is unbusied.  If the page is


@@ -179,9 +179,6 @@ vm_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type,
  *
  *	Destroy swap associated with the page.
  *
- *	The object containing the page must be locked.
- *	This function may not block.
- *
  *	XXX: A much better name would be "vm_pager_page_dirtied()"
  *	XXX: It is not obvious if this could be profitably used by any
  *	XXX: pagers besides the swap_pager or if it should even be a
@@ -191,7 +188,6 @@ static __inline void
 vm_pager_page_unswapped(vm_page_t m)
 {
 
-	VM_OBJECT_ASSERT_LOCKED(m->object);
 	if (pagertab[m->object->type]->pgo_pageunswapped)
 		(*pagertab[m->object->type]->pgo_pageunswapped)(m);
 }