diff --git a/share/man/man9/vm_page_wire.9 b/share/man/man9/vm_page_wire.9 index 940cf9f6ceec..e3f03ab6b1f8 100644 --- a/share/man/man9/vm_page_wire.9 +++ b/share/man/man9/vm_page_wire.9 @@ -51,7 +51,7 @@ The .Fn vm_page_wire and .Fn vm_page_wire_mapped -functions wire the page, which prevents it from being reclaimed by the page +functions wire the page, preventing it from being reclaimed by the page daemon or when its containing object is destroyed. Both functions require that the page belong to an object. The diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index c4e4762fe751..f2ebee2ca550 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -3064,8 +3064,10 @@ pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde, *pdep; pt_entry_t pte, PG_RW, PG_V; + vm_paddr_t pa; vm_page_t m; + pa = 0; m = NULL; PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); @@ -5804,7 +5806,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, ("pmap_enter: no PV entry for %#lx", va)); if ((newpte & PG_MANAGED) == 0) free_pv_entry(pmap, pv); - if ((vm_page_aflags(om) & PGA_WRITEABLE) != 0 && + if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) @@ -6987,7 +6989,7 @@ pmap_remove_pages(pmap_t pmap) pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) - if ((vm_page_aflags(mt) & PGA_WRITEABLE) != 0 && + if ((mt->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } @@ -7005,7 +7007,7 @@ pmap_remove_pages(pmap_t pmap) pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; - if ((vm_page_aflags(m) & PGA_WRITEABLE) != 0 && + if ((m->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); @@ -7136,7 +7138,7 @@ 
pmap_is_modified(vm_page_t m) * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } @@ -7205,7 +7207,7 @@ pmap_remove_write(vm_page_t m) * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : @@ -7688,7 +7690,7 @@ pmap_clear_modify(vm_page_t m) * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index ac8ed1c88063..b0a15a1bab86 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -423,8 +423,7 @@ extern int pmap_pcid_enabled; extern int invpcid_works; #define pmap_page_get_memattr(m) ((vm_memattr_t)(m)->md.pat_mode) -#define pmap_page_is_write_mapped(m) \ - (((m)->astate.flags & PGA_WRITEABLE) != 0) +#define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0) #define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) struct thread; diff --git a/sys/arm/arm/pmap-v4.c b/sys/arm/arm/pmap-v4.c index e746d66f9bf2..e1f411ccc832 100644 --- a/sys/arm/arm/pmap-v4.c +++ b/sys/arm/arm/pmap-v4.c @@ -4104,7 +4104,7 @@ pmap_clear_modify(vm_page_t m) * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 
*/ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; if (m->md.pvh_attrs & PVF_MOD) pmap_clearbit(m, PVF_MOD); @@ -4143,7 +4143,7 @@ pmap_remove_write(vm_page_t m) * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (vm_page_xbusied(m) || (vm_page_aflags(m) & PGA_WRITEABLE) != 0) + if (vm_page_xbusied(m) || (m->aflags & PGA_WRITEABLE) != 0) pmap_clearbit(m, PVF_WRITE); } diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c index 2ad04723a7c7..1d82ebf48cb2 100644 --- a/sys/arm/arm/pmap-v6.c +++ b/sys/arm/arm/pmap-v6.c @@ -5197,7 +5197,7 @@ pmap_is_modified(vm_page_t m) * is clear, no PTE2s can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || @@ -5540,7 +5540,7 @@ pmap_remove_write(vm_page_t m) * if PGA_WRITEABLE is clear, no page table entries need updating. 
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c index c5063828d6a1..919537e86b84 100644 --- a/sys/arm64/arm64/pmap.c +++ b/sys/arm64/arm64/pmap.c @@ -3333,7 +3333,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, pv = pmap_pvh_remove(&om->md, pmap, va); if ((m->oflags & VPO_UNMANAGED) != 0) free_pv_entry(pmap, pv); - if ((vm_page_aflags(om) & PGA_WRITEABLE) != 0 && + if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) @@ -4372,7 +4372,7 @@ pmap_remove_pages(pmap_t pmap) pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) - if (vm_page_aflags(mt) & PGA_WRITEABLE) != 0 && + if ((mt->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } @@ -4394,7 +4394,7 @@ pmap_remove_pages(pmap_t pmap) TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; - if (vm_page_aflags(m) & PGA_WRITEABLE) != 0 && + if ((m->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh( @@ -4534,7 +4534,7 @@ pmap_is_modified(vm_page_t m) * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } @@ -4600,7 +4600,7 @@ pmap_remove_write(vm_page_t m) * if PGA_WRITEABLE is clear, no page table entries need updating. 
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : @@ -4977,7 +4977,7 @@ pmap_clear_modify(vm_page_t m) * set. If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index 9afc7db022b2..59147515097f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -1718,10 +1718,12 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, bcopy((char *)db->db_data + bufoff, va, PAGESIZE); zfs_unmap_page(sf); m->valid = VM_PAGE_BITS_ALL; + vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); + vm_page_unlock(m); } *rbehind = i; @@ -1836,10 +1838,12 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, } zfs_unmap_page(sf); m->valid = VM_PAGE_BITS_ALL; + vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); + vm_page_unlock(m); } *rahead = i; zfs_vmobject_wunlock(vmobj); diff --git a/sys/dev/virtio/balloon/virtio_balloon.c b/sys/dev/virtio/balloon/virtio_balloon.c index 32b9b41b8d94..060d6d68afc7 100644 --- a/sys/dev/virtio/balloon/virtio_balloon.c +++ b/sys/dev/virtio/balloon/virtio_balloon.c @@ -332,6 +332,8 @@ vtballoon_inflate(struct vtballoon_softc *sc, int npages) sc->vtballoon_page_frames[i] = VM_PAGE_TO_PHYS(m) >> VIRTIO_BALLOON_PFN_SHIFT; + KASSERT(m->queue == 
PQ_NONE, + ("%s: allocated page %p on queue", __func__, m)); TAILQ_INSERT_TAIL(&sc->vtballoon_pages, m, plinks.q); } diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 3e2748ad1c88..f07f500e8977 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -3752,7 +3752,7 @@ __CONCAT(PMTYPE, enter)(pmap_t pmap, vm_offset_t va, vm_page_t m, ("pmap_enter: no PV entry for %#x", va)); if ((newpte & PG_MANAGED) == 0) free_pv_entry(pmap, pv); - if ((vm_page_aflags(om) & PGA_WRITEABLE) != 0 && + if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) @@ -4848,7 +4848,7 @@ __CONCAT(PMTYPE, is_modified)(vm_page_t m) * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || @@ -4979,7 +4979,7 @@ __CONCAT(PMTYPE, remove_write)(vm_page_t m) * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); @@ -5291,7 +5291,7 @@ __CONCAT(PMTYPE, clear_modify)(vm_page_t m) * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 
*/ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); diff --git a/sys/mips/mips/pmap.c b/sys/mips/mips/pmap.c index 571fe83397bd..072618f793a0 100644 --- a/sys/mips/mips/pmap.c +++ b/sys/mips/mips/pmap.c @@ -2164,7 +2164,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, pv = pmap_pvh_remove(&om->md, pmap, va); if (!pte_test(&newpte, PTE_MANAGED)) free_pv_entry(pmap, pv); - if (vm_page_aflags(m) & PGA_WRITEABLE) != 0 && + if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list)) vm_page_aflag_clear(om, PGA_WRITEABLE); } @@ -2934,7 +2934,7 @@ pmap_remove_write(vm_page_t m) * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { @@ -2999,7 +2999,7 @@ pmap_is_modified(vm_page_t m) * is clear, no PTEs can have PTE_D set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_testbit(m, PTE_D); @@ -3143,7 +3143,7 @@ pmap_clear_modify(vm_page_t m) * If the object containing the page is locked and the page is not * write busied, then PGA_WRITEABLE cannot be concurrently set. */ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { @@ -3270,7 +3270,7 @@ pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) * determine if the address is MINCORE_REFERENCED. 
*/ m = PHYS_TO_VM_PAGE(pa); - if ((vm_page_aflags(m) & PGA_REFERENCED) != 0) + if ((m->aflags & PGA_REFERENCED) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != diff --git a/sys/powerpc/aim/mmu_oea.c b/sys/powerpc/aim/mmu_oea.c index 5ef269db98cc..9eacac27707b 100644 --- a/sys/powerpc/aim/mmu_oea.c +++ b/sys/powerpc/aim/mmu_oea.c @@ -1319,7 +1319,7 @@ moea_is_modified(mmu_t mmu, vm_page_t m) * is clear, no PTEs can have PTE_CHG set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = moea_query_bit(m, PTE_CHG); @@ -1355,7 +1355,7 @@ moea_clear_modify(mmu_t mmu, vm_page_t m) * set. If the object containing the page is locked and the page is * not exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); moea_clear_bit(m, PTE_CHG); @@ -1382,7 +1382,7 @@ moea_remove_write(mmu_t mmu, vm_page_t m) * if PGA_WRITEABLE is clear, no page table entries need updating. 
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); lo = moea_attr_fetch(m); @@ -1915,8 +1915,7 @@ moea_remove_all(mmu_t mmu, vm_page_t m) moea_pvo_remove(pvo, -1); PMAP_UNLOCK(pmap); } - if ((vm_page_aflags(m) & PGA_WRITEABLE) != 0 && - moea_query_bit(m, PTE_CHG)) { + if ((m->aflags & PGA_WRITEABLE) && moea_query_bit(m, PTE_CHG)) { moea_attr_clear(m, PTE_CHG); vm_page_dirty(m); } diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c index 6361938e0dc9..7ad86d5f1896 100644 --- a/sys/powerpc/aim/mmu_oea64.c +++ b/sys/powerpc/aim/mmu_oea64.c @@ -1467,7 +1467,7 @@ moea64_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, * Flush the page from the instruction cache if this page is * mapped executable and cacheable. */ - if (pmap != kernel_pmap && (vm_page_aflags(m) & PGA_EXECUTABLE) != 0 && + if (pmap != kernel_pmap && !(m->aflags & PGA_EXECUTABLE) && (pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { vm_page_aflag_set(m, PGA_EXECUTABLE); moea64_syncicache(mmu, pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE); @@ -1688,7 +1688,7 @@ moea64_is_modified(mmu_t mmu, vm_page_t m) * is clear, no PTEs can have LPTE_CHG set. */ VM_OBJECT_ASSERT_LOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (moea64_query_bit(mmu, m, LPTE_CHG)); } @@ -1722,7 +1722,7 @@ moea64_clear_modify(mmu_t mmu, vm_page_t m) * set. If the object containing the page is locked and the page is * not exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 
*/ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; moea64_clear_bit(mmu, m, LPTE_CHG); } @@ -1746,7 +1746,7 @@ moea64_remove_write(mmu_t mmu, vm_page_t m) * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; powerpc_sync(); PV_PAGE_LOCK(m); @@ -2240,8 +2240,7 @@ moea64_pvo_protect(mmu_t mmu, pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot) if (refchg < 0) refchg = (oldprot & VM_PROT_WRITE) ? LPTE_CHG : 0; - if (pm != kernel_pmap && pg != NULL && - (vm_page_aflags(pg) & PGA_EXECUTABLE) == 0 && + if (pm != kernel_pmap && pg != NULL && !(pg->aflags & PGA_EXECUTABLE) && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { if ((pg->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(pg, PGA_EXECUTABLE); @@ -2455,8 +2454,7 @@ moea64_remove_all(mmu_t mmu, vm_page_t m) } KASSERT(!pmap_page_is_mapped(m), ("Page still has mappings")); - KASSERT((vm_page_aflags(m) & PGA_WRITEABLE) == 0, - ("Page still writable")); + KASSERT(!(m->aflags & PGA_WRITEABLE), ("Page still writable")); PV_PAGE_UNLOCK(m); /* Clean up UMA allocations */ diff --git a/sys/powerpc/booke/pmap.c b/sys/powerpc/booke/pmap.c index 2374d1a9ad91..140b1367325f 100644 --- a/sys/powerpc/booke/pmap.c +++ b/sys/powerpc/booke/pmap.c @@ -2694,7 +2694,7 @@ mmu_booke_remove_write(mmu_t mmu, vm_page_t m) * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { @@ -3040,7 +3040,7 @@ mmu_booke_is_modified(mmu_t mmu, vm_page_t m) * is clear, no PTEs can be modified. 
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (rv); rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { @@ -3119,7 +3119,7 @@ mmu_booke_clear_modify(mmu_t mmu, vm_page_t m) * If the object containing the page is locked and the page is not * exclusive busied, then PG_AWRITEABLE cannot be concurrently set. */ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c index 2196a6c153af..bbda832ff885 100644 --- a/sys/riscv/riscv/pmap.c +++ b/sys/riscv/riscv/pmap.c @@ -2825,7 +2825,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, ("pmap_enter: no PV entry for %#lx", va)); if ((new_l3 & PTE_SW_MANAGED) == 0) free_pv_entry(pmap, pv); - if ((vm_page_aflags(om) & PGA_WRITEABLE) == 0 && + if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list)) vm_page_aflag_clear(om, PGA_WRITEABLE); } @@ -3556,7 +3556,7 @@ pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[Ln_ENTRIES]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list) && - (vm_page_aflags(mt) & PGA_WRITEABLE) != 0) + (mt->aflags & PGA_WRITEABLE) != 0) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); @@ -3574,7 +3574,7 @@ pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && - (vm_page_aflags(m) & PGA_WRITEABLE) != 0) { + (m->aflags & PGA_WRITEABLE) != 0) { pvh = pa_to_pvh(m->phys_addr); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); @@ -3789,7 +3789,7 @@ pmap_is_modified(vm_page_t m) * is clear, no PTEs can have PG_M set. 
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } @@ -3855,7 +3855,7 @@ pmap_remove_write(vm_page_t m) * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : @@ -4115,7 +4115,7 @@ pmap_clear_modify(vm_page_t m) * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); diff --git a/sys/sparc64/sparc64/pmap.c b/sys/sparc64/sparc64/pmap.c index a038845e359f..436c15623a6e 100644 --- a/sys/sparc64/sparc64/pmap.c +++ b/sys/sparc64/sparc64/pmap.c @@ -2121,7 +2121,7 @@ pmap_is_modified(vm_page_t m) * is clear, no TTEs can have TD_W set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (rv); rw_wlock(&tte_list_global_lock); TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { @@ -2204,7 +2204,7 @@ pmap_clear_modify(vm_page_t m) * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 
*/ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&tte_list_global_lock); TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { @@ -2232,7 +2232,7 @@ pmap_remove_write(vm_page_t m) * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&tte_list_global_lock); TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 249d158ca6d3..4ea49c7aa4a2 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -1648,6 +1648,12 @@ swp_pager_force_dirty(vm_page_t m) { vm_page_dirty(m); +#ifdef INVARIANTS + vm_page_lock(m); + if (!vm_page_wired(m) && m->queue == PQ_NONE) + panic("page %p is neither wired nor queued", m); + vm_page_unlock(m); +#endif vm_page_xunbusy(m); swap_pager_unswapped(m); } diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index ea783fbf53b8..f3557bbde5ac 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -153,7 +153,9 @@ release_page(struct faultstate *fs) { vm_page_xunbusy(fs->m); + vm_page_lock(fs->m); vm_page_deactivate(fs->m); + vm_page_unlock(fs->m); fs->m = NULL; } @@ -374,7 +376,9 @@ vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first, for (pidx = first, m = vm_page_lookup(object, pidx); pidx <= last; pidx++, m = vm_page_next(m)) { vm_fault_populate_check_page(m); + vm_page_lock(m); vm_page_deactivate(m); + vm_page_unlock(m); vm_page_xunbusy(m); } } @@ -1321,7 +1325,9 @@ vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, if ((fault_flags & VM_FAULT_WIRE) != 0) { vm_page_wire(fs.m); } else { + vm_page_lock(fs.m); vm_page_activate(fs.m); + vm_page_unlock(fs.m); } if (m_hold != NULL) { *m_hold = fs.m; diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index 882a77e0de30..23bdde163cec 100644 --- a/sys/vm/vm_mmap.c 
+++ b/sys/vm/vm_mmap.c @@ -935,9 +935,9 @@ kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec) * and set PGA_REFERENCED before the call to * pmap_is_referenced(). */ - if ((vm_page_aflags(m) & PGA_REFERENCED) != 0 || + if ((m->aflags & PGA_REFERENCED) != 0 || pmap_is_referenced(m) || - (vm_page_aflags(m) & PGA_REFERENCED) != 0) + (m->aflags & PGA_REFERENCED) != 0) mincoreinfo |= MINCORE_REFERENCED_OTHER; } if (object != NULL) diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index 8a6ace0dfe27..a2f6cb7c1f22 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -2312,9 +2312,9 @@ sysctl_vm_object_list(SYSCTL_HANDLER_ARGS) * sysctl is only meant to give an * approximation of the system anyway. */ - if (m->astate.queue == PQ_ACTIVE) + if (m->queue == PQ_ACTIVE) kvo->kvo_active++; - else if (m->astate.queue == PQ_INACTIVE) + else if (m->queue == PQ_INACTIVE) kvo->kvo_inactive++; } diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 7343210f6e41..99c3abe1f9e7 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -73,12 +73,11 @@ __FBSDID("$FreeBSD$"); #include #include -#include +#include #include #include #include #include -#include #include #include #include @@ -131,34 +130,6 @@ static int vm_min_waiters; static int vm_severe_waiters; static int vm_pageproc_waiters; -static SYSCTL_NODE(_vm_stats, OID_AUTO, page, CTLFLAG_RD, 0, - "VM page stats"); - -static counter_u64_t pqstate_commit_aborts = EARLY_COUNTER; -SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, commit_aborts, CTLFLAG_RD, - &pqstate_commit_aborts, - "Failed page queue state updates"); - -static counter_u64_t queue_ops = EARLY_COUNTER; -SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_ops, CTLFLAG_RD, - &queue_ops, - "Batched queue operations"); - -static counter_u64_t null_queue_ops = EARLY_COUNTER; -SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, null_queue_ops, CTLFLAG_RD, - &null_queue_ops, - "Batched queue operations with no effect"); - -static void -counter_startup(void) 
-{ - - pqstate_commit_aborts = counter_u64_alloc(M_WAITOK); - queue_ops = counter_u64_alloc(M_WAITOK); - null_queue_ops = counter_u64_alloc(M_WAITOK); -} -SYSINIT(page_counters, SI_SUB_CPU, SI_ORDER_ANY, counter_startup, NULL); - /* * bogus page -- for I/O to/from partially complete buffers, * or for paging into sparsely invalid regions. @@ -187,17 +158,16 @@ static uma_zone_t fakepg_zone; static void vm_page_alloc_check(vm_page_t m); static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); +static void vm_page_dequeue_complete(vm_page_t m); static void vm_page_enqueue(vm_page_t m, uint8_t queue); static void vm_page_init(void *dummy); static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred); -static void vm_page_mvqueue(vm_page_t m, const uint8_t queue, - const uint16_t nflag); +static void vm_page_mvqueue(vm_page_t m, uint8_t queue); static int vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, vm_paddr_t high); -static bool vm_page_release_toq(vm_page_t m, uint8_t queue, bool noreuse); static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req); static int vm_page_zone_import(void *arg, void **store, int cnt, int domain, @@ -470,10 +440,10 @@ vm_page_init_marker(vm_page_t marker, int queue, uint8_t aflags) { bzero(marker, sizeof(*marker)); - marker->busy_lock = VPB_SINGLE_EXCLUSIVER; - marker->astate.flags = aflags; - marker->astate.queue = queue; marker->flags = PG_MARKER; + marker->aflags = aflags; + marker->busy_lock = VPB_SINGLE_EXCLUSIVER; + marker->queue = queue; } static void @@ -543,10 +513,9 @@ vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind) m->object = NULL; m->ref_count = 0; m->busy_lock = VPB_UNBUSIED; - m->flags = 0; + m->flags = m->aflags = 0; m->phys_addr = pa; - m->astate.flags = 0; - m->astate.queue = PQ_NONE; + m->queue = PQ_NONE; 
m->psind = 0; m->segind = segind; m->order = VM_NFREEORDER; @@ -1183,7 +1152,7 @@ vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) goto memattr; } m->phys_addr = paddr; - m->astate.queue = PQ_NONE; + m->queue = PQ_NONE; /* Fictitious pages don't use "segind". */ m->flags = PG_FICTITIOUS; /* Fictitious pages don't use "order" or "pool". */ @@ -1270,10 +1239,12 @@ vm_page_readahead_finish(vm_page_t m) * have shown that deactivating the page is usually the best choice, * unless the page is wanted by another thread. */ + vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); + vm_page_unlock(m); vm_page_xunbusy(m); } @@ -1636,7 +1607,7 @@ vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex) mnew->pindex = pindex; atomic_set_int(&mnew->ref_count, VPRC_OBJREF); mold = vm_radix_replace(&object->rtree, mnew); - KASSERT(mold->astate.queue == PQ_NONE, + KASSERT(mold->queue == PQ_NONE, ("vm_page_replace: old page %p is on a paging queue", mold)); /* Keep the resident page list in sorted order. */ @@ -1912,7 +1883,7 @@ vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain, if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; m->flags = flags; - m->astate.flags = 0; + m->aflags = 0; m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? 
VPO_UNMANAGED : 0; m->busy_lock = VPB_UNBUSIED; @@ -1928,7 +1899,7 @@ vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain, vm_wire_add(1); m->ref_count = 1; } - m->astate.act_count = 0; + m->act_count = 0; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { @@ -2122,12 +2093,12 @@ vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain, memattr = object->memattr; } for (m = m_ret; m < &m_ret[npages]; m++) { - m->astate.flags = 0; + m->aflags = 0; m->flags = (m->flags | PG_NODUMP) & flags; m->busy_lock = busy_lock; if ((req & VM_ALLOC_WIRED) != 0) m->ref_count = 1; - m->astate.act_count = 0; + m->act_count = 0; m->oflags = oflags; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { @@ -2170,10 +2141,9 @@ vm_page_alloc_check(vm_page_t m) { KASSERT(m->object == NULL, ("page %p has object", m)); - KASSERT(m->astate.queue == PQ_NONE && - (m->astate.flags & PGA_QUEUE_STATE_MASK) == 0, + KASSERT(m->queue == PQ_NONE && (m->aflags & PGA_QUEUE_STATE_MASK) == 0, ("page %p has unexpected queue %d, flags %#x", - m, m->astate.queue, (m->astate.flags & PGA_QUEUE_STATE_MASK))); + m, m->queue, (m->aflags & PGA_QUEUE_STATE_MASK))); KASSERT(m->ref_count == 0, ("page %p has references", m)); KASSERT(!vm_page_busied(m), ("page %p is busy", m)); KASSERT(m->dirty == 0, ("page %p is dirty", m)); @@ -2247,7 +2217,7 @@ vm_page_alloc_freelist_domain(int domain, int freelist, int req) /* * Initialize the page. Only the PG_ZERO flag is inherited. 
*/ - m->astate.flags = 0; + m->aflags = 0; flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; @@ -2426,7 +2396,8 @@ vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, vm_reserv_size(level)) - pa); #endif } else if (object->memattr == VM_MEMATTR_DEFAULT && - !vm_page_busied(m) && !vm_page_wired(m)) { + vm_page_queue(m) != PQ_NONE && !vm_page_busied(m) && + !vm_page_wired(m)) { /* * The page is allocated but eligible for * relocation. Extend the current run by one @@ -2574,7 +2545,8 @@ vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, error = EINVAL; else if (object->memattr != VM_MEMATTR_DEFAULT) error = EINVAL; - else if (!vm_page_busied(m) && !vm_page_wired(m)) { + else if (vm_page_queue(m) != PQ_NONE && + !vm_page_busied(m) && !vm_page_wired(m)) { KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); @@ -2635,7 +2607,7 @@ vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, error = EBUSY; goto unlock; } - m_new->astate.flags = m->astate.flags & + m_new->aflags = m->aflags & ~PGA_QUEUE_STATE_MASK; KASSERT(m_new->oflags == VPO_UNMANAGED, ("page %p is managed", m_new)); @@ -3103,141 +3075,65 @@ vm_waitpfault(struct domainset *dset, int timo) mtx_unlock(&vm_domainset_lock); } -bool -vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) +static struct vm_pagequeue * +vm_page_pagequeue(vm_page_t m) { - vm_page_t next; - struct vm_pagequeue *pq; - int mask; - if (old->queue != PQ_NONE && old->queue != new.queue) { - new.flags &= ~PGA_ENQUEUED; + uint8_t queue; - pq = _vm_page_pagequeue(m, old->queue); - - /* - * The physical queue state might change at any point before the - * page queue lock is acquired, so we must verify that the lock - * is correct before proceeding. 
Once the page's queue index is - * changed, the page queue lock we hold will no longer - * synchronize the physical queue state of the page, so we must - * awkwardly remove the page from the queue and put it back if - * the commit fails. - */ - vm_pagequeue_lock(pq); - if (__predict_false(m->astate.queue != old->queue)) { - vm_pagequeue_unlock(pq); - *old = vm_page_astate_load(m); - return (false); - } - if (__predict_true((m->astate.flags & PGA_ENQUEUED) != 0)) { - next = TAILQ_NEXT(m, plinks.q); - TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); - } - if (__predict_false(!vm_page_astate_fcmpset(m, old, new))) { - if ((old->flags & PGA_ENQUEUED) != 0) { - if (next == NULL) - TAILQ_INSERT_TAIL(&pq->pq_pl, m, - plinks.q); - else - TAILQ_INSERT_BEFORE(next, m, plinks.q); - } - vm_pagequeue_unlock(pq); - counter_u64_add(pqstate_commit_aborts, 1); - return (false); - } - if ((old->flags & PGA_ENQUEUED) != 0) - vm_pagequeue_cnt_dec(pq); - vm_pagequeue_unlock(pq); - } else if (__predict_false(!vm_page_astate_fcmpset(m, old, new))) { - counter_u64_add(pqstate_commit_aborts, 1); - return (false); - } - - if (new.queue != PQ_NONE) { - mask = new.flags & PGA_QUEUE_OP_MASK; - if (mask != 0 && (old->flags & mask) != mask) - vm_page_pqbatch_submit(m, new.queue); - } - - return (true); + if ((queue = atomic_load_8(&m->queue)) == PQ_NONE) + return (NULL); + return (&vm_pagequeue_domain(m)->vmd_pagequeues[queue]); } static inline void -vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m, uint8_t queue) +vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m) { - vm_page_t next; struct vm_domain *vmd; - vm_page_astate_t old, new; + uint8_t qflags; CRITICAL_ASSERT(curthread); vm_pagequeue_assert_locked(pq); - old = vm_page_astate_load(m); -retry: - if (__predict_false(old.queue != queue)) - return; - KASSERT(pq == _vm_page_pagequeue(m, queue), - ("page %p does not belong to queue %p", m, pq)); - KASSERT(old.queue != PQ_NONE || (old.flags & PGA_QUEUE_STATE_MASK) == 0, - ("page 
%p has unexpected queue state", m)); - /* - * Update the page's queue state before modifying the page queues - * themselves, to avoid having to roll back updates when a queue state - * update fails and requires a retry. + * The page daemon is allowed to set m->queue = PQ_NONE without + * the page queue lock held. In this case it is about to free the page, + * which must not have any queue state. */ - new = old; - if ((old.flags & PGA_DEQUEUE) != 0) { - new.queue = PQ_NONE; - new.flags &= ~PGA_QUEUE_STATE_MASK; - if (__predict_true((old.flags & PGA_ENQUEUED) != 0)) { - next = TAILQ_NEXT(m, plinks.q); - TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); - } - if (__predict_false(!vm_page_astate_fcmpset(m, &old, new))) { - if ((old.flags & PGA_ENQUEUED) != 0) { - if (next == NULL) - TAILQ_INSERT_TAIL(&pq->pq_pl, m, - plinks.q); - else - TAILQ_INSERT_BEFORE(next, m, plinks.q); - } - counter_u64_add(pqstate_commit_aborts, 1); - goto retry; - } - if ((old.flags & PGA_ENQUEUED) != 0) - vm_pagequeue_cnt_dec(pq); - counter_u64_add(queue_ops, 1); - } else if ((old.flags & (PGA_REQUEUE | PGA_REQUEUE_HEAD)) != 0) { - new.flags |= PGA_ENQUEUED; - new.flags &= ~(PGA_REQUEUE | PGA_REQUEUE_HEAD); - if (__predict_false(!vm_page_astate_fcmpset(m, &old, new))) { - counter_u64_add(pqstate_commit_aborts, 1); - goto retry; - } + qflags = atomic_load_8(&m->aflags); + KASSERT(pq == vm_page_pagequeue(m) || + (qflags & PGA_QUEUE_STATE_MASK) == 0, + ("page %p doesn't belong to queue %p but has aflags %#x", + m, pq, qflags)); - if ((old.flags & PGA_ENQUEUED) != 0) + if ((qflags & PGA_DEQUEUE) != 0) { + if (__predict_true((qflags & PGA_ENQUEUED) != 0)) + vm_pagequeue_remove(pq, m); + vm_page_dequeue_complete(m); + } else if ((qflags & (PGA_REQUEUE | PGA_REQUEUE_HEAD)) != 0) { + if ((qflags & PGA_ENQUEUED) != 0) TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); - else + else { vm_pagequeue_cnt_inc(pq); + vm_page_aflag_set(m, PGA_ENQUEUED); + } /* - * Give PGA_REQUEUE_HEAD precedence over PGA_REQUEUE. 
In - * particular, if both flags are set in close succession, only - * PGA_REQUEUE_HEAD will be applied, even if it was set first. + * Give PGA_REQUEUE_HEAD precedence over PGA_REQUEUE. + * In particular, if both flags are set in close succession, + * only PGA_REQUEUE_HEAD will be applied, even if it was set + * first. */ - if ((old.flags & PGA_REQUEUE_HEAD) != 0) { - KASSERT(old.queue == PQ_INACTIVE, + if ((qflags & PGA_REQUEUE_HEAD) != 0) { + KASSERT(m->queue == PQ_INACTIVE, ("head enqueue not supported for page %p", m)); vmd = vm_pagequeue_domain(m); TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q); - } else { + } else TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); - } - counter_u64_add(queue_ops, 1); - } else { - counter_u64_add(null_queue_ops, 1); + + vm_page_aflag_clear(m, qflags & (PGA_REQUEUE | + PGA_REQUEUE_HEAD)); } } @@ -3245,10 +3141,15 @@ static void vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq, uint8_t queue) { + vm_page_t m; int i; - for (i = 0; i < bq->bq_cnt; i++) - vm_pqbatch_process_page(pq, bq->bq_pa[i], queue); + for (i = 0; i < bq->bq_cnt; i++) { + m = bq->bq_pa[i]; + if (__predict_false(m->queue != queue)) + continue; + vm_pqbatch_process_page(pq, m); + } vm_batchqueue_init(bq); } @@ -3256,6 +3157,8 @@ vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq, * vm_page_pqbatch_submit: [ internal use only ] * * Enqueue a page in the specified page queue's batched work queue. + * The caller must have encoded the requested operation in the page + * structure's aflags field. 
*/ void vm_page_pqbatch_submit(vm_page_t m, uint8_t queue) @@ -3266,6 +3169,8 @@ vm_page_pqbatch_submit(vm_page_t m, uint8_t queue) KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("page %p is unmanaged", m)); + KASSERT(mtx_owned(vm_page_lockptr(m)) || m->object == NULL, + ("missing synchronization for page %p", m)); KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue)); domain = vm_phys_domain(m); @@ -3284,7 +3189,21 @@ vm_page_pqbatch_submit(vm_page_t m, uint8_t queue) bq = DPCPU_PTR(pqbatch[domain][queue]); } vm_pqbatch_process(pq, bq, queue); - vm_pqbatch_process_page(pq, m, queue); + + /* + * The page may have been logically dequeued before we acquired the + * page queue lock. In this case, since we either hold the page lock + * or the page is being freed, a different thread cannot be concurrently + * enqueuing the page. + */ + if (__predict_true(m->queue == queue)) + vm_pqbatch_process_page(pq, m); + else { + KASSERT(m->queue == PQ_NONE, + ("invalid queue transition for page %p", m)); + KASSERT((m->aflags & PGA_ENQUEUED) == 0, + ("page %p is enqueued with invalid queue index", m)); + } vm_pagequeue_unlock(pq); critical_exit(); } @@ -3328,54 +3247,131 @@ vm_page_pqbatch_drain(void) thread_unlock(td); } -/* XXX comment */ +/* + * Complete the logical removal of a page from a page queue. We must be + * careful to synchronize with the page daemon, which may be concurrently + * examining the page with only the page lock held. The page must not be + * in a state where it appears to be logically enqueued. 
+ */ static void -vm_page_dequeue_free(vm_page_t m) +vm_page_dequeue_complete(vm_page_t m) { - vm_page_astate_t old, new; - for (old = vm_page_astate_load(m);;) { - if (old.queue == PQ_NONE) { - KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0, - ("page %p has unexpected queue state flags %#x", - m, old.flags)); - break; - } - if ((old.flags & PGA_DEQUEUE) != 0) { - vm_page_pqbatch_submit(m, old.queue); - break; - } - new = old; - new.flags |= PGA_DEQUEUE; - if (vm_page_pqstate_commit(m, &old, new)) - break; - } + m->queue = PQ_NONE; + atomic_thread_fence_rel(); + vm_page_aflag_clear(m, PGA_QUEUE_STATE_MASK); +} + +/* + * vm_page_dequeue_deferred: [ internal use only ] + * + * Request removal of the given page from its current page + * queue. Physical removal from the queue may be deferred + * indefinitely. + * + * The page must be locked. + */ +void +vm_page_dequeue_deferred(vm_page_t m) +{ + uint8_t queue; + + vm_page_assert_locked(m); + + if ((queue = vm_page_queue(m)) == PQ_NONE) + return; + + /* + * Set PGA_DEQUEUE if it is not already set to handle a concurrent call + * to vm_page_dequeue_deferred_free(). In particular, avoid modifying + * the page's queue state once vm_page_dequeue_deferred_free() has been + * called. In the event of a race, two batch queue entries for the page + * will be created, but the second will have no effect. + */ + if (vm_page_pqstate_cmpset(m, queue, queue, PGA_DEQUEUE, PGA_DEQUEUE)) + vm_page_pqbatch_submit(m, queue); +} + +/* + * A variant of vm_page_dequeue_deferred() that does not assert the page + * lock and is only to be called from vm_page_free_prep(). Because the + * page is being freed, we can assume that nothing other than the page + * daemon is scheduling queue operations on this page, so we get for + * free the mutual exclusion that is otherwise provided by the page lock. + * To handle races, the page daemon must take care to atomically check + * for PGA_DEQUEUE when updating queue state. 
+ */ +static void +vm_page_dequeue_deferred_free(vm_page_t m) +{ + uint8_t queue; + + KASSERT(m->ref_count == 0, ("page %p has references", m)); + + if ((m->aflags & PGA_DEQUEUE) != 0) + return; + atomic_thread_fence_acq(); + if ((queue = m->queue) == PQ_NONE) + return; + vm_page_aflag_set(m, PGA_DEQUEUE); + vm_page_pqbatch_submit(m, queue); } /* * vm_page_dequeue: * * Remove the page from whichever page queue it's in, if any. - * XXX + * The page must either be locked or unallocated. This constraint + * ensures that the queue state of the page will remain consistent + * after this function returns. */ void vm_page_dequeue(vm_page_t m) { - vm_page_astate_t old, new; + struct vm_pagequeue *pq, *pq1; + uint8_t aflags; - for (old = vm_page_astate_load(m);;) { - if (old.queue == PQ_NONE) { - KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0, + KASSERT(mtx_owned(vm_page_lockptr(m)) || m->object == NULL, + ("page %p is allocated and unlocked", m)); + + for (pq = vm_page_pagequeue(m);; pq = pq1) { + if (pq == NULL) { + /* + * A thread may be concurrently executing + * vm_page_dequeue_complete(). Ensure that all queue + * state is cleared before we return. + */ + aflags = atomic_load_8(&m->aflags); + if ((aflags & PGA_QUEUE_STATE_MASK) == 0) + return; + KASSERT((aflags & PGA_DEQUEUE) != 0, ("page %p has unexpected queue state flags %#x", - m, old.flags)); - break; + m, aflags)); + + /* + * Busy wait until the thread updating queue state is + * finished. Such a thread must be executing in a + * critical section. 
+ */ + cpu_spinwait(); + pq1 = vm_page_pagequeue(m); + continue; } - new = old; - new.queue = PQ_NONE; - new.flags &= ~PGA_QUEUE_STATE_MASK; - if (vm_page_pqstate_commit(m, &old, new)) + vm_pagequeue_lock(pq); + if ((pq1 = vm_page_pagequeue(m)) == pq) break; + vm_pagequeue_unlock(pq); } + KASSERT(pq == vm_page_pagequeue(m), + ("%s: page %p migrated directly between queues", __func__, m)); + KASSERT((m->aflags & PGA_DEQUEUE) != 0 || + mtx_owned(vm_page_lockptr(m)), + ("%s: queued unlocked page %p", __func__, m)); + + if ((m->aflags & PGA_ENQUEUED) != 0) + vm_pagequeue_remove(pq, m); + vm_page_dequeue_complete(m); + vm_pagequeue_unlock(pq); } /* @@ -3387,16 +3383,71 @@ vm_page_enqueue(vm_page_t m, uint8_t queue) { vm_page_assert_locked(m); - KASSERT(m->astate.queue == PQ_NONE && - (m->astate.flags & PGA_QUEUE_STATE_MASK) == 0, + KASSERT(m->queue == PQ_NONE && (m->aflags & PGA_QUEUE_STATE_MASK) == 0, ("%s: page %p is already enqueued", __func__, m)); - m->astate.queue = queue; - if ((m->astate.flags & PGA_REQUEUE) == 0) + m->queue = queue; + if ((m->aflags & PGA_REQUEUE) == 0) vm_page_aflag_set(m, PGA_REQUEUE); vm_page_pqbatch_submit(m, queue); } +/* + * vm_page_requeue: [ internal use only ] + * + * Schedule a requeue of the given page. + * + * The page must be locked. + */ +void +vm_page_requeue(vm_page_t m) +{ + + vm_page_assert_locked(m); + KASSERT(vm_page_queue(m) != PQ_NONE, + ("%s: page %p is not logically enqueued", __func__, m)); + + if ((m->aflags & PGA_REQUEUE) == 0) + vm_page_aflag_set(m, PGA_REQUEUE); + vm_page_pqbatch_submit(m, atomic_load_8(&m->queue)); +} + +/* + * vm_page_swapqueue: [ internal use only ] + * + * Move the page from one queue to another, or to the tail of its + * current queue, in the face of a possible concurrent call to + * vm_page_dequeue_deferred_free(). 
+ */ +void +vm_page_swapqueue(vm_page_t m, uint8_t oldq, uint8_t newq) +{ + struct vm_pagequeue *pq; + + KASSERT(oldq < PQ_COUNT && newq < PQ_COUNT && oldq != newq, + ("vm_page_swapqueue: invalid queues (%d, %d)", oldq, newq)); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("vm_page_swapqueue: page %p is unmanaged", m)); + vm_page_assert_locked(m); + + /* + * Atomically update the queue field and set PGA_REQUEUE while + * ensuring that PGA_DEQUEUE has not been set. + */ + pq = &vm_pagequeue_domain(m)->vmd_pagequeues[oldq]; + vm_pagequeue_lock(pq); + if (!vm_page_pqstate_cmpset(m, oldq, newq, PGA_DEQUEUE, PGA_REQUEUE)) { + vm_pagequeue_unlock(pq); + return; + } + if ((m->aflags & PGA_ENQUEUED) != 0) { + vm_pagequeue_remove(pq, m); + vm_page_aflag_clear(m, PGA_ENQUEUED); + } + vm_pagequeue_unlock(pq); + vm_page_pqbatch_submit(m, newq); +} + /* * vm_page_free_prep: * @@ -3428,11 +3479,10 @@ vm_page_free_prep(vm_page_t m) } #endif if ((m->oflags & VPO_UNMANAGED) == 0) - KASSERT(!pmap_page_is_mapped(m) && (vm_page_aflags(m) & - (PGA_EXECUTABLE | PGA_WRITEABLE)) == 0, + KASSERT(!pmap_page_is_mapped(m), ("vm_page_free_prep: freeing mapped page %p", m)); else - KASSERT(m->astate.queue == PQ_NONE, + KASSERT(m->queue == PQ_NONE, ("vm_page_free_prep: unmanaged page %p is queued", m)); VM_CNT_INC(v_tfree); @@ -3461,7 +3511,7 @@ vm_page_free_prep(vm_page_t m) if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->ref_count == 1, ("fictitious page %p is referenced", m)); - KASSERT(m->astate.queue == PQ_NONE, + KASSERT(m->queue == PQ_NONE, ("fictitious page %p is queued", m)); return (false); } @@ -3472,7 +3522,7 @@ vm_page_free_prep(vm_page_t m) * dequeue. 
*/ if ((m->oflags & VPO_UNMANAGED) == 0) - vm_page_dequeue_free(m); + vm_page_dequeue_deferred_free(m); m->valid = 0; vm_page_undirty(m); @@ -3579,8 +3629,6 @@ vm_page_wire(vm_page_t m) old = atomic_fetchadd_int(&m->ref_count, 1); KASSERT(VPRC_WIRE_COUNT(old) != VPRC_WIRE_COUNT_MAX, ("vm_page_wire: counter overflow for page %p", m)); - if ((m->oflags & VPO_UNMANAGED) == 0) - vm_page_aflag_set(m, PGA_DEQUEUE); if (VPRC_WIRE_COUNT(old) == 0) vm_wire_add(1); } @@ -3602,45 +3650,11 @@ vm_page_wire_mapped(vm_page_t m) return (false); } while (!atomic_fcmpset_int(&m->ref_count, &old, old + 1)); - if ((m->oflags & VPO_UNMANAGED) == 0) - vm_page_aflag_set(m, PGA_DEQUEUE); if (VPRC_WIRE_COUNT(old) == 0) vm_wire_add(1); return (true); } -/* XXX comment */ -static void -vm_page_unwire_managed(vm_page_t m, uint8_t queue, bool noreuse) -{ - u_int old; - - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("vm_page_unwire_managed: page %p is unmanaged", m)); - - /* - * Update LRU state before releasing the wiring reference. - * Use a release store when updating the reference count to - * synchronize with vm_page_free_prep(). - */ - old = m->ref_count; - do { - KASSERT(VPRC_WIRE_COUNT(old) > 0, - ("vm_page_unwire: wire count underflow for page %p", m)); - if (VPRC_WIRE_COUNT(old) == 1 && - !vm_page_release_toq(m, queue, noreuse)) { - old = atomic_load_int(&m->ref_count); - continue; - } - } while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1)); - - if (VPRC_WIRE_COUNT(old) == 1) { - vm_wire_sub(1); - if (old == 1) - vm_page_free(m); - } -} - /* * Release one wiring of the specified page, potentially allowing it to be * paged out. 
@@ -3655,6 +3669,8 @@ vm_page_unwire_managed(vm_page_t m, uint8_t queue, bool noreuse) void vm_page_unwire(vm_page_t m, uint8_t queue) { + u_int old; + bool locked; KASSERT(queue < PQ_COUNT, ("vm_page_unwire: invalid queue %u request for page %p", queue, m)); @@ -3662,8 +3678,42 @@ vm_page_unwire(vm_page_t m, uint8_t queue) if ((m->oflags & VPO_UNMANAGED) != 0) { if (vm_page_unwire_noq(m) && m->ref_count == 0) vm_page_free(m); - } else { - vm_page_unwire_managed(m, queue, false); + return; + } + + /* + * Update LRU state before releasing the wiring reference. + * We only need to do this once since we hold the page lock. + * Use a release store when updating the reference count to + * synchronize with vm_page_free_prep(). + */ + old = m->ref_count; + locked = false; + do { + KASSERT(VPRC_WIRE_COUNT(old) > 0, + ("vm_page_unwire: wire count underflow for page %p", m)); + if (!locked && VPRC_WIRE_COUNT(old) == 1) { + vm_page_lock(m); + locked = true; + if (queue == PQ_ACTIVE && vm_page_queue(m) == PQ_ACTIVE) + vm_page_reference(m); + else + vm_page_mvqueue(m, queue); + } + } while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1)); + + /* + * Release the lock only after the wiring is released, to ensure that + * the page daemon does not encounter and dequeue the page while it is + * still wired. + */ + if (locked) + vm_page_unlock(m); + + if (VPRC_WIRE_COUNT(old) == 1) { + vm_wire_sub(1); + if (old == 1) + vm_page_free(m); } } @@ -3700,45 +3750,25 @@ vm_page_unwire_noq(vm_page_t m) * before releasing the page lock, otherwise the page daemon may immediately * dequeue the page. * - * In many cases this function's parameters are known at compile-time, so - * it is inlined into its callers so as to allow constant folding to remove - * branches. - * * A managed page must be locked. 
*/ static __always_inline void -vm_page_mvqueue(vm_page_t m, const uint8_t nqueue, const uint16_t nflag) +vm_page_mvqueue(vm_page_t m, const uint8_t nqueue) { - vm_page_astate_t old, new; + vm_page_assert_locked(m); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_page_mvqueue: page %p is unmanaged", m)); - KASSERT(m->ref_count > 0, - ("vm_page_mvqueue: page %p is missing refs", m)); - KASSERT(nflag == PGA_REQUEUE || nflag == PGA_REQUEUE_HEAD, - ("vm_page_mvqueue: unexpected queue state flag")); - KASSERT(nflag != PGA_REQUEUE_HEAD || nqueue == PQ_INACTIVE, - ("vm_page_mvqueue: wrong queue %d for PGA_REQUEUE_HEAD", nqueue)); - for (old = vm_page_astate_load(m);;) { - if ((old.flags & PGA_DEQUEUE) != 0) - break; - new = old; - if (nqueue == PQ_ACTIVE) - new.act_count = max(old.act_count, ACT_INIT); - - if (old.queue == nqueue) { - if (nqueue != PQ_ACTIVE) - new.flags |= nflag; - if (new._bits == old._bits) - break; - } else { - new.flags |= nflag; - new.queue = nqueue; - } - if (vm_page_pqstate_commit(m, &old, new)) - break; + if (vm_page_queue(m) != nqueue) { + vm_page_dequeue(m); + vm_page_enqueue(m, nqueue); + } else if (nqueue != PQ_ACTIVE) { + vm_page_requeue(m); } + + if (nqueue == PQ_ACTIVE && m->act_count < ACT_INIT) + m->act_count = ACT_INIT; } /* @@ -3748,9 +3778,9 @@ void vm_page_activate(vm_page_t m) { - if ((m->oflags & VPO_UNMANAGED) != 0) + if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m)) return; - vm_page_mvqueue(m, PQ_ACTIVE, PGA_REQUEUE); + vm_page_mvqueue(m, PQ_ACTIVE); } /* @@ -3761,9 +3791,30 @@ void vm_page_deactivate(vm_page_t m) { - if ((m->oflags & VPO_UNMANAGED) != 0) + if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m)) return; - vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE); + vm_page_mvqueue(m, PQ_INACTIVE); +} + +/* + * Move the specified page close to the head of the inactive queue, + * bypassing LRU. A marker page is used to maintain FIFO ordering. 
+ * As with regular enqueues, we use a per-CPU batch queue to reduce + * contention on the page queue lock. + */ +static void +_vm_page_deactivate_noreuse(vm_page_t m) +{ + + vm_page_assert_locked(m); + + if (!vm_page_inactive(m)) { + vm_page_dequeue(m); + m->queue = PQ_INACTIVE; + } + if ((m->aflags & PGA_REQUEUE_HEAD) == 0) + vm_page_aflag_set(m, PGA_REQUEUE_HEAD); + vm_page_pqbatch_submit(m, PQ_INACTIVE); } void @@ -3773,9 +3824,8 @@ vm_page_deactivate_noreuse(vm_page_t m) KASSERT(m->object != NULL, ("vm_page_deactivate_noreuse: page %p has no object", m)); - if ((m->oflags & VPO_UNMANAGED) != 0) - return; - vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE_HEAD); + if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_wired(m)) + _vm_page_deactivate_noreuse(m); } /* @@ -3787,7 +3837,7 @@ vm_page_launder(vm_page_t m) if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m)) return; - vm_page_mvqueue(m, PQ_LAUNDRY, PGA_REQUEUE); + vm_page_mvqueue(m, PQ_LAUNDRY); } /* @@ -3805,17 +3855,11 @@ vm_page_unswappable(vm_page_t m) vm_page_enqueue(m, PQ_UNSWAPPABLE); } -/* XXX comment */ -static bool -vm_page_release_toq(vm_page_t m, uint8_t nqueue, bool noreuse) +static void +vm_page_release_toq(vm_page_t m, int flags) { - vm_page_astate_t old, new; - uint16_t nflag; - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("vm_page_release_toq: page %p is unmanaged", m)); - KASSERT(m->ref_count > 0, - ("vm_page_release_toq: page %p is missing refs", m)); + vm_page_assert_locked(m); /* * Use a check of the valid bits to determine whether we should @@ -3827,35 +3871,12 @@ vm_page_release_toq(vm_page_t m, uint8_t nqueue, bool noreuse) * If we were asked to not cache the page, place it near the head of the * inactive queue so that is reclaimed sooner. */ - nflag = (noreuse || m->valid == 0) ? 
PGA_REQUEUE_HEAD : PGA_REQUEUE; - - /* XXX explain */ - vm_page_aflag_clear(m, PGA_DEQUEUE); - - for (old = vm_page_astate_load(m);;) { - new = old; - if ((new.flags & PGA_DEQUEUE) != 0) - return (false); - if (nflag != PGA_REQUEUE_HEAD && old.queue == PQ_ACTIVE) { - new.flags |= PGA_REFERENCED; - } else { - if (nqueue == PQ_ACTIVE) - new.act_count = max(old.act_count, ACT_INIT); - else - new.flags |= nflag; - new.queue = nqueue; - } - - /* - * If the page queue state is not changing, we have nothing - * to do. - */ - if (new._bits == old._bits) - break; - if (vm_page_pqstate_commit(m, &old, new)) - break; - } - return (true); + if ((flags & (VPR_TRYFREE | VPR_NOREUSE)) != 0 || m->valid == 0) + _vm_page_deactivate_noreuse(m); + else if (vm_page_active(m)) + vm_page_reference(m); + else + vm_page_mvqueue(m, PQ_INACTIVE); } /* @@ -3865,6 +3886,8 @@ void vm_page_release(vm_page_t m, int flags) { vm_object_t object; + u_int old; + bool locked; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_page_release: page %p is unmanaged", m)); @@ -3890,7 +3913,36 @@ vm_page_release(vm_page_t m, int flags) } } - vm_page_unwire_managed(m, PQ_INACTIVE, flags != 0); + /* + * Update LRU state before releasing the wiring reference. + * Use a release store when updating the reference count to + * synchronize with vm_page_free_prep(). + */ + old = m->ref_count; + locked = false; + do { + KASSERT(VPRC_WIRE_COUNT(old) > 0, + ("vm_page_unwire: wire count underflow for page %p", m)); + if (!locked && VPRC_WIRE_COUNT(old) == 1) { + vm_page_lock(m); + locked = true; + vm_page_release_toq(m, flags); + } + } while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1)); + + /* + * Release the lock only after the wiring is released, to ensure that + * the page daemon does not encounter and dequeue the page while it is + * still wired. + */ + if (locked) + vm_page_unlock(m); + + if (VPRC_WIRE_COUNT(old) == 1) { + vm_wire_sub(1); + if (old == 1) + vm_page_free(m); + } } /* See vm_page_release(). 
*/ @@ -3908,7 +3960,9 @@ vm_page_release_locked(vm_page_t m, int flags) m->dirty == 0 && !vm_page_busied(m)) { vm_page_free(m); } else { - (void)vm_page_release_toq(m, PQ_INACTIVE, flags != 0); + vm_page_lock(m); + vm_page_release_toq(m, flags); + vm_page_unlock(m); } } } @@ -4720,22 +4774,6 @@ vm_page_object_lock_assert(vm_page_t m) VM_OBJECT_ASSERT_WLOCKED(m->object); } -void -vm_page_pagequeue_lock_assert(vm_page_t m, uint8_t queue) -{ - - if ((m->flags & PG_MARKER) != 0) - return; - - /* - * The page's page queue index may only change while the - * current queue's lock is held. - */ - KASSERT(queue != PQ_NONE, - ("page %p does not belong to a queue", m)); - vm_pagequeue_assert_locked(_vm_page_pagequeue(m, queue)); -} - void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits) { @@ -4815,7 +4853,7 @@ DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) "page %p obj %p pidx 0x%jx phys 0x%jx q %d ref %u\n" " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n", m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr, - m->astate.queue, m->ref_count, m->astate.flags, m->oflags, - m->flags, m->astate.act_count, m->busy_lock, m->valid, m->dirty); + m->queue, m->ref_count, m->aflags, m->oflags, + m->flags, m->act_count, m->busy_lock, m->valid, m->dirty); } #endif /* DDB */ diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 4d5726c0e39a..0c3f3a9bade2 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -190,15 +190,6 @@ typedef uint32_t vm_page_bits_t; typedef uint64_t vm_page_bits_t; #endif -typedef union { - struct { - uint16_t flags; - uint8_t queue; - uint8_t act_count; - }; - uint32_t _bits; -} vm_page_astate_t; - struct vm_page { union { TAILQ_ENTRY(vm_page) q; /* page queue or free list (Q) */ @@ -221,13 +212,15 @@ struct vm_page { u_int ref_count; /* page references */ }; volatile u_int busy_lock; /* busy owners lock */ - vm_page_astate_t astate; /* atomically updated state */ - uint8_t flags; /* page PG_* flags (P) */ + uint16_t flags; /* page 
PG_* flags (P) */ uint8_t order; /* index of the buddy queue (F) */ uint8_t pool; /* vm_phys freepool index (F) */ + uint8_t aflags; /* access is atomic */ + uint8_t oflags; /* page VPO_* flags (O) */ + uint8_t queue; /* page queue index (Q) */ int8_t psind; /* pagesizes[] index (O) */ int8_t segind; /* vm_phys segment index (C) */ - uint8_t oflags; /* page VPO_* flags (O) */ + u_char act_count; /* page usage count (P) */ /* NOTE that these must support one bit per DEV_BSIZE in a page */ /* so, on normal X86 kernels, they must be at least 8 bits wide */ vm_page_bits_t valid; /* map of valid DEV_BSIZE chunks (O) */ @@ -406,8 +399,8 @@ extern struct mtx_padalign pa_lock[]; #define PGA_REQUEUE 0x20 /* page is due to be requeued */ #define PGA_REQUEUE_HEAD 0x40 /* page requeue should bypass LRU */ -#define PGA_QUEUE_OP_MASK (PGA_DEQUEUE | PGA_REQUEUE | PGA_REQUEUE_HEAD) -#define PGA_QUEUE_STATE_MASK (PGA_ENQUEUED | PGA_QUEUE_OP_MASK) +#define PGA_QUEUE_STATE_MASK (PGA_ENQUEUED | PGA_DEQUEUE | PGA_REQUEUE | \ + PGA_REQUEUE_HEAD) /* * Page flags. If changed at any other time than page allocation or @@ -417,11 +410,11 @@ extern struct mtx_padalign pa_lock[]; * allocated from a per-CPU cache. It is cleared the next time that the * page is allocated from the physical memory allocator. */ -#define PG_PCPU_CACHE 0x01 /* was allocated from per-CPU caches */ -#define PG_FICTITIOUS 0x04 /* physical page doesn't exist */ -#define PG_ZERO 0x08 /* page is zeroed */ -#define PG_MARKER 0x10 /* special queue marker page */ -#define PG_NODUMP 0x80 /* don't include this page in a dump */ +#define PG_PCPU_CACHE 0x0001 /* was allocated from per-CPU caches */ +#define PG_FICTITIOUS 0x0004 /* physical page doesn't exist */ +#define PG_ZERO 0x0008 /* page is zeroed */ +#define PG_MARKER 0x0010 /* special queue marker page */ +#define PG_NODUMP 0x0080 /* don't include this page in a dump */ /* * Misc constants. 
@@ -579,6 +572,7 @@ int vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, void vm_page_deactivate(vm_page_t); void vm_page_deactivate_noreuse(vm_page_t); void vm_page_dequeue(vm_page_t m); +void vm_page_dequeue_deferred(vm_page_t m); vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t); bool vm_page_free_prep(vm_page_t m); vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr); @@ -590,8 +584,6 @@ vm_page_t vm_page_next(vm_page_t m); int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *); void vm_page_pqbatch_drain(void); void vm_page_pqbatch_submit(vm_page_t m, uint8_t queue); -bool vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, - vm_page_astate_t new); vm_page_t vm_page_prev(vm_page_t m); bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m); void vm_page_putfake(vm_page_t m); @@ -696,52 +688,64 @@ void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line); #ifdef INVARIANTS void vm_page_object_lock_assert(vm_page_t m); #define VM_PAGE_OBJECT_LOCK_ASSERT(m) vm_page_object_lock_assert(m) -void vm_page_pagequeue_lock_assert(vm_page_t m, uint8_t queue); -#define VM_PAGE_PAGEQUEUE_LOCK_ASSERT(m, q) vm_page_pagequeue_lock_assert(m, q) void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits); #define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits) \ vm_page_assert_pga_writeable(m, bits) #else #define VM_PAGE_OBJECT_LOCK_ASSERT(m) (void)0 -#define VM_PAGE_PAGEQUEUE_LOCK_ASSERT(m, q) (void)0 #define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits) (void)0 #endif /* - * We want to use atomic updates for the aflags field, which is 16 bits wide. - * However, not all architectures support atomic operations on 16-bit + * We want to use atomic updates for the aflags field, which is 8 bits wide. + * However, not all architectures support atomic operations on 8-bit * destinations. In order that we can easily use a 32-bit operation, we * require that the aflags field be 32-bit aligned. 
*/ -_Static_assert(offsetof(struct vm_page, astate.flags) % sizeof(uint32_t) == 0, +_Static_assert(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0, "aflags field is not 32-bit aligned"); -#define VM_PAGE_AFLAG_SHIFT __offsetof(vm_page_astate_t, flags) - /* - * Return the atomic flag set for the page. + * We want to be able to update the aflags and queue fields atomically in + * the same operation. */ -static inline int -vm_page_aflags(vm_page_t m) -{ +_Static_assert(offsetof(struct vm_page, aflags) / sizeof(uint32_t) == + offsetof(struct vm_page, queue) / sizeof(uint32_t), + "aflags and queue fields do not belong to the same 32-bit word"); +_Static_assert(offsetof(struct vm_page, queue) % sizeof(uint32_t) == 2, + "queue field is at an unexpected offset"); +_Static_assert(sizeof(((struct vm_page *)NULL)->queue) == 1, + "queue field has an unexpected size"); - return (m->astate.flags); -} +#if BYTE_ORDER == LITTLE_ENDIAN +#define VM_PAGE_AFLAG_SHIFT 0 +#define VM_PAGE_QUEUE_SHIFT 16 +#else +#define VM_PAGE_AFLAG_SHIFT 24 +#define VM_PAGE_QUEUE_SHIFT 8 +#endif +#define VM_PAGE_QUEUE_MASK (0xff << VM_PAGE_QUEUE_SHIFT) /* * Clear the given bits in the specified page. */ static inline void -vm_page_aflag_clear(vm_page_t m, uint16_t bits) +vm_page_aflag_clear(vm_page_t m, uint8_t bits) { uint32_t *addr, val; + /* + * The PGA_REFERENCED flag can only be cleared if the page is locked. + */ + if ((bits & PGA_REFERENCED) != 0) + vm_page_assert_locked(m); + /* * Access the whole 32-bit word containing the aflags field with an * atomic update. Parallel non-atomic updates to the other fields * within this word are handled properly by the atomic update. */ - addr = (void *)&m->astate; + addr = (void *)&m->aflags; val = bits << VM_PAGE_AFLAG_SHIFT; atomic_clear_32(addr, val); } @@ -750,7 +754,7 @@ vm_page_aflag_clear(vm_page_t m, uint16_t bits) * Set the given bits in the specified page. 
*/ static inline void -vm_page_aflag_set(vm_page_t m, uint16_t bits) +vm_page_aflag_set(vm_page_t m, uint8_t bits) { uint32_t *addr, val; @@ -761,43 +765,44 @@ vm_page_aflag_set(vm_page_t m, uint16_t bits) * atomic update. Parallel non-atomic updates to the other fields * within this word are handled properly by the atomic update. */ - addr = (void *)&m->astate; + addr = (void *)&m->aflags; val = bits << VM_PAGE_AFLAG_SHIFT; atomic_set_32(addr, val); } -static inline vm_page_astate_t -vm_page_astate_load(vm_page_t m) -{ - vm_page_astate_t astate; - - astate._bits = atomic_load_32(&m->astate); - return (astate); -} - +/* + * Atomically update the queue state of the page. The operation fails if + * any of the queue flags in "fflags" are set or if the "queue" field of + * the page does not match the expected value; if the operation is + * successful, the flags in "nflags" are set and all other queue state + * flags are cleared. + */ static inline bool -vm_page_astate_fcmpset(vm_page_t m, vm_page_astate_t *old, - vm_page_astate_t new) +vm_page_pqstate_cmpset(vm_page_t m, uint32_t oldq, uint32_t newq, + uint32_t fflags, uint32_t nflags) { - int ret; + uint32_t *addr, nval, oval, qsmask; - KASSERT(new.queue == PQ_INACTIVE || (new.flags & PGA_REQUEUE_HEAD) == 0, - ("vm_page_astate_fcmpset: unexecpted head requeue for page %p", - m)); - KASSERT((new.flags & PGA_ENQUEUED) == 0 || new.queue != PQ_NONE, - ("vm_page_astate_fcmpset: setting PGA_ENQUEUED without a queue")); - KASSERT(new._bits != old->_bits, - ("vm_page_astate_fcmpset: bits are not changing")); + vm_page_assert_locked(m); - ret = atomic_fcmpset_32(&m->astate._bits, &old->_bits, new._bits); - if (ret != 0) { - if (old->queue != PQ_NONE && old->queue != new.queue) - VM_PAGE_PAGEQUEUE_LOCK_ASSERT(m, old->queue); - KASSERT((new.flags & PGA_ENQUEUED) == 0 || old->queue == new.queue, - ("vm_page_astate_fcmpset: PGA_ENQUEUED set after queue change for page %p", m)); - } + fflags <<= VM_PAGE_AFLAG_SHIFT; + nflags <<= 
VM_PAGE_AFLAG_SHIFT; + newq <<= VM_PAGE_QUEUE_SHIFT; + oldq <<= VM_PAGE_QUEUE_SHIFT; + qsmask = ((PGA_DEQUEUE | PGA_REQUEUE | PGA_REQUEUE_HEAD) << + VM_PAGE_AFLAG_SHIFT) | VM_PAGE_QUEUE_MASK; - return (ret != 0); + addr = (void *)&m->aflags; + oval = atomic_load_32(addr); + do { + if ((oval & fflags) != 0) + return (false); + if ((oval & VM_PAGE_QUEUE_MASK) != oldq) + return (false); + nval = (oval & ~qsmask) | nflags | newq; + } while (!atomic_fcmpset_32(addr, &oval, nval)); + + return (true); } /* @@ -853,17 +858,19 @@ vm_page_replace_checked(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex, /* * vm_page_queue: * - * Return the index of the queue containing m. + * Return the index of the queue containing m. This index is guaranteed + * not to change while the page lock is held. */ static inline uint8_t vm_page_queue(vm_page_t m) { - vm_page_astate_t as; - as = vm_page_astate_load(m); - if ((as.flags & PGA_DEQUEUE) != 0) + vm_page_assert_locked(m); + + if ((m->aflags & PGA_DEQUEUE) != 0) return (PQ_NONE); - return (as.queue); + atomic_thread_fence_acq(); + return (m->queue); } static inline bool diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 848239eea411..c7f03129d070 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -218,7 +218,7 @@ vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq, { vm_pagequeue_assert_locked(pq); - KASSERT((vm_page_aflags(marker) & PGA_ENQUEUED) == 0, + KASSERT((marker->aflags & PGA_ENQUEUED) == 0, ("marker %p already enqueued", marker)); if (after == NULL) @@ -242,7 +242,7 @@ vm_pageout_end_scan(struct scan_state *ss) pq = ss->pq; vm_pagequeue_assert_locked(pq); - KASSERT((vm_page_aflags(ss->marker) & PGA_ENQUEUED) != 0, + KASSERT((ss->marker->aflags & PGA_ENQUEUED) != 0, ("marker %p not enqueued", ss->marker)); TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q); @@ -271,7 +271,7 @@ vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue) marker = ss->marker; pq = ss->pq; - 
KASSERT((marker->astate.flags & PGA_ENQUEUED) != 0, + KASSERT((marker->aflags & PGA_ENQUEUED) != 0, ("marker %p not enqueued", ss->marker)); vm_pagequeue_lock(pq); @@ -280,7 +280,7 @@ vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue) m = n, ss->scanned++) { n = TAILQ_NEXT(m, plinks.q); if ((m->flags & PG_MARKER) == 0) { - KASSERT((m->astate.flags & PGA_ENQUEUED) != 0, + KASSERT((m->aflags & PGA_ENQUEUED) != 0, ("page %p not enqueued", m)); KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in page queue", m)); @@ -370,10 +370,13 @@ vm_pageout_cluster(vm_page_t m) ib = 0; break; } + vm_page_lock(p); if (!vm_page_in_laundry(p) || !vm_page_try_remove_write(p)) { + vm_page_unlock(p); ib = 0; break; } + vm_page_unlock(p); mc[--page_base] = pb = p; ++pageout_count; ++ib; @@ -393,8 +396,12 @@ vm_pageout_cluster(vm_page_t m) vm_page_test_dirty(p); if (p->dirty == 0) break; - if (!vm_page_in_laundry(p) || !vm_page_try_remove_write(p)) + vm_page_lock(p); + if (!vm_page_in_laundry(p) || !vm_page_try_remove_write(p)) { + vm_page_unlock(p); break; + } + vm_page_unlock(p); mc[page_base + pageout_count] = ps = p; ++pageout_count; ++is; @@ -451,7 +458,7 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush: partially invalid page %p index %d/%d", mc[i], i, count)); - KASSERT((vm_page_aflags(mc[i]) & PGA_WRITEABLE) == 0, + KASSERT((mc[i]->aflags & PGA_WRITEABLE) == 0, ("vm_pageout_flush: writeable page %p", mc[i])); vm_page_sbusy(mc[i]); } @@ -570,6 +577,7 @@ vm_pageout_clean(vm_page_t m, int *numpagedout) vm_pindex_t pindex; int error, lockmode; + vm_page_assert_locked(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; @@ -589,6 +597,7 @@ vm_pageout_clean(vm_page_t m, int *numpagedout) * of time. 
*/ if (object->type == OBJT_VNODE) { + vm_page_unlock(m); vp = object->handle; if (vp->v_type == VREG && vn_start_write(vp, &mp, V_NOWAIT) != 0) { @@ -618,6 +627,7 @@ vm_pageout_clean(vm_page_t m, int *numpagedout) error = ENOENT; goto unlock_all; } + vm_page_lock(m); /* * While the object and page were unlocked, the page @@ -653,6 +663,7 @@ vm_pageout_clean(vm_page_t m, int *numpagedout) error = EBUSY; goto unlock_all; } + vm_page_unlock(m); /* * If a page is dirty, then it is either being washed @@ -688,13 +699,14 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) { struct scan_state ss; struct vm_pagequeue *pq; + struct mtx *mtx; vm_object_t object; vm_page_t m, marker; - vm_page_astate_t old, new; - int act_delta, error, numpagedout, queue, refs, starting_target; + int act_delta, error, numpagedout, queue, starting_target; int vnodes_skipped; bool pageout_ok; + mtx = NULL; object = NULL; starting_target = launder; vnodes_skipped = 0; @@ -722,45 +734,77 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) if (__predict_false((m->flags & PG_MARKER) != 0)) continue; + vm_page_change_lock(m, &mtx); + +recheck: /* - * Perform some quick and racy checks of the page's queue state. - * Bail if things are not as we expect. + * The page may have been disassociated from the queue + * or even freed while locks were dropped. We thus must be + * careful whenever modifying page state. Once the object lock + * has been acquired, we have a stable reference to the page. */ - old = vm_page_astate_load(m); - if (old.queue != PQ_LAUNDRY || (old.flags & PGA_ENQUEUED) == 0) + if (vm_page_queue(m) != queue) continue; - if ((old.flags & PGA_QUEUE_OP_MASK) != 0) { + + /* + * A requeue was requested, so this page gets a second + * chance. + */ + if ((m->aflags & PGA_REQUEUE) != 0) { vm_page_pqbatch_submit(m, queue); continue; } + /* + * Wired pages may not be freed. 
Complete their removal + * from the queue now to avoid needless revisits during + * future scans. This check is racy and must be reverified once + * we hold the object lock and have verified that the page + * is not busy. + */ + if (vm_page_wired(m)) { + vm_page_dequeue_deferred(m); + continue; + } + if (object != m->object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); + + /* + * A page's object pointer may be set to NULL before + * the object lock is acquired. + */ object = (vm_object_t)atomic_load_ptr(&m->object); - if (object == NULL) - continue; - VM_OBJECT_WLOCK(object); - if (m->object != object) { - VM_OBJECT_WUNLOCK(object); - object = NULL; - continue; + if (object != NULL && !VM_OBJECT_TRYWLOCK(object)) { + mtx_unlock(mtx); + /* Depends on type-stability. */ + VM_OBJECT_WLOCK(object); + mtx_lock(mtx); + goto recheck; } } + if (__predict_false(m->object == NULL)) + /* + * The page has been removed from its object. + */ + continue; + KASSERT(m->object == object, ("page %p does not belong to %p", + m, object)); if (vm_page_busied(m)) continue; /* - * Check for wirings now that we hold the object lock and have - * verified that the page is unbusied. If the page is mapped, - * it may still be wired by pmap lookups. The call to + * Re-check for wirings now that we hold the object lock and + * have verified that the page is unbusied. If the page is + * mapped, it may still be wired by pmap lookups. The call to * vm_page_try_remove_all() below atomically checks for such * wirings and removes mappings. If the page is unmapped, the * wire count is guaranteed not to increase. */ if (__predict_false(vm_page_wired(m))) { - vm_page_pqbatch_submit(m, queue); + vm_page_dequeue_deferred(m); continue; } @@ -780,64 +824,46 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) * that a reference from a concurrently destroyed mapping is * observed here and now. */ - refs = object->ref_count != 0 ? 
pmap_ts_referenced(m) : 0; + if (object->ref_count != 0) + act_delta = pmap_ts_referenced(m); + else { + KASSERT(!pmap_page_is_mapped(m), + ("page %p is mapped", m)); + act_delta = 0; + } + if ((m->aflags & PGA_REFERENCED) != 0) { + vm_page_aflag_clear(m, PGA_REFERENCED); + act_delta++; + } + if (act_delta != 0) { + if (object->ref_count != 0) { + VM_CNT_INC(v_reactivated); + vm_page_activate(m); - for (old = vm_page_astate_load(m);;) { - if (old.queue != queue || - (old.flags & PGA_ENQUEUED) == 0) - goto next_page; + /* + * Increase the activation count if the page + * was referenced while in the laundry queue. + * This makes it less likely that the page will + * be returned prematurely to the inactive + * queue. + */ + m->act_count += act_delta + ACT_ADVANCE; - if ((old.flags & PGA_QUEUE_OP_MASK) != 0) { - vm_page_pqbatch_submit(m, queue); - goto next_page; + /* + * If this was a background laundering, count + * activated pages towards our target. The + * purpose of background laundering is to ensure + * that pages are eventually cycled through the + * laundry queue, and an activation is a valid + * way out. + */ + if (!in_shortfall) + launder--; + continue; + } else if ((object->flags & OBJ_DEAD) == 0) { + vm_page_requeue(m); + continue; } - - new = old; - act_delta = refs; - if ((old.flags & PGA_REFERENCED) != 0) { - new.flags &= ~PGA_REFERENCED; - act_delta++; - } - if (act_delta != 0) { - if (object->ref_count != 0) { - /* - * Increase the activation count if the - * page was referenced while in the - * laundry queue. This makes it less - * likely that the page will be returned - * prematurely to the inactive queue. 
- */ - new.act_count += ACT_ADVANCE + - act_delta; - if (new.act_count > ACT_MAX) - new.act_count = ACT_MAX; - - new.flags |= PGA_REQUEUE; - new.queue = PQ_ACTIVE; - if (!vm_page_pqstate_commit(m, &old, - new)) - continue; - - VM_CNT_INC(v_reactivated); - - /* - * If this was a background laundering, - * count activated pages towards our - * target. The purpose of background - * laundering is to ensure that pages - * are eventually cycled through the - * laundry queue, and an activation is a - * valid way out. - */ - if (!in_shortfall) - launder--; - goto next_page; - } else if ((object->flags & OBJ_DEAD) == 0) { - vm_page_launder(m); - goto next_page; - } - } - break; } /* @@ -850,7 +876,7 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) if (object->ref_count != 0) { vm_page_test_dirty(m); if (m->dirty == 0 && !vm_page_try_remove_all(m)) { - vm_page_pqbatch_submit(m, queue); + vm_page_dequeue_deferred(m); continue; } } @@ -874,7 +900,7 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) else pageout_ok = true; if (!pageout_ok) { - vm_page_launder(m); + vm_page_requeue(m); continue; } @@ -899,9 +925,13 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) pageout_lock_miss++; vnodes_skipped++; } + mtx = NULL; object = NULL; } -next_page:; + } + if (mtx != NULL) { + mtx_unlock(mtx); + mtx = NULL; } if (object != NULL) { VM_OBJECT_WUNLOCK(object); @@ -1139,13 +1169,12 @@ static void vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage) { struct scan_state ss; + struct mtx *mtx; vm_object_t object; vm_page_t m, marker; - vm_page_astate_t old, new; struct vm_pagequeue *pq; long min_scan; - int act_delta, max_scan, ps_delta, refs, scan_tick; - uint8_t nqueue; + int act_delta, max_scan, scan_tick; marker = &vmd->vmd_markers[PQ_ACTIVE]; pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; @@ -1179,6 +1208,7 @@ vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage) * and scanning resumes. 
*/ max_scan = page_shortage > 0 ? pq->pq_cnt : min_scan; + mtx = NULL; act_scan: vm_pageout_init_scan(&ss, pq, marker, &vmd->vmd_clock[0], max_scan); while ((m = vm_pageout_next(&ss, false)) != NULL) { @@ -1197,6 +1227,29 @@ vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage) if (__predict_false((m->flags & PG_MARKER) != 0)) continue; + vm_page_change_lock(m, &mtx); + + /* + * The page may have been disassociated from the queue + * or even freed while locks were dropped. We thus must be + * careful whenever modifying page state. Once the object lock + * has been acquired, we have a stable reference to the page. + */ + if (vm_page_queue(m) != PQ_ACTIVE) + continue; + + /* + * Wired pages are dequeued lazily. + */ + if (vm_page_wired(m)) { + vm_page_dequeue_deferred(m); + continue; + } + + /* + * A page's object pointer may be set to NULL before + * the object lock is acquired. + */ object = (vm_object_t)atomic_load_ptr(&m->object); if (__predict_false(object == NULL)) /* @@ -1211,104 +1264,80 @@ vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage) * that a reference from a concurrently destroyed mapping is * observed here and now. * - * Perform an unsynchronized object ref count check. While the - * page lock ensures that the page is not reallocated to another - * object, in particular, one with unmanaged mappings that - * cannot support pmap_ts_referenced(), two races are, + * Perform an unsynchronized object ref count check. While + * the page lock ensures that the page is not reallocated to + * another object, in particular, one with unmanaged mappings + * that cannot support pmap_ts_referenced(), two races are, * nonetheless, possible: - * * 1) The count was transitioning to zero, but we saw a non- - * zero value. pmap_ts_referenced() will return zero because - * the page is not mapped. - * 2) The count was transitioning to one, but we saw zero. This - * race delays the detection of a new reference. 
At worst, - * we will deactivate and reactivate the page. + * zero value. pmap_ts_referenced() will return zero + * because the page is not mapped. + * 2) The count was transitioning to one, but we saw zero. + * This race delays the detection of a new reference. At + * worst, we will deactivate and reactivate the page. */ - refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0; + if (object->ref_count != 0) + act_delta = pmap_ts_referenced(m); + else + act_delta = 0; + if ((m->aflags & PGA_REFERENCED) != 0) { + vm_page_aflag_clear(m, PGA_REFERENCED); + act_delta++; + } - for (old = vm_page_astate_load(m);;) { - if (old.queue != PQ_ACTIVE || - (old.flags & PGA_ENQUEUED) == 0) - /* - * Something has moved the page out of the - * active queue. Don't touch it. - */ - break; - if ((old.flags & PGA_DEQUEUE) != 0) { - vm_page_pqbatch_submit(m, PQ_ACTIVE); - break; - } - - new = old; - act_delta = refs; - if ((old.flags & PGA_REFERENCED) != 0) { - new.flags &= ~PGA_REFERENCED; - act_delta++; - } + /* + * Advance or decay the act_count based on recent usage. + */ + if (act_delta != 0) { + m->act_count += ACT_ADVANCE + act_delta; + if (m->act_count > ACT_MAX) + m->act_count = ACT_MAX; + } else + m->act_count -= min(m->act_count, ACT_DECLINE); + if (m->act_count == 0) { /* - * Advance or decay the act_count based on recent usage. + * When not short for inactive pages, let dirty pages go + * through the inactive queue before moving to the + * laundry queues. This gives them some extra time to + * be reactivated, potentially avoiding an expensive + * pageout. However, during a page shortage, the + * inactive queue is necessarily small, and so dirty + * pages would only spend a trivial amount of time in + * the inactive queue. Therefore, we might as well + * place them directly in the laundry queue to reduce + * queuing overhead. 
*/ - if (act_delta != 0) { - new.act_count += ACT_ADVANCE + act_delta; - if (new.act_count > ACT_MAX) - new.act_count = ACT_MAX; - } else { - new.act_count -= min(new.act_count, ACT_DECLINE); - } - - if (new.act_count > 0) { - /* - * Adjust the activation count and keep the page - * in the active queue. The count might be left - * unchanged if it is saturated. - */ - if (new.act_count == old.act_count || - vm_page_astate_fcmpset(m, &old, new)) - break; + if (page_shortage <= 0) { + vm_page_swapqueue(m, PQ_ACTIVE, PQ_INACTIVE); } else { /* - * When not short for inactive pages, let dirty - * pages go through the inactive queue before - * moving to the laundry queues. This gives - * them some extra time to be reactivated, - * potentially avoiding an expensive pageout. - * However, during a page shortage, the inactive - * queue is necessarily small, and so dirty - * pages would only spend a trivial amount of - * time in the inactive queue. Therefore, we - * might as well place them directly in the - * laundry queue to reduce queuing overhead. - * * Calling vm_page_test_dirty() here would * require acquisition of the object's write * lock. However, during a page shortage, - * directing dirty pages into the laundry queue - * is only an optimization and not a + * directing dirty pages into the laundry + * queue is only an optimization and not a * requirement. Therefore, we simply rely on - * the opportunistic updates to the page's dirty - * field by the pmap. + * the opportunistic updates to the page's + * dirty field by the pmap. 
*/ - if (page_shortage <= 0) { - nqueue = PQ_INACTIVE; - ps_delta = 0; - } else if (m->dirty == 0) { - nqueue = PQ_INACTIVE; - ps_delta = act_scan_laundry_weight; + if (m->dirty == 0) { + vm_page_swapqueue(m, PQ_ACTIVE, + PQ_INACTIVE); + page_shortage -= + act_scan_laundry_weight; } else { - nqueue = PQ_LAUNDRY; - ps_delta = 1; - } - - new.flags |= PGA_REQUEUE; - new.queue = nqueue; - if (vm_page_pqstate_commit(m, &old, new)) { - page_shortage -= ps_delta; - break; + vm_page_swapqueue(m, PQ_ACTIVE, + PQ_LAUNDRY); + page_shortage--; } } } } + if (mtx != NULL) { + mtx_unlock(mtx); + mtx = NULL; + } vm_pagequeue_lock(pq); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q); TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q); @@ -1320,30 +1349,20 @@ static int vm_pageout_reinsert_inactive_page(struct scan_state *ss, vm_page_t m) { struct vm_domain *vmd; - vm_page_astate_t old, new; - for (old = vm_page_astate_load(m);;) { - if (old.queue != PQ_INACTIVE || - (old.flags & (PGA_DEQUEUE | PGA_ENQUEUED)) != 0) - break; - - new = old; - new.flags |= PGA_ENQUEUED; - new.flags &= ~(PGA_REQUEUE | PGA_REQUEUE_HEAD); - if (!vm_page_astate_fcmpset(m, &old, new)) - continue; - - if ((old.flags & PGA_REQUEUE_HEAD) != 0) { - vmd = vm_pagequeue_domain(m); - TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q); - } else if ((old.flags & PGA_REQUEUE) != 0) { - TAILQ_INSERT_TAIL(&ss->pq->pq_pl, m, plinks.q); - } else { - TAILQ_INSERT_BEFORE(ss->marker, m, plinks.q); - } - return (1); - } - return (0); + if (m->queue != PQ_INACTIVE || (m->aflags & PGA_ENQUEUED) != 0) + return (0); + vm_page_aflag_set(m, PGA_ENQUEUED); + if ((m->aflags & PGA_REQUEUE_HEAD) != 0) { + vmd = vm_pagequeue_domain(m); + TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q); + vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD); + } else if ((m->aflags & PGA_REQUEUE) != 0) { + TAILQ_INSERT_TAIL(&ss->pq->pq_pl, m, plinks.q); + vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD); + } else 
+ TAILQ_INSERT_BEFORE(ss->marker, m, plinks.q); + return (1); } /* @@ -1386,11 +1405,11 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage, { struct scan_state ss; struct vm_batchqueue rq; + struct mtx *mtx; vm_page_t m, marker; - vm_page_astate_t old, new; struct vm_pagequeue *pq; vm_object_t object; - int act_delta, addl_page_shortage, deficit, page_shortage, refs; + int act_delta, addl_page_shortage, deficit, page_shortage; int starting_page_shortage; /* @@ -1410,6 +1429,7 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage, deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit); starting_page_shortage = page_shortage = shortage + deficit; + mtx = NULL; object = NULL; vm_batchqueue_init(&rq); @@ -1427,31 +1447,65 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage, KASSERT((m->flags & PG_MARKER) == 0, ("marker page %p was dequeued", m)); + vm_page_change_lock(m, &mtx); + +recheck: /* - * Perform some quick and racy checks of the page's queue state. - * Bail if things are not as we expect. + * The page may have been disassociated from the queue + * or even freed while locks were dropped. We thus must be + * careful whenever modifying page state. Once the object lock + * has been acquired, we have a stable reference to the page. */ - old = vm_page_astate_load(m); - if (old.queue != PQ_INACTIVE || (old.flags & PGA_ENQUEUED) != 0) + if (vm_page_queue(m) != PQ_INACTIVE) { + addl_page_shortage++; continue; - if ((old.flags & PGA_QUEUE_OP_MASK) != 0) { - vm_page_pqbatch_submit(m, PQ_INACTIVE); + } + + /* + * The page was re-enqueued after the page queue lock was + * dropped, or a requeue was requested. This page gets a second + * chance. + */ + if ((m->aflags & (PGA_ENQUEUED | PGA_REQUEUE | + PGA_REQUEUE_HEAD)) != 0) + goto reinsert; + + /* + * Wired pages may not be freed. Complete their removal + * from the queue now to avoid needless revisits during + * future scans. 
This check is racy and must be reverified once + * we hold the object lock and have verified that the page + * is not busy. + */ + if (vm_page_wired(m)) { + vm_page_dequeue_deferred(m); continue; } if (object != m->object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); + + /* + * A page's object pointer may be set to NULL before + * the object lock is acquired. + */ object = (vm_object_t)atomic_load_ptr(&m->object); - if (object == NULL) - continue; - VM_OBJECT_WLOCK(object); - if (m->object != object) { - VM_OBJECT_WUNLOCK(object); - object = NULL; - goto reinsert; + if (object != NULL && !VM_OBJECT_TRYWLOCK(object)) { + mtx_unlock(mtx); + /* Depends on type-stability. */ + VM_OBJECT_WLOCK(object); + mtx_lock(mtx); + goto recheck; } } + if (__predict_false(m->object == NULL)) + /* + * The page has been removed from its object. + */ + continue; + KASSERT(m->object == object, ("page %p does not belong to %p", + m, object)); if (vm_page_busied(m)) { /* @@ -1467,15 +1521,15 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage, } /* - * Check for wirings now that we hold the object lock and have - * verified that the page is unbusied. If the page is mapped, - * it may still be wired by pmap lookups. The call to + * Re-check for wirings now that we hold the object lock and + * have verified that the page is unbusied. If the page is + * mapped, it may still be wired by pmap lookups. The call to * vm_page_try_remove_all() below atomically checks for such * wirings and removes mappings. If the page is unmapped, the * wire count is guaranteed not to increase. */ if (__predict_false(vm_page_wired(m))) { - vm_page_pqbatch_submit(m, PQ_INACTIVE); + vm_page_dequeue_deferred(m); continue; } @@ -1495,52 +1549,35 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage, * that a reference from a concurrently destroyed mapping is * observed here and now. */ - refs = object->ref_count != 0 ? 
pmap_ts_referenced(m) : 0; + if (object->ref_count != 0) + act_delta = pmap_ts_referenced(m); + else { + KASSERT(!pmap_page_is_mapped(m), + ("page %p is mapped", m)); + act_delta = 0; + } + if ((m->aflags & PGA_REFERENCED) != 0) { + vm_page_aflag_clear(m, PGA_REFERENCED); + act_delta++; + } + if (act_delta != 0) { + if (object->ref_count != 0) { + VM_CNT_INC(v_reactivated); + vm_page_activate(m); - for (old = vm_page_astate_load(m);;) { - if (old.queue != PQ_INACTIVE || - (old.flags & PGA_ENQUEUED) != 0) - goto next_page; - - if ((old.flags & PGA_QUEUE_OP_MASK) != 0) { - vm_page_pqbatch_submit(m, PQ_INACTIVE); - goto next_page; + /* + * Increase the activation count if the page + * was referenced while in the inactive queue. + * This makes it less likely that the page will + * be returned prematurely to the inactive + * queue. + */ + m->act_count += act_delta + ACT_ADVANCE; + continue; + } else if ((object->flags & OBJ_DEAD) == 0) { + vm_page_aflag_set(m, PGA_REQUEUE); + goto reinsert; } - - new = old; - act_delta = refs; - if ((old.flags & PGA_REFERENCED) != 0) { - new.flags &= ~PGA_REFERENCED; - act_delta++; - } - if (act_delta != 0) { - if (object->ref_count != 0) { - /* - * Increase the activation count if the - * page was referenced while in the - * inactive queue. This makes it less - * likely that the page will be returned - * prematurely to the inactive queue. 
- */ - new.act_count += ACT_ADVANCE + - act_delta; - if (new.act_count > ACT_MAX) - new.act_count = ACT_MAX; - - new.flags |= PGA_REQUEUE; - new.queue = PQ_ACTIVE; - if (!vm_page_pqstate_commit(m, &old, - new)) - continue; - - VM_CNT_INC(v_reactivated); - goto next_page; - } else if ((object->flags & OBJ_DEAD) == 0) { - vm_page_aflag_set(m, PGA_REQUEUE); - goto reinsert; - } - } - break; } /* @@ -1553,7 +1590,7 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage, if (object->ref_count != 0) { vm_page_test_dirty(m); if (m->dirty == 0 && !vm_page_try_remove_all(m)) { - vm_page_pqbatch_submit(m, PQ_INACTIVE); + vm_page_dequeue_deferred(m); continue; } } @@ -1567,30 +1604,25 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage, */ if (m->dirty == 0) { free_page: - /* XXX comment */ - old = vm_page_astate_load(m); - if (old.queue != PQ_INACTIVE || - (old.flags & PGA_QUEUE_STATE_MASK) != 0) { - vm_page_pqbatch_submit(m, PQ_INACTIVE); - goto next_page; - } - /* * Because we dequeued the page and have already * checked for concurrent dequeue and enqueue * requests, we can safely disassociate the page * from the inactive queue. 
*/ - m->astate.queue = PQ_NONE; + KASSERT((m->aflags & PGA_QUEUE_STATE_MASK) == 0, + ("page %p has queue state", m)); + m->queue = PQ_NONE; vm_page_free(m); page_shortage--; } else if ((object->flags & OBJ_DEAD) == 0) vm_page_launder(m); -next_page: continue; reinsert: vm_pageout_reinsert_inactive(&ss, &rq, m); } + if (mtx != NULL) + mtx_unlock(mtx); if (object != NULL) VM_OBJECT_WUNLOCK(object); vm_pageout_reinsert_inactive(&ss, &rq, NULL); diff --git a/sys/vm/vm_pagequeue.h b/sys/vm/vm_pagequeue.h index b3e244755a05..ba5e77ce6c8d 100644 --- a/sys/vm/vm_pagequeue.h +++ b/sys/vm/vm_pagequeue.h @@ -202,8 +202,6 @@ static inline void vm_pagequeue_remove(struct vm_pagequeue *pq, vm_page_t m) { - vm_pagequeue_assert_locked(pq); - TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); } @@ -251,22 +249,6 @@ vm_pagequeue_domain(vm_page_t m) return (VM_DOMAIN(vm_phys_domain(m))); } -static inline struct vm_pagequeue * -_vm_page_pagequeue(vm_page_t m, uint8_t queue) -{ - - if (queue == PQ_NONE) - return (NULL); - return (&vm_pagequeue_domain(m)->vmd_pagequeues[queue]); -} - -static inline struct vm_pagequeue * -vm_page_pagequeue(vm_page_t m) -{ - - return (_vm_page_pagequeue(m, atomic_load_8(&m->astate.queue))); -} - /* * Return the number of pages we need to free-up or cache * A positive number indicates that we do not have enough free pages. 
diff --git a/sys/vm/vm_swapout.c b/sys/vm/vm_swapout.c index e45034348dec..d71c3d07a0d9 100644 --- a/sys/vm/vm_swapout.c +++ b/sys/vm/vm_swapout.c @@ -108,9 +108,8 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include -#include #include +#include #include #include #include @@ -171,56 +170,6 @@ static void swapout_procs(int action); static void vm_req_vmdaemon(int req); static void vm_thread_swapout(struct thread *td); -static void -vm_swapout_object_deactivate_page(vm_page_t m, int remove_mode) -{ - vm_page_astate_t old, new; - int act_delta, refs; - - refs = pmap_ts_referenced(m); - - for (old = vm_page_astate_load(m);;) { - if ((old.flags & PGA_DEQUEUE) != 0) - break; - - act_delta = refs; - if ((old.flags & PGA_REFERENCED) != 0) { - new.flags &= ~PGA_REFERENCED; - act_delta++; - } - - if (old.queue != PQ_ACTIVE && act_delta != 0) { - if (new.act_count == ACT_MAX) - break; - new.act_count += act_delta; - new.flags |= PGA_REQUEUE; - new.queue = PQ_ACTIVE; - if (vm_page_pqstate_commit(m, &old, new)) - break; - } else if (old.queue == PQ_ACTIVE) { - if (act_delta == 0) { - new.act_count -= min(new.act_count, - ACT_DECLINE); - if (!remove_mode && new.act_count == 0) { - (void)vm_page_try_remove_all(m); - - new.flags |= PGA_REQUEUE; - new.queue = PQ_INACTIVE; - } - if (vm_page_pqstate_commit(m, &old, new)) - break; - } else { - if (new.act_count < ACT_MAX - ACT_ADVANCE) - new.act_count += ACT_ADVANCE; - if (vm_page_astate_fcmpset(m, &old, new)) - break; - } - } else { - (void)vm_page_try_remove_all(m); - } - } -} - /* * vm_swapout_object_deactivate_pages * @@ -235,7 +184,7 @@ vm_swapout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object, { vm_object_t backing_object, object; vm_page_t p; - int remove_mode; + int act_delta, remove_mode; VM_OBJECT_ASSERT_LOCKED(first_object); if ((first_object->flags & OBJ_FICTITIOUS) != 0) @@ -271,8 +220,37 @@ vm_swapout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object, VM_CNT_INC(v_pdpages); if 
(!pmap_page_exists_quick(pmap, p)) continue; - - vm_swapout_object_deactivate_page(p, remove_mode); + act_delta = pmap_ts_referenced(p); + vm_page_lock(p); + if ((p->aflags & PGA_REFERENCED) != 0) { + if (act_delta == 0) + act_delta = 1; + vm_page_aflag_clear(p, PGA_REFERENCED); + } + if (!vm_page_active(p) && act_delta != 0) { + vm_page_activate(p); + p->act_count += act_delta; + } else if (vm_page_active(p)) { + /* + * The page daemon does not requeue pages + * after modifying their activation count. + */ + if (act_delta == 0) { + p->act_count -= min(p->act_count, + ACT_DECLINE); + if (!remove_mode && p->act_count == 0) { + (void)vm_page_try_remove_all(p); + vm_page_deactivate(p); + } + } else { + vm_page_activate(p); + if (p->act_count < ACT_MAX - + ACT_ADVANCE) + p->act_count += ACT_ADVANCE; + } + } else if (vm_page_inactive(p)) + (void)vm_page_try_remove_all(p); + vm_page_unlock(p); } if ((backing_object = object->backing_object) == NULL) goto unlock_return;