From 9d5cba36c50aca29c2a77f3f9c8e1ae03272c66a Mon Sep 17 00:00:00 2001 From: markj <markj@FreeBSD.org> Date: Wed, 13 Feb 2019 17:19:37 +0000 Subject: [PATCH] Implement transparent 2MB superpage promotion for RISC-V. This includes support for pmap_enter(..., psind=1) as described in the commit log message for r321378. The changes are largely modelled after amd64. arm64 has more stringent requirements around superpage creation to avoid the possibility of TLB conflict aborts, and these requirements do not apply to RISC-V, which like amd64 permits simultaneous caching of 4KB and 2MB translations for a given page. RISC-V's PTE format includes only two software bits, and as these are already consumed we do not have an analogue for amd64's PG_PROMOTED. Instead, pmap_remove_l2() always invalidates the entire 2MB address range. pmap_ts_referenced() is modified to clear PTE_A, now that we support both hardware- and software-managed reference and dirty bits. Also fix pmap_fault_fixup() so that it does not set PTE_A or PTE_D on kernel mappings. Reviewed by: kib (earlier version) Discussed with: jhb Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D18863 Differential Revision: https://reviews.freebsd.org/D18864 Differential Revision: https://reviews.freebsd.org/D18865 Differential Revision: https://reviews.freebsd.org/D18866 Differential Revision: https://reviews.freebsd.org/D18867 Differential Revision: https://reviews.freebsd.org/D18868 --- sys/riscv/include/param.h | 2 +- sys/riscv/include/pmap.h | 4 + sys/riscv/include/pte.h | 5 +- sys/riscv/include/vmparam.h | 4 +- sys/riscv/riscv/pmap.c | 1469 +++++++++++++++++++++++++++++------ sys/vm/vm_fault.c | 8 +- 6 files changed, 1247 insertions(+), 245 deletions(-) diff --git a/sys/riscv/include/param.h b/sys/riscv/include/param.h index f22e747ffdcc..4b4610bea8d1 100644 --- a/sys/riscv/include/param.h +++ b/sys/riscv/include/param.h @@ -82,7 +82,7 @@ #define PAGE_SIZE (1 << PAGE_SHIFT) /* Page size */ #define PAGE_MASK (PAGE_SIZE - 1) -#define MAXPAGESIZES 1 /* maximum number of supported page sizes */ +#define MAXPAGESIZES 3 /* maximum number of supported page sizes */ #ifndef KSTACK_PAGES #define KSTACK_PAGES 4 /* pages of kernel stack (with pcb) */ diff --git a/sys/riscv/include/pmap.h b/sys/riscv/include/pmap.h index 79c5dc2a3da2..92eeee26331f 100644 --- a/sys/riscv/include/pmap.h +++ b/sys/riscv/include/pmap.h @@ -44,6 +44,8 @@ #include <sys/_lock.h> #include <sys/_mutex.h> +#include <vm/_vm_radix.h> + #ifdef _KERNEL #define vtophys(va) pmap_kextract((vm_offset_t)(va)) @@ -80,6 +82,7 @@ struct pmap { pd_entry_t *pm_l1; TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ LIST_ENTRY(pmap) pm_list; /* List of all pmaps */ + struct vm_radix pm_root; }; typedef struct pv_entry { @@ -139,6 +142,7 @@ void pmap_kenter_device(vm_offset_t, vm_size_t, vm_paddr_t); vm_paddr_t pmap_kextract(vm_offset_t va); void pmap_kremove(vm_offset_t); void pmap_kremove_device(vm_offset_t, vm_size_t); +bool pmap_ps_enabled(pmap_t); void *pmap_mapdev(vm_offset_t, vm_size_t); void *pmap_mapbios(vm_paddr_t, vm_size_t); diff --git a/sys/riscv/include/pte.h b/sys/riscv/include/pte.h index 723c9cf26ffb..a88566d890ae 100644 --- a/sys/riscv/include/pte.h +++ b/sys/riscv/include/pte.h @@ -62,7 +62,8 @@ typedef uint64_t pn_t; /* page number */ #define L3_SIZE (1 << L3_SHIFT) #define L3_OFFSET (L3_SIZE - 1) -#define Ln_ENTRIES (1 << 9) +#define Ln_ENTRIES_SHIFT 9 +#define Ln_ENTRIES (1 << Ln_ENTRIES_SHIFT) #define Ln_ADDR_MASK (Ln_ENTRIES - 1) /* Bits 9:8 are reserved for software */ @@ -79,6 +80,8 @@
typedef uint64_t pn_t; /* page number */ #define PTE_RWX (PTE_R | PTE_W | PTE_X) #define PTE_RX (PTE_R | PTE_X) #define PTE_KERN (PTE_V | PTE_R | PTE_W | PTE_A | PTE_D) +#define PTE_PROMOTE (PTE_V | PTE_RWX | PTE_D | PTE_A | PTE_G | PTE_U | \ + PTE_SW_MANAGED | PTE_SW_WIRED) #define PTE_PPN0_S 10 #define PTE_PPN1_S 19 diff --git a/sys/riscv/include/vmparam.h b/sys/riscv/include/vmparam.h index 49c720e681c2..ee03f7b09cc2 100644 --- a/sys/riscv/include/vmparam.h +++ b/sys/riscv/include/vmparam.h @@ -99,10 +99,10 @@ #define VM_NFREEORDER 12 /* - * Disable superpage reservations. + * Enable superpage reservations: 1 level. */ #ifndef VM_NRESERVLEVEL -#define VM_NRESERVLEVEL 0 +#define VM_NRESERVLEVEL 1 #endif /* diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c index 2e9c3a12579b..3929fe1b3e51 100644 --- a/sys/riscv/riscv/pmap.c +++ b/sys/riscv/riscv/pmap.c @@ -118,6 +118,7 @@ __FBSDID("$FreeBSD$"); */ #include +#include <sys/bitstring.h> #include #include #include @@ -145,6 +146,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include <vm/vm_radix.h> #include #include #include @@ -154,9 +156,8 @@ __FBSDID("$FreeBSD$"); #include #include -#define NPDEPG (PAGE_SIZE/(sizeof (pd_entry_t))) -#define NUPDE (NPDEPG * NPDEPG) -#define NUSERPGTBLS (NUPDE + NPDEPG) +#define NUL1E (Ln_ENTRIES * Ln_ENTRIES) +#define NUL2E (Ln_ENTRIES * NUL1E) #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ @@ -175,11 +176,12 @@ __FBSDID("$FreeBSD$"); #endif #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) +#define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ - (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) + (&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ @@ -230,12 +232,51 @@ CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS); static struct rwlock_padalign pvh_global_lock; static struct mtx_padalign allpmaps_lock; +static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, + "VM/pmap parameters"); + +static int superpages_enabled = 1; +SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, + CTLFLAG_RDTUN, &superpages_enabled, 0, + "Enable support for transparent superpages"); + +static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0, + "2MB page mapping counters"); + +static u_long pmap_l2_demotions; +SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, + &pmap_l2_demotions, 0, + "2MB page demotions"); + +static u_long pmap_l2_mappings; +SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, + &pmap_l2_mappings, 0, + "2MB page mappings"); + +static u_long pmap_l2_p_failures; +SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, + &pmap_l2_p_failures, 0, + "2MB page promotion failures"); + +static u_long pmap_l2_promotions; +SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, + &pmap_l2_promotions, 0, + "2MB page promotions"); + /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; +static struct md_page *pv_table; +static struct md_page pv_dummy; + +/* + * Internal flags for pmap_enter()'s helper functions. + */ +#define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ +#define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings.
*/ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); @@ -244,6 +285,11 @@ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); +static bool pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va); +static bool pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, + vm_offset_t va, struct rwlock **lockp); +static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, + u_int flags, vm_page_t m, struct rwlock **lockp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, @@ -254,9 +300,9 @@ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); -static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, +static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); -static int pmap_unuse_l3(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); +static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); #define pmap_clear(pte) pmap_store(pte, 0) #define pmap_clear_bits(pte, bits) atomic_clear_64(pte, bits) @@ -636,7 +682,8 @@ pmap_page_init(vm_page_t m) void pmap_init(void) { - int i; + vm_size_t s; + int i, pv_npg; /* * Initialize the pv chunk and pmap list mutexes. @@ -649,6 +696,24 @@ pmap_init(void) */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); + + /* + * Calculate the size of the pv head table for superpages. + */ + pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); + + /* + * Allocate memory for the pv head table for superpages. + */ + s = (vm_size_t)(pv_npg * sizeof(struct md_page)); + s = round_page(s); + pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); + for (i = 0; i < pv_npg; i++) + TAILQ_INIT(&pv_table[i].pv_list); + TAILQ_INIT(&pv_dummy.pv_list); + + if (superpages_enabled) + pagesizes[1] = L2_SIZE; } #ifdef SMP @@ -999,6 +1064,13 @@ pmap_qremove(vm_offset_t sva, int count) pmap_invalidate_range(kernel_pmap, sva, va); } +bool +pmap_ps_enabled(pmap_t pmap __unused) +{ + + return (superpages_enabled); +} + /*************************************************** * Page table page management routines..... ***************************************************/ @@ -1018,6 +1090,34 @@ pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } + +/* + * Inserts the specified page table page into the specified pmap's collection + * of idle page table pages. Each of a pmap's page table pages is responsible + * for mapping a distinct range of virtual addresses. The pmap's collection is + * ordered by this virtual address range. + */ +static __inline int +pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + return (vm_radix_insert(&pmap->pm_root, ml3)); +} + +/* + * Removes the page table page mapping the specified virtual address from the + * specified pmap's collection of idle page table pages, and returns it. + * Otherwise, returns NULL if there is no page table page corresponding to the + * specified virtual address. 
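+ *
+ * Illustrative use, mirroring pmap_demote_l2_locked() below (allocation
+ * flags abbreviated):
+ *
+ *	if ((mpte = pmap_remove_pt_page(pmap, va)) == NULL)
+ *		mpte = vm_page_alloc(NULL, pmap_l2_pindex(va),
+ *		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);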
+ */ +static __inline vm_page_t +pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); +} /* * Decrements a page table page's wire count, which is used to record the @@ -1026,12 +1126,12 @@ pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, * page table page was unmapped and FALSE otherwise. */ static inline boolean_t -pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) +pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { - _pmap_unwire_l3(pmap, va, m, free); + _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else { return (FALSE); @@ -1039,36 +1139,30 @@ pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) } static void -_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) +_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { vm_paddr_t phys; PMAP_LOCK_ASSERT(pmap, MA_OWNED); - /* - * unmap the page table page - */ - if (m->pindex >= NUPDE) { - /* PD page */ + if (m->pindex >= NUL1E) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); pmap_clear(l1); pmap_distribute_l1(pmap, pmap_l1_index(va), 0); } else { - /* PTE page */ pd_entry_t *l2; l2 = pmap_l2(pmap, va); pmap_clear(l2); } pmap_resident_count_dec(pmap, 1); - if (m->pindex < NUPDE) { + if (m->pindex < NUL1E) { pd_entry_t *l1; - /* We just released a PT, unhold the matching PD */ vm_page_t pdpg; l1 = pmap_l1(pmap, va); phys = PTE_TO_PHYS(pmap_load(l1)); pdpg = PHYS_TO_VM_PAGE(phys); - pmap_unwire_l3(pmap, va, pdpg, free); + pmap_unwire_ptp(pmap, va, pdpg, free); } pmap_invalidate_page(pmap, va); @@ -1082,24 +1176,20 @@ _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) } /* - * After removing an l3 entry, this routine is used to + * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int -pmap_unuse_l3(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, +pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { - vm_paddr_t phys; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); - - phys = PTE_TO_PHYS(ptepde); - - mpte = PHYS_TO_VM_PAGE(phys); - return (pmap_unwire_l3(pmap, va, mpte, free)); + mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde)); + return (pmap_unwire_ptp(pmap, va, mpte, free)); } void @@ -1140,6 +1230,8 @@ pmap_pinit(pmap_t pmap) LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock(&allpmaps_lock); + vm_radix_init(&pmap->pm_root); + return (1); } @@ -1193,11 +1285,11 @@ _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) * it isn't already there. 
*/ - if (ptepindex >= NUPDE) { + if (ptepindex >= NUL1E) { pd_entry_t *l1; vm_pindex_t l1index; - l1index = ptepindex - NUPDE; + l1index = ptepindex - NUL1E; l1 = &pmap->pm_l1[l1index]; pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE); @@ -1213,7 +1305,7 @@ _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) l1 = &pmap->pm_l1[l1index]; if (pmap_load(l1) == 0) { /* recurse for allocating page dir */ - if (_pmap_alloc_l3(pmap, NUPDE + l1index, + if (_pmap_alloc_l3(pmap, NUL1E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); @@ -1240,6 +1332,29 @@ _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) return (m); } +static vm_page_t +pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) +{ + pd_entry_t *l1; + vm_page_t l2pg; + vm_pindex_t l2pindex; + +retry: + l1 = pmap_l1(pmap, va); + if (l1 != NULL && (pmap_load(l1) & PTE_RWX) == 0) { + /* Add a reference to the L2 page. */ + l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1))); + l2pg->wire_count++; + } else { + /* Allocate a L2 page. */ + l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; + l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); + if (l2pg == NULL && lockp != NULL) + goto retry; + } + return (l2pg); +} + static vm_page_t pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { @@ -1598,6 +1713,79 @@ get_pv_entry(pmap_t pmap, struct rwlock **lockp) return (pv); } +/* + * Ensure that the number of spare PV entries in the specified pmap meets or + * exceeds the given count, "needed". + * + * The given PV list lock may be released. + */ +static void +reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) +{ + struct pch new_tail; + struct pv_chunk *pc; + vm_page_t m; + int avail, free; + bool reclaimed; + + rw_assert(&pvh_global_lock, RA_LOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); + + /* + * Newly allocated PV chunks must be stored in a private list until + * the required number of PV chunks have been allocated. Otherwise, + * reclaim_pv_chunk() could recycle one of these chunks. In + * contrast, these chunks must be added to the pmap upon allocation. + */ + TAILQ_INIT(&new_tail); +retry: + avail = 0; + TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { + bit_count((bitstr_t *)pc->pc_map, 0, + sizeof(pc->pc_map) * NBBY, &free); + if (free == 0) + break; + avail += free; + if (avail >= needed) + break; + } + for (reclaimed = false; avail < needed; avail += _NPCPV) { + m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED); + if (m == NULL) { + m = reclaim_pv_chunk(pmap, lockp); + if (m == NULL) + goto retry; + reclaimed = true; + } + /* XXX PV STATS */ +#if 0 + dump_add_page(m->phys_addr); +#endif + pc = (void *)PHYS_TO_DMAP(m->phys_addr); + pc->pc_pmap = pmap; + pc->pc_map[0] = PC_FREE0; + pc->pc_map[1] = PC_FREE1; + pc->pc_map[2] = PC_FREE2; + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); + + /* + * The reclaim might have freed a chunk from the current pmap. + * If that chunk contained available entries, we need to + * re-count the number of available entries. + */ + if (reclaimed) + goto retry; + } + if (!TAILQ_EMPTY(&new_tail)) { + mtx_lock(&pv_chunks_mutex); + TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); + mtx_unlock(&pv_chunks_mutex); + } +} + /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. 
Returns the pv entry if found and NULL @@ -1632,7 +1820,7 @@ pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) pv = pmap_pvh_remove(pvh, pmap, va); - KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); + KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va)); free_pv_entry(pmap, pv); } @@ -1659,6 +1847,222 @@ pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, return (FALSE); } +/* + * After demotion from a 2MB page mapping to 512 4KB page mappings, + * destroy the pv entry for the 2MB page mapping and reinstantiate the pv + * entries for each of the 4KB page mappings. + */ +static void __unused +pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp) +{ + struct md_page *pvh; + struct pv_chunk *pc; + pv_entry_t pv; + vm_page_t m; + vm_offset_t va_last; + int bit, field; + + rw_assert(&pvh_global_lock, RA_LOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); + + /* + * Transfer the 2mpage's pv entry for this mapping to the first + * page's pv list. Once this transfer begins, the pv list lock + * must not be released until the last pv entry is reinstantiated. + */ + pvh = pa_to_pvh(pa); + va &= ~L2_OFFSET; + pv = pmap_pvh_remove(pvh, pmap, va); + KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); + m = PHYS_TO_VM_PAGE(pa); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + /* Instantiate the remaining 511 pv entries. */ + va_last = va + L2_SIZE - PAGE_SIZE; + for (;;) { + pc = TAILQ_FIRST(&pmap->pm_pvchunk); + KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || + pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); + for (field = 0; field < _NPCM; field++) { + while (pc->pc_map[field] != 0) { + bit = ffsl(pc->pc_map[field]) - 1; + pc->pc_map[field] &= ~(1ul << bit); + pv = &pc->pc_pventry[field * 64 + bit]; + va += PAGE_SIZE; + pv->pv_va = va; + m++; + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_pv_demote_l2: page %p is not managed", m)); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + if (va == va_last) + goto out; + } + } + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); + } +out: + if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); + } + /* XXX PV stats */ +} + +#if VM_NRESERVLEVEL > 0 +static void +pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp) +{ + struct md_page *pvh; + pv_entry_t pv; + vm_page_t m; + vm_offset_t va_last; + + rw_assert(&pvh_global_lock, RA_LOCKED); + KASSERT((va & L2_OFFSET) == 0, + ("pmap_pv_promote_l2: misaligned va %#lx", va)); + + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); + + m = PHYS_TO_VM_PAGE(pa); + pv = pmap_pvh_remove(&m->md, pmap, va); + KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va)); + pvh = pa_to_pvh(pa); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); + pvh->pv_gen++; + + va_last = va + L2_SIZE - PAGE_SIZE; + do { + m++; + va += PAGE_SIZE; + pmap_pvh_free(&m->md, pmap, va); + } while (va < va_last); +} +#endif /* VM_NRESERVLEVEL > 0 */ + +/* + * Create the PV entry for a 2MB page mapping. Always returns true unless the + * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns + * false if the PV entry cannot be allocated without resorting to reclamation. 
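+ *
+ * A sketch of the expected caller behaviour (cf. pmap_enter_l2() below),
+ * where a false return obliges the caller to undo any partial work:
+ *
+ *	if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp))
+ *		return (KERN_RESOURCE_SHORTAGE);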
+ */ +static bool +pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, + struct rwlock **lockp) +{ + struct md_page *pvh; + pv_entry_t pv; + vm_paddr_t pa; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* Pass NULL instead of the lock pointer to disable reclamation. */ + if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? + NULL : lockp)) == NULL) + return (false); + pv->pv_va = va; + pa = PTE_TO_PHYS(l2e); + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); + pvh = pa_to_pvh(pa); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); + pvh->pv_gen++; + return (true); +} + +static void +pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) +{ + pt_entry_t newl2, oldl2; + vm_page_t ml3; + vm_paddr_t ml3pa; + + KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); + KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + ml3 = pmap_remove_pt_page(pmap, va); + if (ml3 == NULL) + panic("pmap_remove_kernel_l2: Missing pt page"); + + ml3pa = VM_PAGE_TO_PHYS(ml3); + newl2 = ml3pa | PTE_V; + + /* + * Initialize the page table page. + */ + pagezero((void *)PHYS_TO_DMAP(ml3pa)); + + /* + * Demote the mapping. + */ + oldl2 = pmap_load_store(l2, newl2); + KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", + __func__, l2, oldl2)); +} + +/* + * pmap_remove_l2: Do the things to unmap a level 2 superpage. + */ +static int +pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, + pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) +{ + struct md_page *pvh; + pt_entry_t oldl2; + vm_offset_t eva, va; + vm_page_t m, ml3; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); + oldl2 = pmap_load_clear(l2); + KASSERT((oldl2 & PTE_RWX) != 0, + ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); + + /* + * The sfence.vma documentation states that it is sufficient to specify + * a single address within a superpage mapping. However, since we do + * not perform any invalidation upon promotion, TLBs may still be + * caching 4KB mappings within the superpage, so we must invalidate the + * entire range. 
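+ *
+ * Conceptually, the call below must be at least as strong as one
+ * sfence.vma per 4KB page in the range, i.e.:
+ *
+ *	for (va = sva; va < sva + L2_SIZE; va += PAGE_SIZE)
+ *		sfence_vma_page(va);
+ *
+ * though pmap_invalidate_range() is free to implement this with a
+ * coarser flush.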
+ */ + pmap_invalidate_range(pmap, sva, sva + L2_SIZE); + if ((oldl2 & PTE_SW_WIRED) != 0) + pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; + pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); + if ((oldl2 & PTE_SW_MANAGED) != 0) { + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); + pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); + pmap_pvh_free(pvh, pmap, sva); + eva = sva + L2_SIZE; + for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2)); + va < eva; va += PAGE_SIZE, m++) { + if ((oldl2 & PTE_D) != 0) + vm_page_dirty(m); + if ((oldl2 & PTE_A) != 0) + vm_page_aflag_set(m, PGA_REFERENCED); + if (TAILQ_EMPTY(&m->md.pv_list) && + TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } + } + if (pmap == kernel_pmap) { + pmap_remove_kernel_l2(pmap, l2, sva); + } else { + ml3 = pmap_remove_pt_page(pmap, sva); + if (ml3 != NULL) { + pmap_resident_count_dec(pmap, 1); + KASSERT(ml3->wire_count == Ln_ENTRIES, + ("pmap_remove_l2: l3 page wire count error")); + ml3->wire_count = 1; + vm_page_unwire_noq(ml3); + pmap_add_delayed_free_list(ml3, free, FALSE); + } + } + return (pmap_unuse_pt(pmap, sva, l1e, free)); +} + /* * pmap_remove_l3: do the things to unmap a page in a process */ @@ -1687,7 +2091,7 @@ pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, pmap_pvh_free(&m->md, pmap, va); } - return (pmap_unuse_l3(pmap, va, l2e, free)); + return (pmap_unuse_pt(pmap, va, l2e, free)); } /* @@ -1699,11 +2103,11 @@ pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { + struct spglist free; struct rwlock *lock; vm_offset_t va, va_next; - pd_entry_t *l1, *l2; - pt_entry_t l3_pte, *l3; - struct spglist free; + pd_entry_t *l1, *l2, l2e; + pt_entry_t *l3; /* * Perform an unsynchronized read. This is, however, safe. @@ -1739,16 +2143,22 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL) continue; - - l3_pte = pmap_load(l2); - - /* - * Weed out invalid mappings. - */ - if (l3_pte == 0) - continue; - if ((pmap_load(l2) & PTE_RX) != 0) + if ((l2e = pmap_load(l2)) == 0) continue; + if ((l2e & PTE_RWX) != 0) { + if (sva + L2_SIZE == va_next && eva >= va_next) { + (void)pmap_remove_l2(pmap, l2, sva, + pmap_load(l1), &free, &lock); + continue; + } else if (!pmap_demote_l2_locked(pmap, l2, sva, + &lock)) { + /* + * The large page mapping was destroyed. 
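+ * pmap_demote_l2_locked() fails only after removing
+ * the 2MB mapping itself, so no 4KB mappings remain
+ * to be removed here.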
+ */ + continue; + } + l2e = pmap_load(l2); + } /* * Limit our scan to either the end of the va represented @@ -1761,8 +2171,6 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) va = va_next; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { - if (l3 == NULL) - panic("l3 == NULL"); if (pmap_load(l3) == 0) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); @@ -1772,8 +2180,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) } if (va == va_next) va = sva; - if (pmap_remove_l3(pmap, l3, sva, l3_pte, &free, - &lock)) { + if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { sva += L3_SIZE; break; } @@ -1783,7 +2190,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) } if (lock != NULL) rw_wunlock(lock); - rw_runlock(&pvh_global_lock); + rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, false); } @@ -1804,42 +2211,54 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) void pmap_remove_all(vm_page_t m) { - pv_entry_t pv; - pmap_t pmap; - pt_entry_t *l3, tl3; - pd_entry_t *l2, tl2; struct spglist free; + struct md_page *pvh; + pmap_t pmap; + pt_entry_t *l3, l3e; + pd_entry_t *l2, l2e; + pv_entry_t pv; + vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); + pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : + pa_to_pvh(VM_PAGE_TO_PHYS(m)); + rw_wlock(&pvh_global_lock); + while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + va = pv->pv_va; + l2 = pmap_l2(pmap, va); + (void)pmap_demote_l2(pmap, l2, va); + PMAP_UNLOCK(pmap); + } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap_resident_count_dec(pmap, 1); l2 = pmap_l2(pmap, pv->pv_va); KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); - tl2 = pmap_load(l2); + l2e = pmap_load(l2); - KASSERT((tl2 & PTE_RX) == 0, - ("pmap_remove_all: found a table when expecting " - "a block in %p's pv list", m)); + KASSERT((l2e & PTE_RX) == 0, + ("pmap_remove_all: found a superpage in %p's pv list", m)); l3 = pmap_l2_to_l3(l2, pv->pv_va); - tl3 = pmap_load_clear(l3); + l3e = pmap_load_clear(l3); pmap_invalidate_page(pmap, pv->pv_va); - if (tl3 & PTE_SW_WIRED) + if (l3e & PTE_SW_WIRED) pmap->pm_stats.wired_count--; - if ((tl3 & PTE_A) != 0) + if ((l3e & PTE_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. 
*/ - if ((tl3 & PTE_D) != 0) + if ((l3e & PTE_D) != 0) vm_page_dirty(m); - pmap_unuse_l3(pmap, pv->pv_va, pmap_load(l2), &free); + pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); @@ -1857,10 +2276,12 @@ pmap_remove_all(vm_page_t m) void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { - pd_entry_t *l1, *l2; + pd_entry_t *l1, *l2, l2e; pt_entry_t *l3, l3e, mask; vm_page_t m; - vm_offset_t va_next; + vm_paddr_t pa; + vm_offset_t va, va_next; + bool anychanged, pv_lists_locked; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); @@ -1871,12 +2292,14 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) (VM_PROT_WRITE | VM_PROT_EXECUTE)) return; + anychanged = false; + pv_lists_locked = false; mask = 0; if ((prot & VM_PROT_WRITE) == 0) mask |= PTE_W | PTE_D; if ((prot & VM_PROT_EXECUTE) == 0) mask |= PTE_X; - +resume: PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1 = pmap_l1(pmap, sva); @@ -1892,10 +2315,41 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); - if (l2 == NULL || pmap_load(l2) == 0) - continue; - if ((pmap_load(l2) & PTE_RX) != 0) + if (l2 == NULL || (l2e = pmap_load(l2)) == 0) continue; + if ((l2e & PTE_RWX) != 0) { + if (sva + L2_SIZE == va_next && eva >= va_next) { +retryl2: + if ((l2e & (PTE_SW_MANAGED | PTE_D)) == + (PTE_SW_MANAGED | PTE_D)) { + pa = PTE_TO_PHYS(l2e); + for (va = sva, m = PHYS_TO_VM_PAGE(pa); + va < va_next; m++, va += PAGE_SIZE) + vm_page_dirty(m); + } + if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) + goto retryl2; + anychanged = true; + } else { + if (!pv_lists_locked) { + pv_lists_locked = true; + if (!rw_try_rlock(&pvh_global_lock)) { + if (anychanged) + pmap_invalidate_all( + pmap); + PMAP_UNLOCK(pmap); + rw_rlock(&pvh_global_lock); + goto resume; + } + } + if (!pmap_demote_l2(pmap, l2, sva)) { + /* + * The large page mapping was destroyed. 
+ */ + continue; + } + } + } if (va_next > eva) va_next = eva; @@ -1903,7 +2357,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { l3e = pmap_load(l3); -retry: +retryl3: if ((l3e & PTE_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0 && @@ -1913,59 +2367,235 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) vm_page_dirty(m); } if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) - goto retry; - /* XXX: Use pmap_invalidate_range */ - pmap_invalidate_page(pmap, sva); + goto retryl3; + anychanged = true; } } + if (anychanged) + pmap_invalidate_all(pmap); + if (pv_lists_locked) + rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } int pmap_fault_fixup(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) { - pt_entry_t orig_l3; - pt_entry_t new_l3; - pt_entry_t *l3; + pd_entry_t *l2, l2e; + pt_entry_t bits, *pte, oldpte; int rv; rv = 0; - PMAP_LOCK(pmap); - - l3 = pmap_l3(pmap, va); - if (l3 == NULL) - goto done; - - orig_l3 = pmap_load(l3); - if ((orig_l3 & PTE_V) == 0 || - (ftype == VM_PROT_WRITE && (orig_l3 & PTE_W) == 0) || - (ftype == VM_PROT_EXECUTE && (orig_l3 & PTE_X) == 0) || - (ftype == VM_PROT_READ && (orig_l3 & PTE_R) == 0)) - goto done; - - new_l3 = orig_l3 | PTE_A; - if (ftype == VM_PROT_WRITE) - new_l3 |= PTE_D; - - if (orig_l3 != new_l3) { - pmap_store(l3, new_l3); - pmap_invalidate_page(pmap, va); - rv = 1; + l2 = pmap_l2(pmap, va); + if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) goto done; + if ((l2e & PTE_RWX) == 0) { + pte = pmap_l2_to_l3(l2, va); + if (pte == NULL || ((oldpte = pmap_load(pte) & PTE_V)) == 0) + goto done; + } else { + pte = l2; + oldpte = l2e; } - /* - * XXX: This case should never happen since it means - * the PTE shouldn't have resulted in a fault. - */ + if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || + (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || + (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || + (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) + goto done; + bits = PTE_A; + if (ftype == VM_PROT_WRITE) + bits |= PTE_D; + + /* + * Spurious faults can occur if the implementation caches invalid + * entries in the TLB, or if simultaneous accesses on multiple CPUs + * race with each other. + */ + if ((oldpte & bits) != bits) + pmap_store_bits(pte, bits); + sfence_vma(); + rv = 1; done: PMAP_UNLOCK(pmap); - return (rv); } +static bool +pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) +{ + struct rwlock *lock; + bool rv; + + lock = NULL; + rv = pmap_demote_l2_locked(pmap, l2, va, &lock); + if (lock != NULL) + rw_wunlock(lock); + return (rv); +} + +/* + * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page + * mapping is invalidated. + */ +static bool +pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, + struct rwlock **lockp) +{ + struct spglist free; + vm_page_t mpte; + pd_entry_t newl2, oldl2; + pt_entry_t *firstl3, newl3; + vm_paddr_t mptepa; + int i; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + oldl2 = pmap_load(l2); + KASSERT((oldl2 & PTE_RWX) != 0, + ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); + if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == + NULL) { + if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc(NULL, + pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? 
VM_ALLOC_INTERRUPT : + VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == + NULL) { + SLIST_INIT(&free); + (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, + pmap_load(pmap_l1(pmap, va)), &free, lockp); + vm_page_free_pages_toq(&free, true); + CTR2(KTR_PMAP, "pmap_demote_l2_locked: " + "failure for va %#lx in pmap %p", va, pmap); + return (false); + } + if (va < VM_MAXUSER_ADDRESS) + pmap_resident_count_inc(pmap, 1); + } + mptepa = VM_PAGE_TO_PHYS(mpte); + firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); + newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; + KASSERT((oldl2 & PTE_A) != 0, + ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); + KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, + ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); + newl3 = oldl2; + + /* + * If the page table page is new, initialize it. + */ + if (mpte->wire_count == 1) { + mpte->wire_count = Ln_ENTRIES; + for (i = 0; i < Ln_ENTRIES; i++) + pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); + } + KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), + ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " + "addresses")); + + /* + * If the mapping has changed attributes, update the page table + * entries. + */ + if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) + for (i = 0; i < Ln_ENTRIES; i++) + pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); + + /* + * The spare PV entries must be reserved prior to demoting the + * mapping, that is, prior to changing the L2 entry. Otherwise, the + * state of the L2 entry and the PV lists will be inconsistent, which + * can result in reclaim_pv_chunk() attempting to remove a PV entry from + * the wrong PV list and pmap_pv_demote_l2() failing to find the + * expected PV entry for the 2MB page mapping that is being demoted. + */ + if ((oldl2 & PTE_SW_MANAGED) != 0) + reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); + + /* + * Demote the mapping. + */ + pmap_store(l2, newl2); + + /* + * Demote the PV entry. 
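+ *
+ * That is, replace the single pv entry for the 2MB page with one pv
+ * entry per constituent 4KB page; the 511 spares reserved above
+ * ensure that pmap_pv_demote_l2() cannot fail.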
+ */ + if ((oldl2 & PTE_SW_MANAGED) != 0) + pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); + + atomic_add_long(&pmap_l2_demotions, 1); + CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", + va, pmap); + return (true); +} + +#if VM_NRESERVLEVEL > 0 +static void +pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, + struct rwlock **lockp) +{ + pt_entry_t *firstl3, *l3; + vm_paddr_t pa; + vm_page_t ml3; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + va &= ~L2_OFFSET; + KASSERT((pmap_load(l2) & PTE_RWX) == 0, + ("pmap_promote_l2: invalid l2 entry %p", l2)); + + firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); + pa = PTE_TO_PHYS(pmap_load(firstl3)); + if ((pa & L2_OFFSET) != 0) { + CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", + va, pmap); + atomic_add_long(&pmap_l2_p_failures, 1); + return; + } + + pa += PAGE_SIZE; + for (l3 = firstl3 + 1; l3 < firstl3 + Ln_ENTRIES; l3++) { + if (PTE_TO_PHYS(pmap_load(l3)) != pa) { + CTR2(KTR_PMAP, + "pmap_promote_l2: failure for va %#lx pmap %p", + va, pmap); + atomic_add_long(&pmap_l2_p_failures, 1); + return; + } + if ((pmap_load(l3) & PTE_PROMOTE) != + (pmap_load(firstl3) & PTE_PROMOTE)) { + CTR2(KTR_PMAP, + "pmap_promote_l2: failure for va %#lx pmap %p", + va, pmap); + atomic_add_long(&pmap_l2_p_failures, 1); + return; + } + pa += PAGE_SIZE; + } + + ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); + KASSERT(ml3->pindex == pmap_l2_pindex(va), + ("pmap_promote_l2: page table page's pindex is wrong")); + if (pmap_insert_pt_page(pmap, ml3)) { + CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", + va, pmap); + atomic_add_long(&pmap_l2_p_failures, 1); + return; + } + + if ((pmap_load(firstl3) & PTE_SW_MANAGED) != 0) + pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(pmap_load(firstl3)), + lockp); + + pmap_store(l2, pmap_load(firstl3)); + + atomic_add_long(&pmap_l2_promotions, 1); + CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, + pmap); +} +#endif + /* * Insert the given physical page (p) at * the specified virtual address (v) in the @@ -1980,20 +2610,19 @@ pmap_fault_fixup(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, - u_int flags, int8_t psind __unused) + u_int flags, int8_t psind) { struct rwlock *lock; - pd_entry_t *l1, *l2; + pd_entry_t *l1, *l2, l2e; pt_entry_t new_l3, orig_l3; pt_entry_t *l3; pv_entry_t pv; vm_paddr_t opa, pa, l2_pa, l3_pa; vm_page_t mpte, om, l2_m, l3_m; - boolean_t nosleep; pt_entry_t entry; - pn_t l2_pn; - pn_t l3_pn; - pn_t pn; + pn_t l2_pn, l3_pn, pn; + int rv; + bool nosleep; va = trunc_page(va); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) @@ -2008,7 +2637,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, new_l3 |= PTE_D; if (prot & VM_PROT_WRITE) new_l3 |= PTE_W; - if ((va >> 63) == 0) + if (va < VM_MAX_USER_ADDRESS) new_l3 |= PTE_U; new_l3 |= (pn << PTE_PPN0_S); @@ -2028,13 +2657,29 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); - mpte = NULL; - lock = NULL; + mpte = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); + if (psind == 1) { + /* Assert the required virtual and physical alignment. 
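+ * For example (hypothetical values; cf. the vm_fault_soft_fast()
+ * change in this patch), a caller might reach this path via:
+ *
+ *	pmap_enter(pmap, rounddown2(va, L2_SIZE), m_super, prot,
+ *	    fault_type | PMAP_ENTER_NOSLEEP, 1);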
*/ + KASSERT((va & L2_OFFSET) == 0, + ("pmap_enter: va %#lx unaligned", va)); + KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); + rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); + goto out; + } - if (va < VM_MAXUSER_ADDRESS) { + l2 = pmap_l2(pmap, va); + if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && + ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, + va, &lock))) { + l3 = pmap_l2_to_l3(l2, va); + if (va < VM_MAXUSER_ADDRESS) { + mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); + mpte->wire_count++; + } + } else if (va < VM_MAXUSER_ADDRESS) { nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { @@ -2050,7 +2695,6 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, l3 = pmap_l3(pmap, va); /* TODO: This is not optimal, but should mostly work */ if (l3 == NULL) { - l2 = pmap_l2(pmap, va); if (l2 == NULL) { l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | @@ -2071,9 +2715,6 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, l2 = pmap_l1_to_l2(l1, va); } - KASSERT(l2 != NULL, - ("No l2 table after allocating one")); - l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (l3_m == NULL) @@ -2162,6 +2803,8 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); + KASSERT(pv != NULL, + ("pmap_enter: no PV entry for %#lx", va)); if ((new_l3 & PTE_SW_MANAGED) == 0) free_pv_entry(pmap, pv); if ((om->aflags & PGA_WRITEABLE) != 0 && @@ -2216,10 +2859,163 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, pmap_store(l3, new_l3); } +#if VM_NRESERVLEVEL > 0 + if (mpte != NULL && mpte->wire_count == Ln_ENTRIES && + pmap_ps_enabled(pmap) && + (m->flags & PG_FICTITIOUS) == 0 && + vm_reserv_level_iffullpop(m) == 0) + pmap_promote_l2(pmap, l2, va, &lock); +#endif + + rv = KERN_SUCCESS; +out: if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); + return (rv); +} + +/* + * Tries to create a read- and/or execute-only 2MB page mapping. Returns true + * if successful. Returns false if (1) a page table page cannot be allocated + * without sleeping, (2) a mapping already exists at the specified virtual + * address, or (3) a PV entry cannot be allocated without reclaiming another + * PV entry. + */ +static bool +pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, + struct rwlock **lockp) +{ + pd_entry_t new_l2; + pn_t pn; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; + new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V); + if ((m->oflags & VPO_UNMANAGED) == 0) + new_l2 |= PTE_SW_MANAGED; + if ((prot & VM_PROT_EXECUTE) != 0) + new_l2 |= PTE_X; + if (va < VM_MAXUSER_ADDRESS) + new_l2 |= PTE_U; + return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | + PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == + KERN_SUCCESS); +} + +/* + * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if + * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE + * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and + * a mapping already exists at the specified virtual address. 
Returns + * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table + * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if + * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. + * + * The parameter "m" is only used when creating a managed, writeable mapping. + */ +static int +pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, + vm_page_t m, struct rwlock **lockp) +{ + struct spglist free; + pd_entry_t *l2, *l3, oldl2; + vm_offset_t sva; + vm_page_t l2pg, mt; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? + NULL : lockp)) == NULL) { + CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", + va, pmap); + return (KERN_RESOURCE_SHORTAGE); + } + + l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); + l2 = &l2[pmap_l2_index(va)]; + if ((oldl2 = pmap_load(l2)) != 0) { + KASSERT(l2pg->wire_count > 1, + ("pmap_enter_l2: l2pg's wire count is too low")); + if ((flags & PMAP_ENTER_NOREPLACE) != 0) { + l2pg->wire_count--; + CTR2(KTR_PMAP, + "pmap_enter_l2: failure for va %#lx in pmap %p", + va, pmap); + return (KERN_FAILURE); + } + SLIST_INIT(&free); + if ((oldl2 & PTE_RWX) != 0) + (void)pmap_remove_l2(pmap, l2, va, + pmap_load(pmap_l1(pmap, va)), &free, lockp); + else + for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { + l3 = pmap_l2_to_l3(l2, sva); + if ((pmap_load(l3) & PTE_V) != 0 && + pmap_remove_l3(pmap, l3, sva, oldl2, &free, + lockp) != 0) + break; + } + vm_page_free_pages_toq(&free, true); + if (va >= VM_MAXUSER_ADDRESS) { + mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); + if (pmap_insert_pt_page(pmap, mt)) { + /* + * XXX Currently, this can't happen because + * we do not perform pmap_enter(psind == 1) + * on the kernel pmap. + */ + panic("pmap_enter_l2: trie insert failed"); + } + } else + KASSERT(pmap_load(l2) == 0, + ("pmap_enter_l2: non-zero L2 entry %p", l2)); + } + + if ((new_l2 & PTE_SW_MANAGED) != 0) { + /* + * Abort this mapping if its PV entry could not be created. + */ + if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { + SLIST_INIT(&free); + if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { + /* + * Although "va" is not mapped, paging-structure + * caches could nonetheless have entries that + * refer to the freed page table pages. + * Invalidate those entries. + */ + pmap_invalidate_page(pmap, va); + vm_page_free_pages_toq(&free, true); + } + CTR2(KTR_PMAP, + "pmap_enter_l2: failure for va %#lx in pmap %p", + va, pmap); + return (KERN_RESOURCE_SHORTAGE); + } + if ((new_l2 & PTE_W) != 0) + for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) + vm_page_aflag_set(mt, PGA_WRITEABLE); + } + + /* + * Increment counters. + */ + if ((new_l2 & PTE_SW_WIRED) != 0) + pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; + pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; + + /* + * Map the superpage.
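+ *
+ * A single 64-bit store publishes the entire 2MB mapping, so the
+ * hardware page-table walker never observes a partially formed entry.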
+ */ + pmap_store(l2, new_l2); + + atomic_add_long(&pmap_l2_mappings, 1); + CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", + va, pmap); + return (KERN_SUCCESS); } @@ -2254,7 +3050,13 @@ pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); - mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); + if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && + m->psind == 1 && pmap_ps_enabled(pmap) && + pmap_enter_2mpage(pmap, va, m, prot, &lock)) + m = &m[L2_SIZE / PAGE_SIZE - 1]; + else + mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, + &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) @@ -2365,7 +3167,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); - if (pmap_unwire_l3(pmap, va, mpte, &free)) { + if (pmap_unwire_ptp(pmap, va, mpte, &free)) { pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, false); } @@ -2429,11 +3231,12 @@ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; - pd_entry_t *l1, *l2; - pt_entry_t *l3; - boolean_t pv_lists_locked; + pd_entry_t *l1, *l2, l2e; + pt_entry_t *l3, l3e; + bool pv_lists_locked; - pv_lists_locked = FALSE; + pv_lists_locked = false; +retry: PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1 = pmap_l1(pmap, sva); @@ -2449,25 +3252,46 @@ pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); - if (pmap_load(l2) == 0) + if ((l2e = pmap_load(l2)) == 0) continue; + if ((l2e & PTE_RWX) != 0) { + if (sva + L2_SIZE == va_next && eva >= va_next) { + if ((l2e & PTE_SW_WIRED) == 0) + panic("pmap_unwire: l2 %#jx is missing " + "PTE_SW_WIRED", (uintmax_t)l2e); + pmap_clear_bits(l2, PTE_SW_WIRED); + continue; + } else { + if (!pv_lists_locked) { + pv_lists_locked = true; + if (!rw_try_rlock(&pvh_global_lock)) { + PMAP_UNLOCK(pmap); + rw_rlock(&pvh_global_lock); + /* Repeat sva. */ + goto retry; + } + } + if (!pmap_demote_l2(pmap, l2, sva)) + panic("pmap_unwire: demotion failed"); + } + } if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { - if (pmap_load(l3) == 0) + if ((l3e = pmap_load(l3)) == 0) continue; - if ((pmap_load(l3) & PTE_SW_WIRED) == 0) + if ((l3e & PTE_SW_WIRED) == 0) panic("pmap_unwire: l3 %#jx is missing " - "PTE_SW_WIRED", (uintmax_t)*l3); + "PTE_SW_WIRED", (uintmax_t)l3e); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. 
*/ - atomic_clear_long(l3, PTE_SW_WIRED); + pmap_clear_bits(l3, PTE_SW_WIRED); pmap->pm_stats.wired_count--; } } @@ -2595,6 +3419,7 @@ pmap_quick_remove_page(vm_offset_t addr) boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { + struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; @@ -2615,6 +3440,18 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m) if (loops >= 16) break; } + if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { + if (PV_PMAP(pv) == pmap) { + rv = TRUE; + break; + } + loops++; + if (loops >= 16) + break; + } + } rw_runlock(lock); rw_runlock(&pvh_global_lock); return (rv); @@ -2629,11 +3466,13 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m) int pmap_page_wired_mappings(vm_page_t m) { + struct md_page *pvh; struct rwlock *lock; pmap_t pmap; + pd_entry_t *l2; pt_entry_t *l3; pv_entry_t pv; - int count, md_gen; + int count, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); @@ -2659,11 +3498,72 @@ pmap_page_wired_mappings(vm_page_t m) count++; PMAP_UNLOCK(pmap); } + if ((m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + pvh_gen = pvh->pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen || + pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + l2 = pmap_l2(pmap, pv->pv_va); + if ((pmap_load(l2) & PTE_SW_WIRED) != 0) + count++; + PMAP_UNLOCK(pmap); + } + } rw_runlock(lock); rw_runlock(&pvh_global_lock); return (count); } +static void +pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, + struct spglist *free, bool superpage) +{ + struct md_page *pvh; + vm_page_t mpte, mt; + + if (superpage) { + pmap_resident_count_dec(pmap, Ln_ENTRIES); + pvh = pa_to_pvh(m->phys_addr); + TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); + pvh->pv_gen++; + if (TAILQ_EMPTY(&pvh->pv_list)) { + for (mt = m; mt < &m[Ln_ENTRIES]; mt++) + if (TAILQ_EMPTY(&mt->md.pv_list) && + (mt->aflags & PGA_WRITEABLE) != 0) + vm_page_aflag_clear(mt, PGA_WRITEABLE); + } + mpte = pmap_remove_pt_page(pmap, pv->pv_va); + if (mpte != NULL) { + pmap_resident_count_dec(pmap, 1); + KASSERT(mpte->wire_count == Ln_ENTRIES, + ("pmap_remove_pages: pte page wire count error")); + mpte->wire_count = 0; + pmap_add_delayed_free_list(mpte, free, FALSE); + } + } else { + pmap_resident_count_dec(pmap, 1); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + if (TAILQ_EMPTY(&m->md.pv_list) && + (m->aflags & PGA_WRITEABLE) != 0) { + pvh = pa_to_pvh(m->phys_addr); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } + } +} + /* * Destroy all managed, non-wired mappings in the given user-space * pmap. 
This pmap cannot be active on any processor besides the @@ -2683,17 +3583,17 @@ pmap_page_wired_mappings(vm_page_t m) void pmap_remove_pages(pmap_t pmap) { - pd_entry_t ptepde, *l2; - pt_entry_t *l3, tl3; struct spglist free; - vm_page_t m; + pd_entry_t ptepde; + pt_entry_t *pte, tpte; + vm_page_t m, mt; pv_entry_t pv; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx; - vm_paddr_t pa; + bool superpage; lock = NULL; @@ -2712,53 +3612,57 @@ pmap_remove_pages(pmap_t pmap) pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; - l2 = pmap_l2(pmap, pv->pv_va); - ptepde = pmap_load(l2); - l3 = pmap_l2_to_l3(l2, pv->pv_va); - tl3 = pmap_load(l3); + pte = pmap_l1(pmap, pv->pv_va); + ptepde = pmap_load(pte); + pte = pmap_l1_to_l2(pte, pv->pv_va); + tpte = pmap_load(pte); + if ((tpte & PTE_RWX) != 0) { + superpage = true; + } else { + ptepde = tpte; + pte = pmap_l2_to_l3(pte, pv->pv_va); + tpte = pmap_load(pte); + superpage = false; + } /* * We cannot remove wired pages from a * process' mapping at this time. */ - if (tl3 & PTE_SW_WIRED) { + if (tpte & PTE_SW_WIRED) { allfree = 0; continue; } - pa = PTE_TO_PHYS(tl3); - m = PHYS_TO_VM_PAGE(pa); - KASSERT(m->phys_addr == pa, - ("vm_page_t %p phys_addr mismatch %016jx %016jx", - m, (uintmax_t)m->phys_addr, - (uintmax_t)tl3)); - + m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], - ("pmap_remove_pages: bad l3 %#jx", - (uintmax_t)tl3)); + ("pmap_remove_pages: bad pte %#jx", + (uintmax_t)tpte)); - pmap_clear(l3); + pmap_clear(pte); /* * Update the vm_page_t clean/reference bits. */ - if ((tl3 & PTE_D) != 0) - vm_page_dirty(m); + if ((tpte & (PTE_D | PTE_W)) == + (PTE_D | PTE_W)) { + if (superpage) + for (mt = m; + mt < &m[Ln_ENTRIES]; mt++) + vm_page_dirty(mt); + else + vm_page_dirty(m); + } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; - pmap_resident_count_dec(pmap, 1); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); - m->md.pv_gen++; - if (TAILQ_EMPTY(&m->md.pv_list) && - (m->aflags & PGA_WRITEABLE) != 0) - vm_page_aflag_clear(m, PGA_WRITEABLE); - - pmap_unuse_l3(pmap, pv->pv_va, ptepde, &free); + pmap_remove_pages_pv(pmap, m, pv, &free, + superpage); + pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); freed++; } } @@ -2778,20 +3682,23 @@ pmap_remove_pages(pmap_t pmap) vm_page_free_pages_toq(&free, false); } -/* - * This is used to check if a page has been accessed or modified. As we - * don't have a bit to see if it has been modified we have to assume it - * has been if the page is read/write. 
- */ -static boolean_t +static bool pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { + struct md_page *pvh; struct rwlock *lock; + pd_entry_t *l2; + pt_entry_t *l3, mask; pv_entry_t pv; - pt_entry_t *l3, mask, value; pmap_t pmap; - int md_gen; - boolean_t rv; + int md_gen, pvh_gen; + bool rv; + + mask = 0; + if (modified) + mask |= PTE_D; + if (accessed) + mask |= PTE_A; rv = FALSE; rw_rlock(&pvh_global_lock); @@ -2811,33 +3718,34 @@ pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) } } l3 = pmap_l3(pmap, pv->pv_va); - mask = 0; - value = 0; - if (modified) { - mask |= PTE_D; - value |= PTE_D; - } - if (accessed) { - mask |= PTE_A; - value |= PTE_A; - } - -#if 0 - if (modified) { - mask |= ATTR_AP_RW_BIT; - value |= ATTR_AP(ATTR_AP_RW); - } - if (accessed) { - mask |= ATTR_AF | ATTR_DESCR_MASK; - value |= ATTR_AF | L3_PAGE; - } -#endif - - rv = (pmap_load(l3) & mask) == value; + rv = (pmap_load(l3) & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } + if ((m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + pvh_gen = pvh->pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen || + pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + l2 = pmap_l2(pmap, pv->pv_va); + rv = (pmap_load(l2) & mask) == mask; + PMAP_UNLOCK(pmap); + if (rv) + goto out; + } + } out: rw_runlock(lock); rw_runlock(&pvh_global_lock); @@ -2911,12 +3819,14 @@ pmap_is_referenced(vm_page_t m) void pmap_remove_write(vm_page_t m) { - pmap_t pmap; + struct md_page *pvh; struct rwlock *lock; - pv_entry_t pv; - pt_entry_t *l3, oldl3; - pt_entry_t newl3; - int md_gen; + pmap_t pmap; + pd_entry_t *l2; + pt_entry_t *l3, oldl3, newl3; + pv_entry_t next_pv, pv; + vm_offset_t va; + int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); @@ -2929,18 +3839,43 @@ pmap_remove_write(vm_page_t m) VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; - rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); + pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : + pa_to_pvh(VM_PAGE_TO_PHYS(m)); + rw_rlock(&pvh_global_lock); retry_pv_loop: rw_wlock(lock); + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + rw_wunlock(lock); + goto retry_pv_loop; + } + } + va = pv->pv_va; + l2 = pmap_l2(pmap, va); + if ((pmap_load(l2) & PTE_W) != 0) + (void)pmap_demote_l2_locked(pmap, l2, va, &lock); + KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), + ("inconsistent pv lock %p %p for page %p", + lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); + PMAP_UNLOCK(pmap); + } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); - if (md_gen != m->md.pv_gen) { + if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; @@ -2964,13 +3899,6 @@ pmap_remove_write(vm_page_t m) rw_runlock(&pvh_global_lock); } -static __inline boolean_t -safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) -{ - - return (FALSE); -} - /* * pmap_ts_referenced: * @@ -2990,38 +3918,104 @@ safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) int pmap_ts_referenced(vm_page_t m) { + struct spglist free; + struct md_page *pvh; + struct rwlock *lock; pv_entry_t pv, pvf; pmap_t pmap; - struct rwlock *lock; - pd_entry_t *l2; - pt_entry_t *l3, old_l3; + pd_entry_t *l2, l2e; + pt_entry_t *l3, l3e; vm_paddr_t pa; - int cleared, md_gen, not_cleared; - struct spglist free; + vm_offset_t va; + int md_gen, pvh_gen, ret; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); - cleared = 0; + ret = 0; pa = VM_PAGE_TO_PHYS(m); + pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); + lock = PHYS_TO_PV_LIST_LOCK(pa); rw_rlock(&pvh_global_lock); rw_wlock(lock); retry: - not_cleared = 0; + if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) + goto small_mappings; + pv = pvf; + do { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto retry; + } + } + va = pv->pv_va; + l2 = pmap_l2(pmap, va); + l2e = pmap_load(l2); + if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) { + /* + * Although l2e is mapping a 2MB page, because + * this function is called at a 4KB page granularity, + * we only update the 4KB page under test. + */ + vm_page_dirty(m); + } + if ((l2e & PTE_A) != 0) { + /* + * Since this reference bit is shared by 512 4KB + * pages, it should not be cleared every time it is + * tested. Apply a simple "hash" function on the + * physical page number, the virtual superpage number, + * and the pmap address to select one 4KB page out of + * the 512 on which testing the reference bit will + * result in clearing that reference bit. This + * function is designed to avoid the selection of the + * same 4KB page for every 2MB page mapping. + * + * On demotion, a mapping that hasn't been referenced + * is simply destroyed. To avoid the possibility of a + * subsequent page fault on a demoted wired mapping, + * always leave its reference bit set. Moreover, + * since the superpage is wired, the current state of + * its reference bit won't affect page replacement. 
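+ *
+ * Worked example with hypothetical values, ignoring the pmap term:
+ * pa = 0x40200000 and pv_va = 0x200000 give (pa >> PAGE_SHIFT) =
+ * 0x40200 and (pv_va >> L2_SHIFT) = 0x1; the XOR's low nine bits
+ * are 0x1, not 0, so this particular 4KB page would keep its
+ * reference bit set.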
+ */ + if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ + (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && + (l2e & PTE_SW_WIRED) == 0) { + pmap_clear_bits(l2, PTE_A); + pmap_invalidate_page(pmap, va); + } + ret++; + } + PMAP_UNLOCK(pmap); + /* Rotate the PV list if it has more than one entry. */ + if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { + TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); + pvh->pv_gen++; + } + if (ret >= PMAP_TS_REFERENCED_MAX) + goto out; + } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); +small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { - if (pvf == NULL) - pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); - if (md_gen != m->md.pv_gen) { + if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } @@ -3032,36 +4026,21 @@ pmap_ts_referenced(vm_page_t m) ("pmap_ts_referenced: found an invalid l2 table")); l3 = pmap_l2_to_l3(l2, pv->pv_va); - old_l3 = pmap_load(l3); - if ((old_l3 & PTE_D) != 0) + l3e = pmap_load(l3); + if ((l3e & PTE_D) != 0) vm_page_dirty(m); - if ((old_l3 & PTE_A) != 0) { - if (safe_to_clear_referenced(pmap, old_l3)) { - /* - * TODO: We don't handle the access flag - * at all. We need to be able to set it in - * the exception handler. - */ - panic("RISCVTODO: safe_to_clear_referenced\n"); - } else if ((old_l3 & PTE_SW_WIRED) == 0) { + if ((l3e & PTE_A) != 0) { + if ((l3e & PTE_SW_WIRED) == 0) { /* * Wired pages cannot be paged out so * doing accessed bit emulation for * them is wasted effort. We do the * hard work for unwired pages only. */ - pmap_remove_l3(pmap, l3, pv->pv_va, - pmap_load(l2), &free, &lock); + pmap_clear_bits(l3, PTE_A); pmap_invalidate_page(pmap, pv->pv_va); - cleared++; - if (pvf == pv) - pvf = NULL; - pv = NULL; - KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), - ("inconsistent pv lock %p %p for page %p", - lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); - } else - not_cleared++; + } + ret++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. 
*/ @@ -3070,13 +4049,13 @@ pmap_ts_referenced(vm_page_t m) TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } - } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + - not_cleared < PMAP_TS_REFERENCED_MAX); + } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && ret < + PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); rw_runlock(&pvh_global_lock); vm_page_free_pages_toq(&free, false); - return (cleared + not_cleared); + return (ret); } /* @@ -3163,7 +4142,7 @@ pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) l2 = pmap_l2(pmap, addr); if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) { - if ((tpte & (PTE_R | PTE_W | PTE_X)) != 0) { + if ((tpte & PTE_RWX) != 0) { pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET); val = MINCORE_INCORE | MINCORE_SUPER; } else { @@ -3241,6 +4220,20 @@ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { + vm_offset_t superpage_offset; + + if (size < L2_SIZE) + return; + if (object != NULL && (object->flags & OBJ_COLORED) != 0) + offset += ptoa(object->pg_color); + superpage_offset = offset & L2_OFFSET; + if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || + (*addr & L2_OFFSET) == superpage_offset) + return; + if ((*addr & L2_OFFSET) < superpage_offset) + *addr = (*addr & ~L2_OFFSET) + superpage_offset; + else + *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; } /** diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index e095ccc69edf..4bf8869b997f 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -271,7 +271,8 @@ vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot, { vm_page_t m, m_map; #if (defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ - __ARM_ARCH >= 6) || defined(__i386__)) && VM_NRESERVLEVEL > 0 + __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv)) && \ + VM_NRESERVLEVEL > 0 vm_page_t m_super; int flags; #endif @@ -286,7 +287,8 @@ vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot, m_map = m; psind = 0; #if (defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ - __ARM_ARCH >= 6) || defined(__i386__)) && VM_NRESERVLEVEL > 0 + __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv)) && \ + VM_NRESERVLEVEL > 0 if ((m->flags & PG_FICTITIOUS) == 0 && (m_super = vm_reserv_to_superpage(m)) != NULL && rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start && @@ -463,7 +465,7 @@ vm_fault_populate(struct faultstate *fs, vm_prot_t prot, int fault_type, pidx += npages, m = vm_page_next(&m[npages - 1])) { vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset; #if defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ - __ARM_ARCH >= 6) || defined(__i386__) + __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv) psind = m->psind; if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 || pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last ||