From e7a9df16e6541a68ab21e51a5d0badb5feef1894 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Wed, 20 Feb 2019 09:51:13 +0000 Subject: [PATCH] Add kernel support for Intel userspace protection keys feature on Skylake Xeons. See SDM rev. 68 Vol 3 4.6.2 Protection Keys and the description of the RDPKRU and WRPKRU instructions. Reviewed by: markj Tested by: pho Sponsored by: The FreeBSD Foundation MFC after: 2 weeks Differential revision: https://reviews.freebsd.org/D18893 --- sys/amd64/amd64/initcpu.c | 3 + sys/amd64/amd64/pmap.c | 380 +++++++++++++++++++++++++++++++++- sys/amd64/amd64/sys_machdep.c | 61 +++++- sys/amd64/amd64/trap.c | 15 ++ sys/amd64/include/pmap.h | 14 +- sys/arm/include/pmap.h | 7 + sys/arm64/include/pmap.h | 7 + sys/i386/include/pmap.h | 7 + sys/mips/include/pmap.h | 7 + sys/powerpc/include/pmap.h | 7 + sys/riscv/include/pmap.h | 7 + sys/sparc64/include/pmap.h | 7 + sys/vm/vm_fault.c | 16 +- sys/vm/vm_map.c | 13 +- sys/x86/include/sysarch.h | 22 ++ 15 files changed, 563 insertions(+), 10 deletions(-) diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c index 43e33e831f2d..7502849af423 100644 --- a/sys/amd64/amd64/initcpu.c +++ b/sys/amd64/amd64/initcpu.c @@ -233,6 +233,9 @@ initializecpu(void) if (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) cr4 |= CR4_FSGSBASE; + if (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) + cr4 |= CR4_PKE; + /* * Postpone enabling the SMEP on the boot CPU until the page * tables are switched from the boot loader identity mapping diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index c45b4316622b..9491bb4203f4 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -48,7 +48,7 @@ */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. - * Copyright (c) 2014-2018 The FreeBSD Foundation + * Copyright (c) 2014-2019 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, @@ -121,6 +121,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -155,6 +156,7 @@ __FBSDID("$FreeBSD$"); #ifdef SMP #include #endif +#include #include static __inline boolean_t @@ -285,6 +287,13 @@ pmap_modified_bit(pmap_t pmap) return (mask); } +static __inline pt_entry_t +pmap_pku_mask_bit(pmap_t pmap) +{ + + return (pmap->pm_type == PT_X86 ? 
X86_PG_PKU_MASK : 0); +} + #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline @@ -424,6 +433,22 @@ static pml4_entry_t *pti_pml4; static vm_pindex_t pti_pg_idx; static bool pti_finalized; +struct pmap_pkru_range { + struct rs_el pkru_rs_el; + u_int pkru_keyidx; + int pkru_flags; +}; + +static uma_zone_t pmap_pkru_ranges_zone; +static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); +static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va); +static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); +static void *pkru_dup_range(void *ctx, void *data); +static void pkru_free_range(void *ctx, void *node); +static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap); +static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); +static void pmap_pkru_deassign_all(pmap_t pmap); + static int pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS) { @@ -2846,6 +2871,12 @@ pmap_pinit0(pmap_t pmap) pmap->pm_pcids[i].pm_gen = 1; } pmap_activate_boot(pmap); + + if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { + pmap_pkru_ranges_zone = uma_zcreate("pkru ranges", + sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + } } void @@ -2934,6 +2965,10 @@ pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) pmap_pinit_pml4_pti(pml4pgu); pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu); } + if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { + rangeset_init(&pmap->pm_pkru, pkru_dup_range, + pkru_free_range, pmap, M_NOWAIT); + } } pmap->pm_root.rt_root = 0; @@ -3230,6 +3265,9 @@ pmap_release(pmap_t pmap) vm_page_unwire_noq(m); vm_page_free(m); } + if (pmap->pm_type == PT_X86 && + (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) + rangeset_fini(&pmap->pm_pkru); } static int @@ -4060,7 +4098,7 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; - pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V; + pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; @@ -4073,6 +4111,7 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); + PG_PKU_MASK = pmap_pku_mask_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = *pde; @@ -4505,6 +4544,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) out: if (anyvalid) pmap_invalidate_all(pmap); + pmap_pkru_on_remove(pmap, sva, eva); PMAP_UNLOCK(pmap); pmap_delayed_invl_finished(); vm_page_free_pages_toq(&free, true); @@ -4816,7 +4856,7 @@ pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; - pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V; + pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK; vm_page_t mpte; int PG_PTE_CACHE; @@ -4825,6 +4865,7 @@ pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); + PG_PKU_MASK = pmap_pku_mask_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PMAP_LOCK_ASSERT(pmap, MA_OWNED); @@ -5052,6 +5093,8 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, origpte = *pte; pv = NULL; + if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) + newpte |= pmap_pkru_get(pmap, va); /* * Is the specified virtual address already mapped? 
@@ -5271,6 +5314,25 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } + + /* + * If pkru is not same for the whole pde range, return failure + * and let vm_fault() cope. Check after pde allocation, since + * it could sleep. + */ + if (!pmap_pkru_same(pmap, va, va + NBPDR)) { + SLIST_INIT(&free); + if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { + pmap_invalidate_page(pmap, va); + vm_page_free_pages_toq(&free, true); + } + return (KERN_FAILURE); + } + if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) { + newpde &= ~X86_PG_PKU_MASK; + newpde |= pmap_pkru_get(pmap, va); + } + pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(va)]; oldpde = *pde; @@ -5530,7 +5592,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if (va < VM_MAXUSER_ADDRESS) - newpte |= PG_U; + newpte |= PG_U | pmap_pkru_get(pmap, va); pte_store(pte, newpte); return (mpte); } @@ -5906,6 +5968,36 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, PMAP_UNLOCK(dst_pmap); } +int +pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) +{ + int error; + + if (dst_pmap->pm_type != src_pmap->pm_type || + dst_pmap->pm_type != PT_X86 || + (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) + return (0); + for (;;) { + if (dst_pmap < src_pmap) { + PMAP_LOCK(dst_pmap); + PMAP_LOCK(src_pmap); + } else { + PMAP_LOCK(src_pmap); + PMAP_LOCK(dst_pmap); + } + error = pmap_pkru_copy(dst_pmap, src_pmap); + /* Clean up partial copy on failure due to no memory. */ + if (error == ENOMEM) + pmap_pkru_deassign_all(dst_pmap); + PMAP_UNLOCK(src_pmap); + PMAP_UNLOCK(dst_pmap); + if (error != ENOMEM) + break; + vm_wait(NULL); + } + return (error); +} + /* * Zero the specified hardware page. 
*/ @@ -6305,6 +6397,7 @@ pmap_remove_pages(pmap_t pmap) if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); + pmap_pkru_deassign_all(pmap); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } @@ -8941,6 +9034,285 @@ pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) VM_OBJECT_WUNLOCK(pti_obj); } +static void * +pkru_dup_range(void *ctx __unused, void *data) +{ + struct pmap_pkru_range *node, *new_node; + + new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); + if (new_node == NULL) + return (NULL); + node = data; + memcpy(new_node, node, sizeof(*node)); + return (new_node); +} + +static void +pkru_free_range(void *ctx __unused, void *node) +{ + + uma_zfree(pmap_pkru_ranges_zone, node); +} + +static int +pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, + int flags) +{ + struct pmap_pkru_range *ppr; + int error; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + MPASS(pmap->pm_type == PT_X86); + MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); + if ((flags & AMD64_PKRU_EXCL) != 0 && + !rangeset_check_empty(&pmap->pm_pkru, sva, eva)) + return (EBUSY); + ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); + if (ppr == NULL) + return (ENOMEM); + ppr->pkru_keyidx = keyidx; + ppr->pkru_flags = flags & AMD64_PKRU_PERSIST; + error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr); + if (error != 0) + uma_zfree(pmap_pkru_ranges_zone, ppr); + return (error); +} + +static int +pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + MPASS(pmap->pm_type == PT_X86); + MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); + return (rangeset_remove(&pmap->pm_pkru, sva, eva)); +} + +static void +pmap_pkru_deassign_all(pmap_t pmap) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if (pmap->pm_type == PT_X86 && + (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) + rangeset_remove_all(&pmap->pm_pkru); +} + +static bool +pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + struct pmap_pkru_range *ppr, *prev_ppr; + vm_offset_t va; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if (pmap->pm_type != PT_X86 || + (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || + sva >= VM_MAXUSER_ADDRESS) + return (true); + MPASS(eva <= VM_MAXUSER_ADDRESS); + for (va = sva, prev_ppr = NULL; va < eva;) { + ppr = rangeset_lookup(&pmap->pm_pkru, va); + if ((ppr == NULL) ^ (prev_ppr == NULL)) + return (false); + if (ppr == NULL) { + va += PAGE_SIZE; + continue; + } + if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx) + return (false); + va = ppr->pkru_rs_el.re_end; + } + return (true); +} + +static pt_entry_t +pmap_pkru_get(pmap_t pmap, vm_offset_t va) +{ + struct pmap_pkru_range *ppr; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if (pmap->pm_type != PT_X86 || + (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || + va >= VM_MAXUSER_ADDRESS) + return (0); + ppr = rangeset_lookup(&pmap->pm_pkru, va); + if (ppr != NULL) + return (X86_PG_PKU(ppr->pkru_keyidx)); + return (0); +} + +static bool +pred_pkru_on_remove(void *ctx __unused, void *r) +{ + struct pmap_pkru_range *ppr; + + ppr = r; + return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0); +} + +static void +pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if (pmap->pm_type == PT_X86 && + (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { + rangeset_remove_pred(&pmap->pm_pkru, sva, eva, + pred_pkru_on_remove); + } +} + +static int +pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap) +{ + + PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); + 
PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); + MPASS(dst_pmap->pm_type == PT_X86); + MPASS(src_pmap->pm_type == PT_X86); + MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); + if (src_pmap->pm_pkru.rs_data_ctx == NULL) + return (0); + return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru)); +} + +static void +pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + u_int keyidx) +{ + pml4_entry_t *pml4e; + pdp_entry_t *pdpe; + pd_entry_t newpde, ptpaddr, *pde; + pt_entry_t newpte, *ptep, pte; + vm_offset_t va, va_next; + bool changed; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + MPASS(pmap->pm_type == PT_X86); + MPASS(keyidx <= PMAP_MAX_PKRU_IDX); + + for (changed = false, va = sva; va < eva; va = va_next) { + pml4e = pmap_pml4e(pmap, va); + if ((*pml4e & X86_PG_V) == 0) { + va_next = (va + NBPML4) & ~PML4MASK; + if (va_next < va) + va_next = eva; + continue; + } + + pdpe = pmap_pml4e_to_pdpe(pml4e, va); + if ((*pdpe & X86_PG_V) == 0) { + va_next = (va + NBPDP) & ~PDPMASK; + if (va_next < va) + va_next = eva; + continue; + } + + va_next = (va + NBPDR) & ~PDRMASK; + if (va_next < va) + va_next = eva; + + pde = pmap_pdpe_to_pde(pdpe, va); + ptpaddr = *pde; + if (ptpaddr == 0) + continue; + + MPASS((ptpaddr & X86_PG_V) != 0); + if ((ptpaddr & PG_PS) != 0) { + if (va + NBPDR == va_next && eva >= va_next) { + newpde = (ptpaddr & ~X86_PG_PKU_MASK) | + X86_PG_PKU(keyidx); + if (newpde != ptpaddr) { + *pde = newpde; + changed = true; + } + continue; + } else if (!pmap_demote_pde(pmap, pde, va)) { + continue; + } + } + + if (va_next > eva) + va_next = eva; + + for (ptep = pmap_pde_to_pte(pde, va); va != va_next; + ptep++, va += PAGE_SIZE) { + pte = *ptep; + if ((pte & X86_PG_V) == 0) + continue; + newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); + if (newpte != pte) { + *ptep = newpte; + changed = true; + } + } + } + if (changed) + pmap_invalidate_range(pmap, sva, eva); +} + +static int +pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + u_int keyidx, int flags) +{ + + if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX || + (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0) + return (EINVAL); + if (eva <= sva || eva > VM_MAXUSER_ADDRESS) + return (EFAULT); + if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) + return (ENOTSUP); + return (0); +} + +int +pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, + int flags) +{ + int error; + + sva = trunc_page(sva); + eva = round_page(eva); + error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags); + if (error != 0) + return (error); + for (;;) { + PMAP_LOCK(pmap); + error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags); + if (error == 0) + pmap_pkru_update_range(pmap, sva, eva, keyidx); + PMAP_UNLOCK(pmap); + if (error != ENOMEM) + break; + vm_wait(NULL); + } + return (error); +} + +int +pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + int error; + + sva = trunc_page(sva); + eva = round_page(eva); + error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0); + if (error != 0) + return (error); + for (;;) { + PMAP_LOCK(pmap); + error = pmap_pkru_deassign(pmap, sva, eva); + if (error == 0) + pmap_pkru_update_range(pmap, sva, eva, 0); + PMAP_UNLOCK(pmap); + if (error != ENOMEM) + break; + vm_wait(NULL); + } + return (error); +} + #include "opt_ddb.h" #ifdef DDB #include diff --git a/sys/amd64/amd64/sys_machdep.c b/sys/amd64/amd64/sys_machdep.c index b1dff88a584e..2ba646638ded 100644 --- a/sys/amd64/amd64/sys_machdep.c +++ b/sys/amd64/amd64/sys_machdep.c @@ -44,6 
+44,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -53,6 +54,7 @@ __FBSDID("$FreeBSD$"); #include #include #include /* for kernel_map */ +#include #include #include @@ -170,13 +172,16 @@ update_gdt_fsbase(struct thread *td, uint32_t base) int sysarch(struct thread *td, struct sysarch_args *uap) { - int error = 0; - struct pcb *pcb = curthread->td_pcb; + struct pcb *pcb; + struct vm_map *map; uint32_t i386base; uint64_t a64base; struct i386_ioperm_args iargs; struct i386_get_xfpustate i386xfpu; + struct i386_set_pkru i386pkru; struct amd64_get_xfpustate a64xfpu; + struct amd64_set_pkru a64pkru; + int error; #ifdef CAPABILITY_MODE /* @@ -194,11 +199,15 @@ sysarch(struct thread *td, struct sysarch_args *uap) case I386_GET_GSBASE: case I386_SET_GSBASE: case I386_GET_XFPUSTATE: + case I386_SET_PKRU: + case I386_CLEAR_PKRU: case AMD64_GET_FSBASE: case AMD64_SET_FSBASE: case AMD64_GET_GSBASE: case AMD64_SET_GSBASE: case AMD64_GET_XFPUSTATE: + case AMD64_SET_PKRU: + case AMD64_CLEAR_PKRU: break; case I386_SET_IOPERM: @@ -214,6 +223,10 @@ sysarch(struct thread *td, struct sysarch_args *uap) if (uap->op == I386_GET_LDT || uap->op == I386_SET_LDT) return (sysarch_ldt(td, uap, UIO_USERSPACE)); + + error = 0; + pcb = td->td_pcb; + /* * XXXKIB check that the BSM generation code knows to encode * the op argument. @@ -233,11 +246,27 @@ sysarch(struct thread *td, struct sysarch_args *uap) a64xfpu.addr = (void *)(uintptr_t)i386xfpu.addr; a64xfpu.len = i386xfpu.len; break; + case I386_SET_PKRU: + case I386_CLEAR_PKRU: + if ((error = copyin(uap->parms, &i386pkru, + sizeof(struct i386_set_pkru))) != 0) + return (error); + a64pkru.addr = (void *)(uintptr_t)i386pkru.addr; + a64pkru.len = i386pkru.len; + a64pkru.keyidx = i386pkru.keyidx; + a64pkru.flags = i386pkru.flags; + break; case AMD64_GET_XFPUSTATE: if ((error = copyin(uap->parms, &a64xfpu, sizeof(struct amd64_get_xfpustate))) != 0) return (error); break; + case AMD64_SET_PKRU: + case AMD64_CLEAR_PKRU: + if ((error = copyin(uap->parms, &a64pkru, + sizeof(struct amd64_set_pkru))) != 0) + return (error); + break; default: break; } @@ -326,6 +355,34 @@ sysarch(struct thread *td, struct sysarch_args *uap) a64xfpu.addr, a64xfpu.len); break; + case I386_SET_PKRU: + case AMD64_SET_PKRU: + /* + * Read-lock the map to synchronize with parallel + * pmap_vmspace_copy() on fork. + */ + map = &td->td_proc->p_vmspace->vm_map; + vm_map_lock_read(map); + error = pmap_pkru_set(PCPU_GET(curpmap), + (vm_offset_t)a64pkru.addr, (vm_offset_t)a64pkru.addr + + a64pkru.len, a64pkru.keyidx, a64pkru.flags); + vm_map_unlock_read(map); + break; + + case I386_CLEAR_PKRU: + case AMD64_CLEAR_PKRU: + if (a64pkru.flags != 0 || a64pkru.keyidx != 0) { + error = EINVAL; + break; + } + map = &td->td_proc->p_vmspace->vm_map; + vm_map_lock_read(map); + error = pmap_pkru_clear(PCPU_GET(curpmap), + (vm_offset_t)a64pkru.addr, + (vm_offset_t)a64pkru.addr + a64pkru.len); + vm_map_unlock_read(map); + break; + default: error = EINVAL; break; diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 5574fef03347..1aea232181c0 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -807,6 +807,20 @@ trap_pfault(struct trapframe *frame, int usermode) return (-1); } + /* + * User-mode protection key violation (PKU). May happen + * either from usermode or from kernel if copyin accessed + * key-protected mapping. 
+ */ + if ((frame->tf_err & PGEX_PK) != 0) { + if (eva > VM_MAXUSER_ADDRESS) { + trap_fatal(frame, eva); + return (-1); + } + rv = KERN_PROTECTION_FAILURE; + goto after_vmfault; + } + /* * If nx protection of the usermode portion of kernel page * tables caused trap, panic. @@ -842,6 +856,7 @@ trap_pfault(struct trapframe *frame, int usermode) #endif return (0); } +after_vmfault: if (!usermode) { if (td->td_intr_nesting_level == 0 && curpcb->pcb_onfault != NULL) { diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index e20a89e3a93f..91d6fb2f934b 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -66,6 +66,7 @@ #define X86_PG_AVAIL2 0x400 /* < programmers use */ #define X86_PG_AVAIL3 0x800 /* \ */ #define X86_PG_PDE_PAT 0x1000 /* PAT PAT index */ +#define X86_PG_PKU(idx) ((pt_entry_t)idx << 59) #define X86_PG_NX (1ul<<63) /* No-execute */ #define X86_PG_AVAIL(x) (1ul << (x)) @@ -73,6 +74,10 @@ #define X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) #define X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) +/* Protection keys indexes */ +#define PMAP_MAX_PKRU_IDX 0xf +#define X86_PG_PKU_MASK X86_PG_PKU(PMAP_MAX_PKRU_IDX) + /* * Intel extended page table (EPT) bit definitions. */ @@ -120,7 +125,7 @@ * (PTE) page mappings have identical settings for the following fields: */ #define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \ - PG_M | PG_A | PG_U | PG_RW | PG_V) + PG_M | PG_A | PG_U | PG_RW | PG_V | PG_PKU_MASK) /* * Page Protection Exception bits @@ -242,6 +247,8 @@ #include #include #include +#include +#include #include @@ -336,6 +343,7 @@ struct pmap { long pm_eptgen; /* EPT pmap generation id */ int pm_flags; struct pmap_pcids pm_pcids[MAXCPU]; + struct rangeset pm_pkru; }; /* flags */ @@ -454,6 +462,10 @@ void pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t kcr3); void pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va); void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva, vm_offset_t eva); +int pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); +int pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + u_int keyidx, int flags); +int pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap); #endif /* _KERNEL */ /* Return various clipped indexes for a given VA */ diff --git a/sys/arm/include/pmap.h b/sys/arm/include/pmap.h index a5ac6bf0f2d3..50e7a58b5db2 100644 --- a/sys/arm/include/pmap.h +++ b/sys/arm/include/pmap.h @@ -71,5 +71,12 @@ void pmap_kremove_device(vm_offset_t, vm_size_t); vm_paddr_t pmap_kextract(vm_offset_t); #define vtophys(va) pmap_kextract((vm_offset_t)(va)) +static inline int +pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused) +{ + + return (0); +} + #endif /* _KERNEL */ #endif /* !_MACHINE_PMAP_H_ */ diff --git a/sys/arm64/include/pmap.h b/sys/arm64/include/pmap.h index f1b86b671331..736ce0f3333b 100644 --- a/sys/arm64/include/pmap.h +++ b/sys/arm64/include/pmap.h @@ -171,6 +171,13 @@ struct pcb *pmap_switch(struct thread *, struct thread *); #define pmap_page_is_mapped(m) (!TAILQ_EMPTY(&(m)->md.pv_list)) +static inline int +pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused) +{ + + return (0); +} + #endif /* _KERNEL */ #endif /* !LOCORE */ diff --git a/sys/i386/include/pmap.h b/sys/i386/include/pmap.h index 3a136b2837cf..eed5d868e2cd 100644 --- a/sys/i386/include/pmap.h +++ b/sys/i386/include/pmap.h @@ -244,6 +244,13 @@ extern vm_offset_t virtual_end; #define pmap_page_is_write_mapped(m) 
(((m)->aflags & PGA_WRITEABLE) != 0) #define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) +static inline int +pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused) +{ + + return (0); +} + struct sf_buf; /* diff --git a/sys/mips/include/pmap.h b/sys/mips/include/pmap.h index 261f37e5f107..ba31de835d23 100644 --- a/sys/mips/include/pmap.h +++ b/sys/mips/include/pmap.h @@ -185,6 +185,13 @@ int pmap_emulate_modified(pmap_t pmap, vm_offset_t va); void pmap_page_set_memattr(vm_page_t, vm_memattr_t); int pmap_change_attr(vm_offset_t, vm_size_t, vm_memattr_t); +static inline int +pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused) +{ + + return (0); +} + #endif /* _KERNEL */ #endif /* !LOCORE */ diff --git a/sys/powerpc/include/pmap.h b/sys/powerpc/include/pmap.h index d96de742e500..ae3971699308 100644 --- a/sys/powerpc/include/pmap.h +++ b/sys/powerpc/include/pmap.h @@ -288,6 +288,13 @@ vm_offset_t pmap_early_io_map(vm_paddr_t pa, vm_size_t size); void pmap_early_io_unmap(vm_offset_t va, vm_size_t size); void pmap_track_page(pmap_t pmap, vm_offset_t va); +static inline int +pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused) +{ + + return (0); +} + #endif #endif /* !_MACHINE_PMAP_H_ */ diff --git a/sys/riscv/include/pmap.h b/sys/riscv/include/pmap.h index 2d55b764e9e8..fe8ddbbf7a4b 100644 --- a/sys/riscv/include/pmap.h +++ b/sys/riscv/include/pmap.h @@ -166,6 +166,13 @@ bool pmap_get_tables(pmap_t, vm_offset_t, pd_entry_t **, pd_entry_t **, int pmap_fault_fixup(pmap_t, vm_offset_t, vm_prot_t); +static inline int +pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused) +{ + + return (0); +} + #endif /* _KERNEL */ #endif /* !LOCORE */ diff --git a/sys/sparc64/include/pmap.h b/sys/sparc64/include/pmap.h index e9b9a2942e6b..6a24bdb73f11 100644 --- a/sys/sparc64/include/pmap.h +++ b/sys/sparc64/include/pmap.h @@ -128,4 +128,11 @@ SYSCTL_DECL(_debug_pmap_stats); #endif +static inline int +pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused) +{ + + return (0); +} + #endif /* !_MACHINE_PMAP_H_ */ diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 4bf8869b997f..50e931596404 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -481,8 +481,20 @@ vm_fault_populate(struct faultstate *fs, vm_prot_t prot, int fault_type, fault_flags, true); } VM_OBJECT_WUNLOCK(fs->first_object); - pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type | (wired ? - PMAP_ENTER_WIRED : 0), psind); + rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type | + (wired ? PMAP_ENTER_WIRED : 0), psind); +#if defined(__amd64__) + if (psind > 0 && rv == KERN_FAILURE) { + for (i = 0; i < npages; i++) { + rv = pmap_enter(fs->map->pmap, vaddr + ptoa(i), + &m[i], prot, fault_type | + (wired ? 
PMAP_ENTER_WIRED : 0), 0); + MPASS(rv == KERN_SUCCESS); + } + } +#else + MPASS(rv == KERN_SUCCESS); +#endif VM_OBJECT_WLOCK(fs->first_object); m_mtx = NULL; for (i = 0; i < npages; i++) { diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 7fa738f40156..ea5582d04a59 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -3544,7 +3544,7 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) vm_map_t new_map, old_map; vm_map_entry_t new_entry, old_entry; vm_object_t object; - int locked; + int error, locked; vm_inherit_t inh; old_map = &vm1->vm_map; @@ -3553,6 +3553,7 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) pmap_pinit); if (vm2 == NULL) return (NULL); + vm2->vm_taddr = vm1->vm_taddr; vm2->vm_daddr = vm1->vm_daddr; vm2->vm_maxsaddr = vm1->vm_maxsaddr; @@ -3563,7 +3564,17 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */ KASSERT(locked, ("vmspace_fork: lock failed")); + error = pmap_vmspace_copy(new_map->pmap, old_map->pmap); + if (error != 0) { + sx_xunlock(&old_map->lock); + sx_xunlock(&new_map->lock); + vm_map_process_deferred(); + vmspace_free(vm2); + return (NULL); + } + new_map->anon_loc = old_map->anon_loc; + old_entry = old_map->header.next; while (old_entry != &old_map->header) { diff --git a/sys/x86/include/sysarch.h b/sys/x86/include/sysarch.h index efc278916fad..3ce7794ed1a3 100644 --- a/sys/x86/include/sysarch.h +++ b/sys/x86/include/sysarch.h @@ -52,6 +52,8 @@ #define I386_GET_GSBASE 9 #define I386_SET_GSBASE 10 #define I386_GET_XFPUSTATE 11 +#define I386_SET_PKRU 12 +#define I386_CLEAR_PKRU 13 /* Leave space for 0-127 for to avoid translating syscalls */ #define AMD64_GET_FSBASE 128 @@ -59,6 +61,12 @@ #define AMD64_GET_GSBASE 130 #define AMD64_SET_GSBASE 131 #define AMD64_GET_XFPUSTATE 132 +#define AMD64_SET_PKRU 133 +#define AMD64_CLEAR_PKRU 134 + +/* Flags for AMD64_SET_PKRU */ +#define AMD64_PKRU_EXCL 0x0001 +#define AMD64_PKRU_PERSIST 0x0002 struct i386_ioperm_args { unsigned int start; @@ -94,12 +102,26 @@ struct i386_get_xfpustate { int len; }; +struct i386_set_pkru { + unsigned int addr; + unsigned int len; + unsigned int keyidx; + int flags; +}; + struct amd64_get_xfpustate { void *addr; int len; }; #endif +struct amd64_set_pkru { + void *addr; + unsigned long len; + unsigned int keyidx; + int flags; +}; + #ifndef _KERNEL union descriptor; struct dbreg;
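
A minimal userland sketch of how the new interface could be exercised follows; it is illustrative only and not part of the patch. The AMD64_SET_PKRU operation, its flags, and struct amd64_set_pkru are taken from the sysarch.h changes above; the rdpkru()/wrpkru() inline helpers and the PKRU bit layout (two bits per key: access-disable, then write-disable) follow the SDM; error handling is abbreviated.

#include <sys/types.h>
#include <sys/mman.h>
#include <machine/sysarch.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* Illustrative helpers, not part of the patch; instruction encodings per the SDM. */
static inline uint32_t
rdpkru(void)
{
	uint32_t pkru;

	__asm __volatile("rdpkru" : "=a" (pkru) : "c" (0) : "edx");
	return (pkru);
}

static inline void
wrpkru(uint32_t pkru)
{

	__asm __volatile("wrpkru" : : "a" (pkru), "c" (0), "d" (0));
}

int
main(void)
{
	struct amd64_set_pkru spk;
	char *p;
	size_t len;

	len = getpagesize();
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
	    -1, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");

	/* Tag the mapping with protection key index 1. */
	spk.addr = p;
	spk.len = len;
	spk.keyidx = 1;
	spk.flags = 0;
	if (sysarch(AMD64_SET_PKRU, &spk) != 0)
		err(1, "sysarch(AMD64_SET_PKRU)");

	p[0] = 1;		/* PKRU bits for key 1 are clear: access OK */

	/* Set the access-disable bit for key 1; touching p now faults. */
	wrpkru(rdpkru() | (1u << (2 * 1)));

	/* Clear AD and WD for key 1 again and read the byte back. */
	wrpkru(rdpkru() & ~(3u << (2 * 1)));
	printf("%d\n", p[0]);
	return (0);
}

With AMD64_PKRU_PERSIST in flags the key assignment would survive later removal of the mapped range, and AMD64_CLEAR_PKRU removes an assignment; both paths are visible in the sysarch() and pmap changes above.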