freebsd-dev/sys/amd64/include/asmacros.h

323 lines
10 KiB
C
Raw Normal View History

PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
/* -*- mode: asm -*- */
1994-08-02 07:55:43 +00:00
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
1994-08-02 07:55:43 +00:00
* Copyright (c) 1993 The Regents of the University of California.
* All rights reserved.
*
PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
* Copyright (c) 2018 The FreeBSD Foundation
* All rights reserved.
*
* Portions of this software were developed by
* Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
* the FreeBSD Foundation.
*
1994-08-02 07:55:43 +00:00
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
1994-08-02 07:55:43 +00:00
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
1999-08-28 01:08:13 +00:00
* $FreeBSD$
1994-08-02 07:55:43 +00:00
*/
#ifndef _MACHINE_ASMACROS_H_
#define _MACHINE_ASMACROS_H_
#include <sys/cdefs.h>
/* XXX too much duplication in various asm*.h's. */
/*
* CNAME is used to manage the relationship between symbol names in C
* and the equivalent assembly language names. CNAME is given a name as
* it would be used in a C program. It expands to the equivalent assembly
* language name.
*/
#define CNAME(csym) csym
#define ALIGN_DATA .p2align 3 /* 8 byte alignment, zero filled */
#ifdef GPROF
#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
#else
#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
#endif
#define SUPERALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
First steps in rewriting locore.s, and making info useful when the machine panics. i386/i386/locore.s: 1) got rid of most .set directives that were being used like #define's, and replaced them with appropriate #define's in the appropriate header files (accessed via genassym). 2) added comments to header inclusions and global definitions, and global variables 3) replaced some hardcoded constants with cpp defines (such as PDESIZE and others) 4) aligned all comments to the same column to make them easier to read 5) moved macro definitions for ENTRY, ALIGN, NOP, etc. to /sys/i386/include/asmacros.h 6) added #ifdef BDE_DEBUGGER around all of Bruce's debugger code 7) added new global '_KERNend' to store last location+1 of kernel 8) cleaned up zeroing of bss so that only bss is zeroed 9) fix zeroing of page tables so that it really does zero them all - not just if they follow the bss. 10) rewrote page table initialization code so that 1) works correctly and 2) write protects the kernel text by default 11) properly initialize the kernel page directory, upages, p0stack PT, and page tables. The previous scheme was more than a bit screwy. 12) change allocation of virtual area of IO hole so that it is fixed at KERNBASE + 0xa0000. The previous scheme put it right after the kernel page tables and then later expected it to be at KERNBASE +0xa0000 13) change multiple bogus settings of user read/write of various areas of kernel VM - including the IO hole; we should never be accessing the IO hole in user mode through the kernel page tables 14) split kernel support routines such as bcopy, bzero, copyin, copyout, etc. into a seperate file 'support.s' 15) split swtch and related routines into a seperate 'swtch.s' 16) split routines related to traps, syscalls, and interrupts into a seperate file 'exception.s' 17) remove some unused global variables from locore that got inserted by Garrett when he pulled them out of some .h files. i386/isa/icu.s: 1) clean up global variable declarations 2) move in declaration of astpending and netisr i386/i386/pmap.c: 1) fix calculation of virtual_avail. It previously was calculated to be right in the middle of the kernel page tables - not a good place to start allocating kernel VM. 2) properly allocate kernel page dir/tables etc out of kernel map - previously only took out 2 pages. i386/i386/machdep.c: 1) modify boot() to print a warning that the system will reboot in PANIC_REBOOT_WAIT_TIME amount of seconds, and let the user abort with a key on the console. The machine will wait for ever if a key is typed before the reboot. The default is 15 seconds, but can be set to 0 to mean don't wait at all, -1 to mean wait forever, or any positive value to wait for that many seconds. 2) print "Rebooting..." just before doing it. kern/subr_prf.c: 1) remove PANICWAIT as it is deprecated by the change to machdep.c i386/i386/trap.c: 1) add table of trap type strings and use it to print a real trap/ panic message rather than just a number. Lot's of work to be done here, but this is the first step. Symbolic traceback is in the TODO. i386/i386/Makefile.i386: 1) add support in to build support.s, exception.s and swtch.s ...and various changes to various header files to make all of the above happen.
1993-11-13 02:25:21 +00:00
#define GEN_ENTRY(name) ALIGN_TEXT; .globl CNAME(name); \
.type CNAME(name),@function; CNAME(name):
#define NON_GPROF_ENTRY(name) GEN_ENTRY(name)
#define NON_GPROF_RET .byte 0xc3 /* opcode for `ret' */
New interrupt code from Bruce Evans. In additional to Bruce's attached list of changes, I've made the following additional changes: 1) i386/include/ipl.h renamed to spl.h as the name conflicts with the file of the same name in i386/isa/ipl.h. 2) changed all use of *mask (i.e. netmask, biomask, ttymask, etc) to *_imask (net_imask, etc). 3) changed vestige of splnet use in if_is to splimp. 4) got rid of "impmask" completely (Bruce had gotten rid of netmask), and are now using net_imask instead. 5) dozens of minor cruft to glue in Bruce's changes. These require changes I made to config(8) as well, and thus it must be rebuilt. -DG from Bruce Evans: sio: o No diff is supplied. Remove the define of setsofttty(). I hope that is enough. *.s: o i386/isa/debug.h no longer exists. The event counters became too much trouble to maintain. All function call entry and exception entry counters can be recovered by using profiling kernel (the new profiling supports all entry points; however, it is too slow to leave enabled all the time; it also). Only BDBTRAP() from debug.h is now used. That is moved to exception.s. It might be worth preserving SHOW_BITS() and calling it from _mcount() (if enabled). o T_ASTFLT is now only set just before calling trap(). o All exception handlers set SWI_AST_MASK in cpl as soon as possible after entry and arrange for _doreti to restore it atomically with exiting. It is not possible to set it atomically with entering the kernel, so it must be checked against the user mode bits in the trap frame before committing to using it. There is no place to store the old value of cpl for syscalls or traps, so there are some complications restoring it. Profiling stuff (mostly in *.s): o Changes to kern/subr_mcount.c, gcc and gprof are not supplied yet. o All interesting labels `foo' are renamed `_foo' and all uninteresting labels `_bar' are renamed `bar'. A small change to gprof allows ignoring labels not starting with underscores. o MCOUNT_LABEL() is to provide names for counters for times spent in exception handlers. o FAKE_MCOUNT() is a version of MCOUNT() suitable for exception handlers. Its arg is the pc where the exception occurred. The new mcount() pretends that this was a call from that pc to a suitable MCOUNT_LABEL(). o MEXITCOUNT is to turn off any timer started by MCOUNT(). /usr/src/sys/i386/i386/exception.s: o The non-BDB BPTTRAP() macros were doing a sti even when interrupts were disabled when the trap occurred. The sti (fixed) sti is actually a no-op unless you have my changes to machdep.c that make the debugger trap gates interrupt gates, but fixing that would make the ifdefs messier. ddb seems to be unharmed by both interrupts always disabled and always enabled (I had the branch in the fix back to front for some time :-(). o There is no known pushal bug. o tf_err can be left as garbage for syscalls. /usr/src/sys/i386/i386/locore.s: o Fix and update BDE_DEBUGGER support. o ENTRY(btext) before initialization was dangerous. o Warm boot shot was longer than intended. /usr/src/sys/i386/i386/machdep.c: o DON'T APPLY ALL OF THIS DIFF. It's what I'm using, but may require other changes. Use the following: o Remove aston() and setsoftclock(). Maybe use the following: o No netisr.h. o Spelling fix. o Delay to read the Rebooting message. o Fix for vm system unmapping a reduced area of memory after bounds_check_with_label() reduces the size of a physical i/o for a partition boundary. A similar fix is required in kern_physio.c. o Correct use of __CONCAT. It never worked here for non- ANSI cpp's. Is it time to drop support for non-ANSI? o gdt_segs init. 0xffffffffUL is bogus because ssd_limit is not 32 bits. The replacement may have the same value :-), but is more natural. o physmem was one page too low. Confusing variable names. Don't use the following: o Better numbers of buffers. Each 8K page requires up to 16 buffer headers. On my system, this results in 5576 buffers containing [up to] 2854912 bytes of memory. The usual allocation of about 384 buffers only holds 192K of disk if you use it on an fs with a block size of 512. o gdt changes for bdb. o *TGT -> *IDT changes for bdb. o #ifdefed changes for bdb. /usr/src/sys/i386/i386/microtime.s: o Use the correct asm macros. I think asm.h was copied from Mach just for microtime and isn't used now. It certainly doesn't belong in <sys>. Various macros are also duplicated in sys/i386/boot.h and libc/i386/*.h. o Don't switch to and from the IRR; it is guaranteed to be selected (default after ICU init and explicitly selected in isa.c too, and never changed until the old microtime clobbered it). /usr/src/sys/i386/i386/support.s: o Non-essential changes (none related to spls or profiling). o Removed slow loads of %gs again. The LDT support may require not relying on %gs, but loading it is not the way to fix it! Some places (copyin ...) forgot to load it. Loading it clobbers the user %gs. trap() still loads it after certain types of faults so that fuword() etc can rely on it without loading it explicitly. Exception handlers don't restore it. If we want to preserve the user %gs, then the fastest method is to not touch it except for context switches. Comparing with VM_MAXUSER_ADDRESS and branching takes only 2 or 4 cycles on a 486, while loading %gs takes 9 cycles and using it takes another. o Fixed a signed branch to unsigned. /usr/src/sys/i386/i386/swtch.s: o Move spl0() outside of idle loop. o Remove cli/sti from idle loop. sw1 does a cli, and in the unlikely event of an interrupt occurring and whichqs becoming zero, sw1 will just jump back to _idle. o There's no spl0() function in asm any more, so use splz(). o swtch() doesn't need to be superaligned, at least with the new mcounting. o Fixed a signed branch to unsigned. o Removed astoff(). /usr/src/sys/i386/i386/trap.c: o The decentralized extern decls were inconsistent, of course. o Fixed typo MATH_EMULTATE in comments. */ o Removed unused variables. o Old netmask is now impmask; print it instead. Perhaps we should print some of the new masks. o BTW, trap() should not print anything for normal debugger traps. /usr/src/sys/i386/include/asmacros.h: o DON'T APPLY ALL OF THIS DIFF. Just use some of the null macros as necessary. /usr/src/sys/i386/include/cpu.h: o CLKF_BASEPRI() changes since cpl == SWI_AST_MASK is now normal while the kernel is running. o Don't use var++ to set boolean variables. It fails after a mere 4G times :-) and is slower than storing a constant on [3-4]86s. /usr/src/sys/i386/include/cpufunc.h: o DON'T APPLY ALL OF THIS DIFF. You need mainly the include of <machine/ipl.h>. Unfortunately, <machine/ipl.h> is needed by almost everything for the inlines. /usr/src/sys/i386/include/ipl.h: o New file. Defines spl inlines and SWI macros and declares most variables related to hard and soft interrupt masks. /usr/src/sys/i386/isa/icu.h: o Moved definitions to <machine/ipl.h> /usr/src/sys/i386/isa/icu.s: o Software interrupts (SWIs) and delayed hardware interrupts (HWIs) are now handled uniformally, and dispatching them from splx() is more like dispatching them from _doreti. The dispatcher is essentially *(handler[ffs(ipending & ~cpl)](). o More care (not quite enough) is taken to avoid unbounded nesting of interrupts. o The interface to softclock() is changed so that a trap frame is not required. o Fast interrupt handlers are now handled more uniformally. Configuration is still too early (new handlers would require bits in <machine/ipl.h> and functions to vector.s). o splnnn() and splx() are no longer here; they are inline functions (could be macros for other compilers). splz() is the nontrivial part of the old splx(). /usr/src/sys/i386/isa/ipl.h o New file. Supposed to have only bus-dependent stuff. Perhaps the h/w masks should be declared here. /usr/src/sys/i386/isa/isa.c: o DON'T APPLY ALL OF THIS DIFF. You need only things involving *mask and *MASK and comments about them. netmask is now a pure software mask. It works like the softclock mask. /usr/src/sys/i386/isa/vector.s: o Reorganize AUTO_EOI* macros. o Option FAST_INTR_HANDLER_USERS_ES for people who don't trust fastintr handlers. o fastintr handlers need to metamorphose into ordinary interrupt handlers if their SWI bit has become set. Previously, sio had unintended latency for handling output completions and input of SLIP framing characters because this was not done. /usr/src/sys/net/netisr.h: o The machine-dependent stuff is now imported from <machine/ipl.h>. /usr/src/sys/sys/systm.h o DON'T APPLY ALL OF THIS DIFF. You need mainly the different splx() prototype. The spl*() prototypes are duplicated as inlines in <machine/ipl.h> but they need to be duplicated here in case there are no inlines. I sent systm.h and cpufunc.h to Garrett. We agree that spl0 should be replaced by splnone and not the other way around like I've done. /usr/src/sys/kern/kern_clock.c o splsoftclock() now lowers cpl so the direct call to softclock() works as intended. o softclock() interface changed to avoid passing the whole frame (some machines may need another change for profile_tick()). o profiling renamed _profiling to avoid ANSI namespace pollution. (I had to improve the mcount() interface and may as well fix it.) The GUPROF variant doesn't actually reference profiling here, but the 'U' in GUPROF should mean to select the microtimer mcount() and not change the interface.
1994-04-02 07:00:53 +00:00
#define END(name) .size name, . - name
First steps in rewriting locore.s, and making info useful when the machine panics. i386/i386/locore.s: 1) got rid of most .set directives that were being used like #define's, and replaced them with appropriate #define's in the appropriate header files (accessed via genassym). 2) added comments to header inclusions and global definitions, and global variables 3) replaced some hardcoded constants with cpp defines (such as PDESIZE and others) 4) aligned all comments to the same column to make them easier to read 5) moved macro definitions for ENTRY, ALIGN, NOP, etc. to /sys/i386/include/asmacros.h 6) added #ifdef BDE_DEBUGGER around all of Bruce's debugger code 7) added new global '_KERNend' to store last location+1 of kernel 8) cleaned up zeroing of bss so that only bss is zeroed 9) fix zeroing of page tables so that it really does zero them all - not just if they follow the bss. 10) rewrote page table initialization code so that 1) works correctly and 2) write protects the kernel text by default 11) properly initialize the kernel page directory, upages, p0stack PT, and page tables. The previous scheme was more than a bit screwy. 12) change allocation of virtual area of IO hole so that it is fixed at KERNBASE + 0xa0000. The previous scheme put it right after the kernel page tables and then later expected it to be at KERNBASE +0xa0000 13) change multiple bogus settings of user read/write of various areas of kernel VM - including the IO hole; we should never be accessing the IO hole in user mode through the kernel page tables 14) split kernel support routines such as bcopy, bzero, copyin, copyout, etc. into a seperate file 'support.s' 15) split swtch and related routines into a seperate 'swtch.s' 16) split routines related to traps, syscalls, and interrupts into a seperate file 'exception.s' 17) remove some unused global variables from locore that got inserted by Garrett when he pulled them out of some .h files. i386/isa/icu.s: 1) clean up global variable declarations 2) move in declaration of astpending and netisr i386/i386/pmap.c: 1) fix calculation of virtual_avail. It previously was calculated to be right in the middle of the kernel page tables - not a good place to start allocating kernel VM. 2) properly allocate kernel page dir/tables etc out of kernel map - previously only took out 2 pages. i386/i386/machdep.c: 1) modify boot() to print a warning that the system will reboot in PANIC_REBOOT_WAIT_TIME amount of seconds, and let the user abort with a key on the console. The machine will wait for ever if a key is typed before the reboot. The default is 15 seconds, but can be set to 0 to mean don't wait at all, -1 to mean wait forever, or any positive value to wait for that many seconds. 2) print "Rebooting..." just before doing it. kern/subr_prf.c: 1) remove PANICWAIT as it is deprecated by the change to machdep.c i386/i386/trap.c: 1) add table of trap type strings and use it to print a real trap/ panic message rather than just a number. Lot's of work to be done here, but this is the first step. Symbolic traceback is in the TODO. i386/i386/Makefile.i386: 1) add support in to build support.s, exception.s and swtch.s ...and various changes to various header files to make all of the above happen.
1993-11-13 02:25:21 +00:00
#ifdef GPROF
/*
* __mcount is like [.]mcount except that doesn't require its caller to set
* up a frame pointer. It must be called before pushing anything onto the
* stack. gcc should eventually generate code to call __mcount in most
* cases. This would make -pg in combination with -fomit-frame-pointer
* useful. gcc has a configuration variable PROFILE_BEFORE_PROLOGUE to
* allow profiling before setting up the frame pointer, but this is
* inadequate for good handling of special cases, e.g., -fpic works best
* with profiling after the prologue.
*
* [.]mexitcount is a new function to support non-statistical profiling if an
* accurate clock is available. For C sources, calls to it are generated
* by the FreeBSD extension `-mprofiler-epilogue' to gcc. It is best to
* call [.]mexitcount at the end of a function like the MEXITCOUNT macro does,
* but gcc currently generates calls to it at the start of the epilogue to
* avoid problems with -fpic.
*
* [.]mcount and __mcount may clobber the call-used registers and %ef.
* [.]mexitcount may clobber %ecx and %ef.
*
* Cross-jumping makes non-statistical profiling timing more complicated.
* It is handled in many cases by calling [.]mexitcount before jumping. It
* is handled for conditional jumps using CROSSJUMP() and CROSSJUMP_LABEL().
* It is handled for some fault-handling jumps by not sharing the exit
* routine.
*
* ALTENTRY() must be before a corresponding ENTRY() so that it can jump to
* the main entry point. Note that alt entries are counted twice. They
* have to be counted as ordinary entries for gprof to get the call times
* right for the ordinary entries.
*
* High local labels are used in macros to avoid clashes with local labels
* in functions.
*
* Ordinary `ret' is used instead of a macro `RET' because there are a lot
* of `ret's. 0xc3 is the opcode for `ret' (`#define ret ... ret' can't
* be used because this file is sometimes preprocessed in traditional mode).
* `ret' clobbers eflags but this doesn't matter.
First steps in rewriting locore.s, and making info useful when the machine panics. i386/i386/locore.s: 1) got rid of most .set directives that were being used like #define's, and replaced them with appropriate #define's in the appropriate header files (accessed via genassym). 2) added comments to header inclusions and global definitions, and global variables 3) replaced some hardcoded constants with cpp defines (such as PDESIZE and others) 4) aligned all comments to the same column to make them easier to read 5) moved macro definitions for ENTRY, ALIGN, NOP, etc. to /sys/i386/include/asmacros.h 6) added #ifdef BDE_DEBUGGER around all of Bruce's debugger code 7) added new global '_KERNend' to store last location+1 of kernel 8) cleaned up zeroing of bss so that only bss is zeroed 9) fix zeroing of page tables so that it really does zero them all - not just if they follow the bss. 10) rewrote page table initialization code so that 1) works correctly and 2) write protects the kernel text by default 11) properly initialize the kernel page directory, upages, p0stack PT, and page tables. The previous scheme was more than a bit screwy. 12) change allocation of virtual area of IO hole so that it is fixed at KERNBASE + 0xa0000. The previous scheme put it right after the kernel page tables and then later expected it to be at KERNBASE +0xa0000 13) change multiple bogus settings of user read/write of various areas of kernel VM - including the IO hole; we should never be accessing the IO hole in user mode through the kernel page tables 14) split kernel support routines such as bcopy, bzero, copyin, copyout, etc. into a seperate file 'support.s' 15) split swtch and related routines into a seperate 'swtch.s' 16) split routines related to traps, syscalls, and interrupts into a seperate file 'exception.s' 17) remove some unused global variables from locore that got inserted by Garrett when he pulled them out of some .h files. i386/isa/icu.s: 1) clean up global variable declarations 2) move in declaration of astpending and netisr i386/i386/pmap.c: 1) fix calculation of virtual_avail. It previously was calculated to be right in the middle of the kernel page tables - not a good place to start allocating kernel VM. 2) properly allocate kernel page dir/tables etc out of kernel map - previously only took out 2 pages. i386/i386/machdep.c: 1) modify boot() to print a warning that the system will reboot in PANIC_REBOOT_WAIT_TIME amount of seconds, and let the user abort with a key on the console. The machine will wait for ever if a key is typed before the reboot. The default is 15 seconds, but can be set to 0 to mean don't wait at all, -1 to mean wait forever, or any positive value to wait for that many seconds. 2) print "Rebooting..." just before doing it. kern/subr_prf.c: 1) remove PANICWAIT as it is deprecated by the change to machdep.c i386/i386/trap.c: 1) add table of trap type strings and use it to print a real trap/ panic message rather than just a number. Lot's of work to be done here, but this is the first step. Symbolic traceback is in the TODO. i386/i386/Makefile.i386: 1) add support in to build support.s, exception.s and swtch.s ...and various changes to various header files to make all of the above happen.
1993-11-13 02:25:21 +00:00
*/
#define ALTENTRY(name) GEN_ENTRY(name) ; MCOUNT ; MEXITCOUNT ; jmp 9f
#define CROSSJUMP(jtrue, label, jfalse) \
jfalse 8f; MEXITCOUNT; jmp __CONCAT(to,label); 8:
#define CROSSJUMPTARGET(label) \
ALIGN_TEXT; __CONCAT(to,label): ; MCOUNT; jmp label
#define ENTRY(name) GEN_ENTRY(name) ; 9: ; MCOUNT
#define FAKE_MCOUNT(caller) pushq caller ; call __mcount ; popq %rcx
#define MCOUNT call __mcount
#define MCOUNT_LABEL(name) GEN_ENTRY(name) ; nop ; ALIGN_TEXT
#ifdef GUPROF
#define MEXITCOUNT call .mexitcount
#define ret MEXITCOUNT ; NON_GPROF_RET
#else
#define MEXITCOUNT
#endif
#else /* !GPROF */
First steps in rewriting locore.s, and making info useful when the machine panics. i386/i386/locore.s: 1) got rid of most .set directives that were being used like #define's, and replaced them with appropriate #define's in the appropriate header files (accessed via genassym). 2) added comments to header inclusions and global definitions, and global variables 3) replaced some hardcoded constants with cpp defines (such as PDESIZE and others) 4) aligned all comments to the same column to make them easier to read 5) moved macro definitions for ENTRY, ALIGN, NOP, etc. to /sys/i386/include/asmacros.h 6) added #ifdef BDE_DEBUGGER around all of Bruce's debugger code 7) added new global '_KERNend' to store last location+1 of kernel 8) cleaned up zeroing of bss so that only bss is zeroed 9) fix zeroing of page tables so that it really does zero them all - not just if they follow the bss. 10) rewrote page table initialization code so that 1) works correctly and 2) write protects the kernel text by default 11) properly initialize the kernel page directory, upages, p0stack PT, and page tables. The previous scheme was more than a bit screwy. 12) change allocation of virtual area of IO hole so that it is fixed at KERNBASE + 0xa0000. The previous scheme put it right after the kernel page tables and then later expected it to be at KERNBASE +0xa0000 13) change multiple bogus settings of user read/write of various areas of kernel VM - including the IO hole; we should never be accessing the IO hole in user mode through the kernel page tables 14) split kernel support routines such as bcopy, bzero, copyin, copyout, etc. into a seperate file 'support.s' 15) split swtch and related routines into a seperate 'swtch.s' 16) split routines related to traps, syscalls, and interrupts into a seperate file 'exception.s' 17) remove some unused global variables from locore that got inserted by Garrett when he pulled them out of some .h files. i386/isa/icu.s: 1) clean up global variable declarations 2) move in declaration of astpending and netisr i386/i386/pmap.c: 1) fix calculation of virtual_avail. It previously was calculated to be right in the middle of the kernel page tables - not a good place to start allocating kernel VM. 2) properly allocate kernel page dir/tables etc out of kernel map - previously only took out 2 pages. i386/i386/machdep.c: 1) modify boot() to print a warning that the system will reboot in PANIC_REBOOT_WAIT_TIME amount of seconds, and let the user abort with a key on the console. The machine will wait for ever if a key is typed before the reboot. The default is 15 seconds, but can be set to 0 to mean don't wait at all, -1 to mean wait forever, or any positive value to wait for that many seconds. 2) print "Rebooting..." just before doing it. kern/subr_prf.c: 1) remove PANICWAIT as it is deprecated by the change to machdep.c i386/i386/trap.c: 1) add table of trap type strings and use it to print a real trap/ panic message rather than just a number. Lot's of work to be done here, but this is the first step. Symbolic traceback is in the TODO. i386/i386/Makefile.i386: 1) add support in to build support.s, exception.s and swtch.s ...and various changes to various header files to make all of the above happen.
1993-11-13 02:25:21 +00:00
/*
* ALTENTRY() has to align because it is before a corresponding ENTRY().
* ENTRY() has to align to because there may be no ALTENTRY() before it.
* If there is a previous ALTENTRY() then the alignment code for ENTRY()
* is empty.
First steps in rewriting locore.s, and making info useful when the machine panics. i386/i386/locore.s: 1) got rid of most .set directives that were being used like #define's, and replaced them with appropriate #define's in the appropriate header files (accessed via genassym). 2) added comments to header inclusions and global definitions, and global variables 3) replaced some hardcoded constants with cpp defines (such as PDESIZE and others) 4) aligned all comments to the same column to make them easier to read 5) moved macro definitions for ENTRY, ALIGN, NOP, etc. to /sys/i386/include/asmacros.h 6) added #ifdef BDE_DEBUGGER around all of Bruce's debugger code 7) added new global '_KERNend' to store last location+1 of kernel 8) cleaned up zeroing of bss so that only bss is zeroed 9) fix zeroing of page tables so that it really does zero them all - not just if they follow the bss. 10) rewrote page table initialization code so that 1) works correctly and 2) write protects the kernel text by default 11) properly initialize the kernel page directory, upages, p0stack PT, and page tables. The previous scheme was more than a bit screwy. 12) change allocation of virtual area of IO hole so that it is fixed at KERNBASE + 0xa0000. The previous scheme put it right after the kernel page tables and then later expected it to be at KERNBASE +0xa0000 13) change multiple bogus settings of user read/write of various areas of kernel VM - including the IO hole; we should never be accessing the IO hole in user mode through the kernel page tables 14) split kernel support routines such as bcopy, bzero, copyin, copyout, etc. into a seperate file 'support.s' 15) split swtch and related routines into a seperate 'swtch.s' 16) split routines related to traps, syscalls, and interrupts into a seperate file 'exception.s' 17) remove some unused global variables from locore that got inserted by Garrett when he pulled them out of some .h files. i386/isa/icu.s: 1) clean up global variable declarations 2) move in declaration of astpending and netisr i386/i386/pmap.c: 1) fix calculation of virtual_avail. It previously was calculated to be right in the middle of the kernel page tables - not a good place to start allocating kernel VM. 2) properly allocate kernel page dir/tables etc out of kernel map - previously only took out 2 pages. i386/i386/machdep.c: 1) modify boot() to print a warning that the system will reboot in PANIC_REBOOT_WAIT_TIME amount of seconds, and let the user abort with a key on the console. The machine will wait for ever if a key is typed before the reboot. The default is 15 seconds, but can be set to 0 to mean don't wait at all, -1 to mean wait forever, or any positive value to wait for that many seconds. 2) print "Rebooting..." just before doing it. kern/subr_prf.c: 1) remove PANICWAIT as it is deprecated by the change to machdep.c i386/i386/trap.c: 1) add table of trap type strings and use it to print a real trap/ panic message rather than just a number. Lot's of work to be done here, but this is the first step. Symbolic traceback is in the TODO. i386/i386/Makefile.i386: 1) add support in to build support.s, exception.s and swtch.s ...and various changes to various header files to make all of the above happen.
1993-11-13 02:25:21 +00:00
*/
#define ALTENTRY(name) GEN_ENTRY(name)
#define CROSSJUMP(jtrue, label, jfalse) jtrue label
#define CROSSJUMPTARGET(label)
#define ENTRY(name) GEN_ENTRY(name)
#define FAKE_MCOUNT(caller)
New interrupt code from Bruce Evans. In additional to Bruce's attached list of changes, I've made the following additional changes: 1) i386/include/ipl.h renamed to spl.h as the name conflicts with the file of the same name in i386/isa/ipl.h. 2) changed all use of *mask (i.e. netmask, biomask, ttymask, etc) to *_imask (net_imask, etc). 3) changed vestige of splnet use in if_is to splimp. 4) got rid of "impmask" completely (Bruce had gotten rid of netmask), and are now using net_imask instead. 5) dozens of minor cruft to glue in Bruce's changes. These require changes I made to config(8) as well, and thus it must be rebuilt. -DG from Bruce Evans: sio: o No diff is supplied. Remove the define of setsofttty(). I hope that is enough. *.s: o i386/isa/debug.h no longer exists. The event counters became too much trouble to maintain. All function call entry and exception entry counters can be recovered by using profiling kernel (the new profiling supports all entry points; however, it is too slow to leave enabled all the time; it also). Only BDBTRAP() from debug.h is now used. That is moved to exception.s. It might be worth preserving SHOW_BITS() and calling it from _mcount() (if enabled). o T_ASTFLT is now only set just before calling trap(). o All exception handlers set SWI_AST_MASK in cpl as soon as possible after entry and arrange for _doreti to restore it atomically with exiting. It is not possible to set it atomically with entering the kernel, so it must be checked against the user mode bits in the trap frame before committing to using it. There is no place to store the old value of cpl for syscalls or traps, so there are some complications restoring it. Profiling stuff (mostly in *.s): o Changes to kern/subr_mcount.c, gcc and gprof are not supplied yet. o All interesting labels `foo' are renamed `_foo' and all uninteresting labels `_bar' are renamed `bar'. A small change to gprof allows ignoring labels not starting with underscores. o MCOUNT_LABEL() is to provide names for counters for times spent in exception handlers. o FAKE_MCOUNT() is a version of MCOUNT() suitable for exception handlers. Its arg is the pc where the exception occurred. The new mcount() pretends that this was a call from that pc to a suitable MCOUNT_LABEL(). o MEXITCOUNT is to turn off any timer started by MCOUNT(). /usr/src/sys/i386/i386/exception.s: o The non-BDB BPTTRAP() macros were doing a sti even when interrupts were disabled when the trap occurred. The sti (fixed) sti is actually a no-op unless you have my changes to machdep.c that make the debugger trap gates interrupt gates, but fixing that would make the ifdefs messier. ddb seems to be unharmed by both interrupts always disabled and always enabled (I had the branch in the fix back to front for some time :-(). o There is no known pushal bug. o tf_err can be left as garbage for syscalls. /usr/src/sys/i386/i386/locore.s: o Fix and update BDE_DEBUGGER support. o ENTRY(btext) before initialization was dangerous. o Warm boot shot was longer than intended. /usr/src/sys/i386/i386/machdep.c: o DON'T APPLY ALL OF THIS DIFF. It's what I'm using, but may require other changes. Use the following: o Remove aston() and setsoftclock(). Maybe use the following: o No netisr.h. o Spelling fix. o Delay to read the Rebooting message. o Fix for vm system unmapping a reduced area of memory after bounds_check_with_label() reduces the size of a physical i/o for a partition boundary. A similar fix is required in kern_physio.c. o Correct use of __CONCAT. It never worked here for non- ANSI cpp's. Is it time to drop support for non-ANSI? o gdt_segs init. 0xffffffffUL is bogus because ssd_limit is not 32 bits. The replacement may have the same value :-), but is more natural. o physmem was one page too low. Confusing variable names. Don't use the following: o Better numbers of buffers. Each 8K page requires up to 16 buffer headers. On my system, this results in 5576 buffers containing [up to] 2854912 bytes of memory. The usual allocation of about 384 buffers only holds 192K of disk if you use it on an fs with a block size of 512. o gdt changes for bdb. o *TGT -> *IDT changes for bdb. o #ifdefed changes for bdb. /usr/src/sys/i386/i386/microtime.s: o Use the correct asm macros. I think asm.h was copied from Mach just for microtime and isn't used now. It certainly doesn't belong in <sys>. Various macros are also duplicated in sys/i386/boot.h and libc/i386/*.h. o Don't switch to and from the IRR; it is guaranteed to be selected (default after ICU init and explicitly selected in isa.c too, and never changed until the old microtime clobbered it). /usr/src/sys/i386/i386/support.s: o Non-essential changes (none related to spls or profiling). o Removed slow loads of %gs again. The LDT support may require not relying on %gs, but loading it is not the way to fix it! Some places (copyin ...) forgot to load it. Loading it clobbers the user %gs. trap() still loads it after certain types of faults so that fuword() etc can rely on it without loading it explicitly. Exception handlers don't restore it. If we want to preserve the user %gs, then the fastest method is to not touch it except for context switches. Comparing with VM_MAXUSER_ADDRESS and branching takes only 2 or 4 cycles on a 486, while loading %gs takes 9 cycles and using it takes another. o Fixed a signed branch to unsigned. /usr/src/sys/i386/i386/swtch.s: o Move spl0() outside of idle loop. o Remove cli/sti from idle loop. sw1 does a cli, and in the unlikely event of an interrupt occurring and whichqs becoming zero, sw1 will just jump back to _idle. o There's no spl0() function in asm any more, so use splz(). o swtch() doesn't need to be superaligned, at least with the new mcounting. o Fixed a signed branch to unsigned. o Removed astoff(). /usr/src/sys/i386/i386/trap.c: o The decentralized extern decls were inconsistent, of course. o Fixed typo MATH_EMULTATE in comments. */ o Removed unused variables. o Old netmask is now impmask; print it instead. Perhaps we should print some of the new masks. o BTW, trap() should not print anything for normal debugger traps. /usr/src/sys/i386/include/asmacros.h: o DON'T APPLY ALL OF THIS DIFF. Just use some of the null macros as necessary. /usr/src/sys/i386/include/cpu.h: o CLKF_BASEPRI() changes since cpl == SWI_AST_MASK is now normal while the kernel is running. o Don't use var++ to set boolean variables. It fails after a mere 4G times :-) and is slower than storing a constant on [3-4]86s. /usr/src/sys/i386/include/cpufunc.h: o DON'T APPLY ALL OF THIS DIFF. You need mainly the include of <machine/ipl.h>. Unfortunately, <machine/ipl.h> is needed by almost everything for the inlines. /usr/src/sys/i386/include/ipl.h: o New file. Defines spl inlines and SWI macros and declares most variables related to hard and soft interrupt masks. /usr/src/sys/i386/isa/icu.h: o Moved definitions to <machine/ipl.h> /usr/src/sys/i386/isa/icu.s: o Software interrupts (SWIs) and delayed hardware interrupts (HWIs) are now handled uniformally, and dispatching them from splx() is more like dispatching them from _doreti. The dispatcher is essentially *(handler[ffs(ipending & ~cpl)](). o More care (not quite enough) is taken to avoid unbounded nesting of interrupts. o The interface to softclock() is changed so that a trap frame is not required. o Fast interrupt handlers are now handled more uniformally. Configuration is still too early (new handlers would require bits in <machine/ipl.h> and functions to vector.s). o splnnn() and splx() are no longer here; they are inline functions (could be macros for other compilers). splz() is the nontrivial part of the old splx(). /usr/src/sys/i386/isa/ipl.h o New file. Supposed to have only bus-dependent stuff. Perhaps the h/w masks should be declared here. /usr/src/sys/i386/isa/isa.c: o DON'T APPLY ALL OF THIS DIFF. You need only things involving *mask and *MASK and comments about them. netmask is now a pure software mask. It works like the softclock mask. /usr/src/sys/i386/isa/vector.s: o Reorganize AUTO_EOI* macros. o Option FAST_INTR_HANDLER_USERS_ES for people who don't trust fastintr handlers. o fastintr handlers need to metamorphose into ordinary interrupt handlers if their SWI bit has become set. Previously, sio had unintended latency for handling output completions and input of SLIP framing characters because this was not done. /usr/src/sys/net/netisr.h: o The machine-dependent stuff is now imported from <machine/ipl.h>. /usr/src/sys/sys/systm.h o DON'T APPLY ALL OF THIS DIFF. You need mainly the different splx() prototype. The spl*() prototypes are duplicated as inlines in <machine/ipl.h> but they need to be duplicated here in case there are no inlines. I sent systm.h and cpufunc.h to Garrett. We agree that spl0 should be replaced by splnone and not the other way around like I've done. /usr/src/sys/kern/kern_clock.c o splsoftclock() now lowers cpl so the direct call to softclock() works as intended. o softclock() interface changed to avoid passing the whole frame (some machines may need another change for profile_tick()). o profiling renamed _profiling to avoid ANSI namespace pollution. (I had to improve the mcount() interface and may as well fix it.) The GUPROF variant doesn't actually reference profiling here, but the 'U' in GUPROF should mean to select the microtimer mcount() and not change the interface.
1994-04-02 07:00:53 +00:00
#define MCOUNT
#define MCOUNT_LABEL(name)
#define MEXITCOUNT
#endif /* GPROF */
First steps in rewriting locore.s, and making info useful when the machine panics. i386/i386/locore.s: 1) got rid of most .set directives that were being used like #define's, and replaced them with appropriate #define's in the appropriate header files (accessed via genassym). 2) added comments to header inclusions and global definitions, and global variables 3) replaced some hardcoded constants with cpp defines (such as PDESIZE and others) 4) aligned all comments to the same column to make them easier to read 5) moved macro definitions for ENTRY, ALIGN, NOP, etc. to /sys/i386/include/asmacros.h 6) added #ifdef BDE_DEBUGGER around all of Bruce's debugger code 7) added new global '_KERNend' to store last location+1 of kernel 8) cleaned up zeroing of bss so that only bss is zeroed 9) fix zeroing of page tables so that it really does zero them all - not just if they follow the bss. 10) rewrote page table initialization code so that 1) works correctly and 2) write protects the kernel text by default 11) properly initialize the kernel page directory, upages, p0stack PT, and page tables. The previous scheme was more than a bit screwy. 12) change allocation of virtual area of IO hole so that it is fixed at KERNBASE + 0xa0000. The previous scheme put it right after the kernel page tables and then later expected it to be at KERNBASE +0xa0000 13) change multiple bogus settings of user read/write of various areas of kernel VM - including the IO hole; we should never be accessing the IO hole in user mode through the kernel page tables 14) split kernel support routines such as bcopy, bzero, copyin, copyout, etc. into a seperate file 'support.s' 15) split swtch and related routines into a seperate 'swtch.s' 16) split routines related to traps, syscalls, and interrupts into a seperate file 'exception.s' 17) remove some unused global variables from locore that got inserted by Garrett when he pulled them out of some .h files. i386/isa/icu.s: 1) clean up global variable declarations 2) move in declaration of astpending and netisr i386/i386/pmap.c: 1) fix calculation of virtual_avail. It previously was calculated to be right in the middle of the kernel page tables - not a good place to start allocating kernel VM. 2) properly allocate kernel page dir/tables etc out of kernel map - previously only took out 2 pages. i386/i386/machdep.c: 1) modify boot() to print a warning that the system will reboot in PANIC_REBOOT_WAIT_TIME amount of seconds, and let the user abort with a key on the console. The machine will wait for ever if a key is typed before the reboot. The default is 15 seconds, but can be set to 0 to mean don't wait at all, -1 to mean wait forever, or any positive value to wait for that many seconds. 2) print "Rebooting..." just before doing it. kern/subr_prf.c: 1) remove PANICWAIT as it is deprecated by the change to machdep.c i386/i386/trap.c: 1) add table of trap type strings and use it to print a real trap/ panic message rather than just a number. Lot's of work to be done here, but this is the first step. Symbolic traceback is in the TODO. i386/i386/Makefile.i386: 1) add support in to build support.s, exception.s and swtch.s ...and various changes to various header files to make all of the above happen.
1993-11-13 02:25:21 +00:00
/*
* Convenience for adding frame pointers to hand-coded ASM. Useful for
* DTrace, HWPMC, and KDB.
*/
#define PUSH_FRAME_POINTER \
pushq %rbp ; \
movq %rsp, %rbp ;
#define POP_FRAME_POINTER \
popq %rbp
#ifdef LOCORE
/*
* Access per-CPU data.
*/
#define PCPU(member) %gs:PC_ ## member
#define PCPU_ADDR(member, reg) \
movq %gs:PC_PRVSPACE, reg ; \
addq $PC_ ## member, reg
PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
/*
* Convenience macro for declaring interrupt entry points.
*/
#define IDTVEC(name) ALIGN_TEXT; .globl __CONCAT(X,name); \
.type __CONCAT(X,name),@function; __CONCAT(X,name):
.macro SAVE_SEGS
movw %fs,TF_FS(%rsp)
movw %gs,TF_GS(%rsp)
movw %es,TF_ES(%rsp)
movw %ds,TF_DS(%rsp)
.endm
.macro MOVE_STACKS qw
.L.offset=0
PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
.rept \qw
movq .L.offset(%rsp),%rdx
movq %rdx,.L.offset(%rax)
.L.offset=.L.offset+8
PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
.endr
.endm
.macro PTI_UUENTRY has_err
PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
movq PCPU(KCR3),%rax
movq %rax,%cr3
movq PCPU(RSP0),%rax
subq $PTI_SIZE - 8 * (1 - \has_err),%rax
MOVE_STACKS ((PTI_SIZE / 8) - 1 + \has_err)
PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
movq %rax,%rsp
popq %rdx
popq %rax
.endm
.macro PTI_UENTRY has_err
swapgs
lfence
cmpq $~0,PCPU(UCR3)
je 1f
pushq %rax
pushq %rdx
PTI_UUENTRY \has_err
1:
.endm
.macro PTI_ENTRY name, contk, contu, has_err=0
PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
ALIGN_TEXT
.globl X\name\()_pti
.type X\name\()_pti,@function
X\name\()_pti:
/* %rax, %rdx, and possibly err are not yet pushed */
testb $SEL_RPL_MASK,PTI_CS-PTI_ERR-((1-\has_err)*8)(%rsp)
jz \contk
PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
PTI_UENTRY \has_err
jmp \contu
PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
.endm
.macro PTI_INTRENTRY vec_name
SUPERALIGN_TEXT
.globl X\vec_name\()_pti
.type X\vec_name\()_pti,@function
X\vec_name\()_pti:
testb $SEL_RPL_MASK,PTI_CS-3*8(%rsp) /* err, %rax, %rdx not pushed */
jz .L\vec_name\()_u
PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
PTI_UENTRY has_err=0
jmp .L\vec_name\()_u
PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
.endm
.macro INTR_PUSH_FRAME vec_name
SUPERALIGN_TEXT
.globl X\vec_name
.type X\vec_name,@function
X\vec_name:
testb $SEL_RPL_MASK,PTI_CS-3*8(%rsp) /* come from kernel? */
jz .L\vec_name\()_u /* Yes, dont swapgs again */
PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
swapgs
.L\vec_name\()_u:
lfence
PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
subq $TF_RIP,%rsp /* skip dummy tf_err and tf_trapno */
movq %rdi,TF_RDI(%rsp)
movq %rsi,TF_RSI(%rsp)
movq %rdx,TF_RDX(%rsp)
movq %rcx,TF_RCX(%rsp)
movq %r8,TF_R8(%rsp)
movq %r9,TF_R9(%rsp)
movq %rax,TF_RAX(%rsp)
movq %rbx,TF_RBX(%rsp)
movq %rbp,TF_RBP(%rsp)
movq %r10,TF_R10(%rsp)
movq %r11,TF_R11(%rsp)
movq %r12,TF_R12(%rsp)
movq %r13,TF_R13(%rsp)
movq %r14,TF_R14(%rsp)
movq %r15,TF_R15(%rsp)
SAVE_SEGS
movl $TF_HASSEGS,TF_FLAGS(%rsp)
pushfq
andq $~(PSL_D|PSL_AC),(%rsp)
popfq
PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
testb $SEL_RPL_MASK,TF_CS(%rsp) /* come from kernel ? */
jz 1f /* yes, leave PCB_FULL_IRET alone */
movq PCPU(CURPCB),%r8
andl $~PCB_FULL_IRET,PCB_FLAGS(%r8)
call handle_ibrs_entry
PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
1:
.endm
.macro INTR_HANDLER vec_name
.text
PTI_INTRENTRY \vec_name
INTR_PUSH_FRAME \vec_name
.endm
.macro RESTORE_REGS
movq TF_RDI(%rsp),%rdi
movq TF_RSI(%rsp),%rsi
movq TF_RDX(%rsp),%rdx
movq TF_RCX(%rsp),%rcx
movq TF_R8(%rsp),%r8
movq TF_R9(%rsp),%r9
movq TF_RAX(%rsp),%rax
movq TF_RBX(%rsp),%rbx
movq TF_RBP(%rsp),%rbp
movq TF_R10(%rsp),%r10
movq TF_R11(%rsp),%r11
movq TF_R12(%rsp),%r12
movq TF_R13(%rsp),%r13
movq TF_R14(%rsp),%r14
movq TF_R15(%rsp),%r15
.endm
#endif /* LOCORE */
#ifdef __STDC__
#define ELFNOTE(name, type, desctype, descdata...) \
.pushsection .note.name ; \
.align 4 ; \
.long 2f - 1f /* namesz */ ; \
.long 4f - 3f /* descsz */ ; \
.long type ; \
1:.asciz #name ; \
2:.align 4 ; \
3:desctype descdata ; \
4:.align 4 ; \
.popsection
#else /* !__STDC__, i.e. -traditional */
#define ELFNOTE(name, type, desctype, descdata) \
.pushsection .note.name ; \
.align 4 ; \
.long 2f - 1f /* namesz */ ; \
.long 4f - 3f /* descsz */ ; \
.long type ; \
1:.asciz "name" ; \
2:.align 4 ; \
3:desctype descdata ; \
4:.align 4 ; \
.popsection
#endif /* __STDC__ */
#endif /* !_MACHINE_ASMACROS_H_ */