amd64: Protect the kernel text, data, and BSS by setting the RW/NX bits
correctly for the data contained on each memory page.

There are several components to this change:
 * Add a variable to indicate the start of the R/W portion of the
   initial memory.
 * Stop detecting NX bit support for each AP.  Instead, use the value
   from the BSP and, if supported, activate the feature on the other
   APs just before loading the correct page table.  (Functionally, we
   already assume that the BSP and all APs had the same support or
   lack of support for the NX bit.)
 * Set the RW and NX bits correctly for the kernel text, data, and
   BSS (subject to some caveats below).
 * Ensure DDB can write to memory when necessary (such as to set a
   breakpoint).
 * Ensure GDB can write to memory when necessary (such as to set a
   breakpoint).  For this purpose, add new MD functions gdb_begin_write()
   and gdb_end_write() which the GDB support code can call before and
   after writing to memory.  (A sketch of the calling pattern follows
   this list.)
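
For illustration, the intended calling pattern looks roughly like this (a
minimal sketch, not code from the change itself; the real caller is the
gdb_rx_mem() hunk further down):

	void *wctx;

	wctx = gdb_begin_write();	/* amd64: saves %cr0 and clears CR0.WP */
	/* ... write breakpoint bytes into possibly read-only memory ... */
	gdb_end_write(wctx);		/* restores the saved %cr0 */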

This change is not comprehensive:
 * It doesn't do anything to protect modules.
 * It doesn't do anything for kernel memory allocated after the kernel
   starts running.
 * To avoid excessive memory waste, it may let multiple types of data
   share a single 2M page, assigning that page the broadest permissions
   needed by any data on it.  (A small worked example of the 2M-page
   rounding follows this list.)
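
To make the sharing caveat concrete, here is a small, self-contained
sketch of the 2M rounding involved (hypothetical address; NBPDR and
round_2mpage() mirror the amd64 pmap definitions):

	#include <stdio.h>
	#include <stdint.h>

	#define NBPDR		(1UL << 21)	/* 2M superpage size, as on amd64 */
	#define round_2mpage(x)	(((uint64_t)(x) + NBPDR - 1) & ~(NBPDR - 1))

	int
	main(void)
	{
		/* Hypothetical end-of-text offset from KERNBASE. */
		uint64_t etext_off = 0xa1c345;

		/*
		 * Read-only data below this boundary shares a 2M page with
		 * text and therefore stays executable; the read-only
		 * protection still prevents it from being modified.
		 */
		printf("text mapped executable up to %#jx\n",
		    (uintmax_t)round_2mpage(etext_off));
		return (0);
	}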

Reviewed by:	jhb, kib
Discussed with:	emaste
MFC after:	2 weeks
Sponsored by:	Netflix
Differential Revision:	https://reviews.freebsd.org/D14282
Author:	Jonathan T. Looney
Date:	2018-03-06 14:28:37 +0000
Commit:	beb2406556 (parent a0d442c0d8; svn revision 330539, svn path /head/)

14 changed files with 207 additions and 11 deletions


@@ -36,6 +36,9 @@ __FBSDID("$FreeBSD$");
 #include <sys/kdb.h>
 #include <sys/pcpu.h>
 
+#include <machine/cpufunc.h>
+#include <machine/specialreg.h>
+
 #include <ddb/ddb.h>
 
 /*
@@ -62,6 +65,9 @@ db_read_bytes(vm_offset_t addr, size_t size, char *data)
 /*
  * Write bytes to kernel address space for debugger.
+ * We need to disable write protection temporarily so we can write
+ * things (such as break points) that might be in write-protected
+ * memory.
  */
 int
 db_write_bytes(vm_offset_t addr, size_t size, char *data)
@@ -69,15 +75,19 @@ db_write_bytes(vm_offset_t addr, size_t size, char *data)
 	jmp_buf jb;
 	void *prev_jb;
 	char *dst;
+	u_long cr0save;
 	int ret;
 
+	cr0save = rcr0();
 	prev_jb = kdb_jmpbuf(jb);
 	ret = setjmp(jb);
 	if (ret == 0) {
+		load_cr0(cr0save & ~CR0_WP);
 		dst = (char *)addr;
 		while (size-- > 0)
 			*dst++ = *data++;
 	}
+	load_cr0(cr0save);
 	(void)kdb_jmpbuf(prev_jb);
 	return (ret);
 }
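
As a usage illustration (hypothetical snippet, not part of this diff):
setting a breakpoint on amd64 means storing the one-byte int3 opcode
(0xcc) into kernel text, which is exactly the write-protected case the
CR0.WP toggle above exists for:

	char bkpt = 0xcc;	/* x86 "int3" breakpoint opcode */

	/* addr: hypothetical text address; error handling is illustrative. */
	if (db_write_bytes(addr, 1, &bkpt) != 0)
		db_printf("breakpoint write faulted\n");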


@@ -36,11 +36,13 @@ __FBSDID("$FreeBSD$");
 #include <sys/proc.h>
 #include <sys/signal.h>
 
+#include <machine/cpufunc.h>
+#include <machine/frame.h>
 #include <machine/gdb_machdep.h>
 #include <machine/pcb.h>
 #include <machine/psl.h>
 #include <machine/reg.h>
+#include <machine/specialreg.h>
 #include <machine/trap.h>
-#include <machine/frame.h>
 
 #include <machine/endian.h>
@@ -121,3 +123,21 @@ gdb_cpu_signal(int type, int code)
 	}
 	return (SIGEMT);
 }
+
+void *
+gdb_begin_write(void)
+{
+	u_long cr0save;
+
+	cr0save = rcr0();
+	load_cr0(cr0save & ~CR0_WP);
+	return ((void *)cr0save);
+}
+
+void
+gdb_end_write(void *arg)
+{
+
+	load_cr0((u_long)arg);
+}


@@ -218,7 +218,7 @@ initializecpu(void)
 	if (!IS_BSP() && (cpu_stdext_feature & CPUID_STDEXT_SMEP))
 		cr4 |= CR4_SMEP;
 	load_cr4(cr4);
-	if ((amd_feature & AMDID_NX) != 0) {
+	if (IS_BSP() && (amd_feature & AMDID_NX) != 0) {
 		msr = rdmsr(MSR_EFER) | EFER_NXE;
 		wrmsr(MSR_EFER, msr);
 		pg_nx = PG_NX;


@@ -221,15 +221,31 @@ mptramp_end:
 /*
  * From here on down is executed in the kernel .text section.
- *
- * Load a real %cr3 that has all the direct map stuff and switches
- * off the 1GB replicated mirror. Load a stack pointer and jump
- * into AP startup code in C.
  */
 
 	.text
 	.code64
 	.p2align 4,0
 entry_64:
+	/*
+	 * If the BSP reported NXE support, enable EFER.NXE for all APs
+	 * prior to loading %cr3. This avoids page faults if the AP
+	 * encounters memory marked with the NX bit prior to detecting and
+	 * enabling NXE support.
+	 */
+	movq	pg_nx, %rbx
+	testq	%rbx, %rbx
+	je	1f
+	movl	$MSR_EFER, %ecx
+	rdmsr
+	orl	$EFER_NXE, %eax
+	wrmsr
+1:
+	/*
+	 * Load a real %cr3 that has all the direct map stuff and switches
+	 * off the 1GB replicated mirror. Load a stack pointer and jump
+	 * into AP startup code in C.
+	 */
 	movq	KPML4phys, %rax
 	movq	%rax, %cr3
 	movq	bootSTK, %rsp
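
In C terms, the assembly above corresponds roughly to this sketch (the
same MSR and flag that initcpu.c manipulates for the BSP):

	if (pg_nx != 0) {		/* BSP detected and enabled NX */
		wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_NXE);
	}
	load_cr3(KPML4phys);		/* safe now: NX-marked PTEs cannot fault us */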


@@ -870,14 +870,64 @@ nkpt_init(vm_paddr_t addr)
 	nkpt = pt_pages;
 }
 
+/*
+ * Returns the proper write/execute permission for a physical page that is
+ * part of the initial boot allocations.
+ *
+ * If the page has kernel text, it is marked as read-only. If the page has
+ * kernel read-only data, it is marked as read-only/not-executable. If the
+ * page has only read-write data, it is marked as read-write/not-executable.
+ * If the page is below/above the kernel range, it is marked as read-write.
+ *
+ * This function operates on 2M pages, since we map the kernel space that
+ * way.
+ *
+ * Note that this doesn't currently provide any protection for modules.
+ */
+static inline pt_entry_t
+bootaddr_rwx(vm_paddr_t pa)
+{
+
+	/*
+	 * Everything in the same 2M page as the start of the kernel
+	 * should be static. On the other hand, things in the same 2M
+	 * page as the end of the kernel could be read-write/executable,
+	 * as the kernel image is not guaranteed to end on a 2M boundary.
+	 */
+	if (pa < trunc_2mpage(btext - KERNBASE) ||
+	    pa >= trunc_2mpage(_end - KERNBASE))
+		return (X86_PG_RW);
+	/*
+	 * The linker should ensure that the read-only and read-write
+	 * portions don't share the same 2M page, so this shouldn't
+	 * impact read-only data. However, in any case, any page with
+	 * read-write data needs to be read-write.
+	 */
+	if (pa >= trunc_2mpage(brwsection - KERNBASE))
+		return (X86_PG_RW | pg_nx);
+	/*
+	 * Mark any 2M page containing kernel text as read-only. Mark
+	 * other pages with read-only data as read-only and not executable.
+	 * (It is likely a small portion of the read-only data section will
+	 * be marked as read-only, but executable. This should be acceptable
+	 * since the read-only protection will keep the data from changing.)
+	 * Note that fixups to the .text section will still work until we
+	 * set CR0.WP.
+	 */
+	if (pa < round_2mpage(etext - KERNBASE))
+		return (0);
+	return (pg_nx);
+}
+
 static void
 create_pagetables(vm_paddr_t *firstaddr)
 {
-	int i, j, ndm1g, nkpdpe;
+	int i, j, ndm1g, nkpdpe, nkdmpde;
 	pt_entry_t *pt_p;
 	pd_entry_t *pd_p;
 	pdp_entry_t *pdp_p;
 	pml4_entry_t *p4_p;
+	uint64_t DMPDkernphys;
 
 	/* Allocate page table pages for the direct map */
 	ndmpdp = howmany(ptoa(Maxmem), NBPDP);
@@ -896,8 +946,20 @@ create_pagetables(vm_paddr_t *firstaddr)
 	}
 	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
 	ndm1g = 0;
-	if ((amd_feature & AMDID_PAGE1GB) != 0)
+	if ((amd_feature & AMDID_PAGE1GB) != 0) {
+		/*
+		 * Calculate the number of 1G pages that will fully fit in
+		 * Maxmem.
+		 */
 		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
+
+		/*
+		 * Allocate 2M pages for the kernel. These will be used in
+		 * place of the first one or more 1G pages from ndm1g.
+		 */
+		nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP);
+		DMPDkernphys = allocpages(firstaddr, nkdmpde);
+	}
 	if (ndm1g < ndmpdp)
 		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
 	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
@@ -923,11 +985,10 @@ create_pagetables(vm_paddr_t *firstaddr)
 	KPDphys = allocpages(firstaddr, nkpdpe);
 
 	/* Fill in the underlying page table pages */
-	/* Nominally read-only (but really R/W) from zero to physfree */
 	/* XXX not fully used, underneath 2M pages */
 	pt_p = (pt_entry_t *)KPTphys;
 	for (i = 0; ptoa(i) < *firstaddr; i++)
-		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | pg_g;
+		pt_p[i] = ptoa(i) | X86_PG_V | pg_g | bootaddr_rwx(ptoa(i));
 
 	/* Now map the page tables at their location within PTmap */
 	pd_p = (pd_entry_t *)KPDphys;
@@ -937,8 +998,8 @@ create_pagetables(vm_paddr_t *firstaddr)
 	/* Map from zero to end of allocations under 2M pages */
 	/* This replaces some of the KPTphys entries above */
 	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
-		pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
-		    pg_g;
+		pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
+		    bootaddr_rwx(i << PDRSHIFT);
 
 	/*
 	 * Because we map the physical blocks in 2M pages, adjust firstaddr
@@ -980,6 +1041,22 @@ create_pagetables(vm_paddr_t *firstaddr)
 		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
 	}
 
+	/*
+	 * Instead of using a 1G page for the memory containing the kernel,
+	 * use 2M pages with appropriate permissions. (If using 1G pages,
+	 * this will partially overwrite the PDPEs above.)
+	 */
+	if (ndm1g) {
+		pd_p = (pd_entry_t *)DMPDkernphys;
+		for (i = 0; i < (NPDEPG * nkdmpde); i++)
+			pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
+			    X86_PG_M | X86_PG_A | pg_nx |
+			    bootaddr_rwx(i << PDRSHIFT);
+		for (i = 0; i < nkdmpde; i++)
+			pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW |
+			    X86_PG_V | PG_U;
+	}
+
 	/* And recursively map PML4 to itself in order to get PTmap */
 	p4_p = (pml4_entry_t *)KPML4phys;
 	p4_p[PML4PML4I] = KPML4phys;
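
Taken together, bootaddr_rwx() above maps the boot image roughly as
follows (physical addresses given as offsets from KERNBASE; a return of 0
means read-only and executable):

	pa <  trunc_2mpage(btext)          -> X86_PG_RW          (early boot allocations)
	      ... round_2mpage(etext)      -> 0                  (kernel text: RO, executable)
	      ... trunc_2mpage(brwsection) -> pg_nx              (read-only data: RO, NX)
	      ... trunc_2mpage(_end)       -> X86_PG_RW | pg_nx  (data and BSS: RW, NX)
	pa >= trunc_2mpage(_end)           -> X86_PG_RW          (later boot allocations)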


@@ -68,7 +68,9 @@ struct cpu_ops {
 };
 
 extern struct cpu_ops cpu_ops;
 
+extern char brwsection[];
 extern char btext[];
+extern char _end[];
 extern char etext[];
 
 /* Resume hook for VMM. */


@@ -47,8 +47,10 @@ gdb_cpu_query(void)
 	return (0);
 }
 
+void *gdb_begin_write(void);
 void *gdb_cpu_getreg(int, size_t *);
 void gdb_cpu_setreg(int, void *);
 int gdb_cpu_signal(int, int);
+void gdb_end_write(void *);
 
 #endif /* !_MACHINE_GDB_MACHDEP_H_ */


@@ -47,6 +47,19 @@ gdb_cpu_query(void)
 	return (0);
 }
 
+static __inline void *
+gdb_begin_write(void)
+{
+
+	return (NULL);
+}
+
+static __inline void
+gdb_end_write(void *arg __unused)
+{
+
+}
+
 void *gdb_cpu_getreg(int, size_t *);
 void gdb_cpu_setreg(int, void *);
 int gdb_cpu_signal(int, int);


@@ -80,6 +80,7 @@ SECTIONS
   /* Adjust the address for the data segment.  We want to adjust up to
      the same address within the page on the next page up.  */
   . = ALIGN (CONSTANT (MAXPAGESIZE)) - ((CONSTANT (MAXPAGESIZE) - .) & (CONSTANT (MAXPAGESIZE) - 1)); . = DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE));
+  PROVIDE (brwsection = .);
   /* Exception handling  */
   .eh_frame : ONLY_IF_RW { KEEP (*(.eh_frame)) }
   .gcc_except_table : ONLY_IF_RW { *(.gcc_except_table .gcc_except_table.*) }


@@ -147,6 +147,7 @@ gdb_rx_mem(unsigned char *addr, size_t size)
 {
 	unsigned char *p;
 	void *prev;
+	void *wctx;
 	jmp_buf jb;
 	size_t cnt;
 	int ret;
@@ -155,6 +156,7 @@ gdb_rx_mem(unsigned char *addr, size_t size)
 	if (size * 2 != gdb_rxsz)
 		return (-1);
 
+	wctx = gdb_begin_write();
 	prev = kdb_jmpbuf(jb);
 	ret = setjmp(jb);
 	if (ret == 0) {
@@ -170,6 +172,7 @@ gdb_rx_mem(unsigned char *addr, size_t size)
 		kdb_cpu_sync_icache(addr, size);
 	}
 	(void)kdb_jmpbuf(prev);
+	gdb_end_write(wctx);
 
 	return ((ret == 0) ? 1 : 0);
 }


@@ -47,6 +47,19 @@ gdb_cpu_query(void)
 	return (0);
 }
 
+static __inline void *
+gdb_begin_write(void)
+{
+
+	return (NULL);
+}
+
+static __inline void
+gdb_end_write(void *arg __unused)
+{
+
+}
+
 void *gdb_cpu_getreg(int, size_t *);
 void gdb_cpu_setreg(int, void *);
 int gdb_cpu_signal(int, int);


@@ -51,6 +51,19 @@ gdb_cpu_query(void)
 	return (0);
 }
 
+static __inline void *
+gdb_begin_write(void)
+{
+
+	return (NULL);
+}
+
+static __inline void
+gdb_end_write(void *arg __unused)
+{
+
+}
+
 void *gdb_cpu_getreg(int, size_t *);
 void gdb_cpu_setreg(int, void *);
 int gdb_cpu_signal(int, int);


@@ -76,6 +76,19 @@ gdb_cpu_query(void)
 	return (0);
 }
 
+static __inline void *
+gdb_begin_write(void)
+{
+
+	return (NULL);
+}
+
+static __inline void
+gdb_end_write(void *arg __unused)
+{
+
+}
+
 void *gdb_cpu_getreg(int, size_t *);
 void gdb_cpu_setreg(int, void *);
 int gdb_cpu_signal(int, int);


@@ -53,6 +53,19 @@ gdb_cpu_signal(int vector, int _)
 	return (vector);
 }
 
+static __inline void *
+gdb_begin_write(void)
+{
+
+	return (NULL);
+}
+
+static __inline void
+gdb_end_write(void *arg __unused)
+{
+
+}
+
 void *gdb_cpu_getreg(int, size_t *);
 void gdb_cpu_setreg(int, void *);