From 5730afc9b6405f9325dfbbf7164dd6c818191ff5 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 22 Mar 2012 04:52:51 +0000 Subject: [PATCH] Handle spurious page faults that may occur in no-fault sections of the kernel. When access restrictions are added to a page table entry, we flush the corresponding virtual address mapping from the TLB. In contrast, when access restrictions are removed from a page table entry, we do not flush the virtual address mapping from the TLB. This is exactly as recommended in AMD's documentation. In effect, when access restrictions are removed from a page table entry, AMD's MMUs will transparently refresh a stale TLB entry. In short, this saves us from having to perform potentially costly TLB flushes. In contrast, Intel's MMUs are allowed to generate a spurious page fault based upon the stale TLB entry. Usually, such spurious page faults are handled by vm_fault() without incident. However, when we are executing no-fault sections of the kernel, we are not allowed to execute vm_fault(). This change introduces special-case handling for spurious page faults that occur in no-fault sections of the kernel. In collaboration with: kib Tested by: gibbs (an earlier version) I would also like to acknowledge Hiroki Sato's assistance in diagnosing this problem. MFC after: 1 week --- sys/amd64/amd64/trap.c | 64 +++++++++++++++++++++++++------------ sys/amd64/include/proc.h | 1 + sys/i386/i386/trap.c | 69 ++++++++++++++++++++++++++++------------ sys/i386/include/proc.h | 1 + sys/kern/kern_sysctl.c | 12 ++++--- sys/kern/subr_uio.c | 8 +++-- sys/sys/proc.h | 1 + sys/vm/vm_fault.c | 8 ++++- 8 files changed, 117 insertions(+), 47 deletions(-) diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 28bf79d77103..a685f1956453 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -301,26 +301,6 @@ trap(struct trapframe *frame) } code = frame->tf_err; - if (type == T_PAGEFLT) { - /* - * If we get a page fault while in a critical section, then - * it is most likely a fatal kernel page fault. The kernel - * is already going to panic trying to get a sleep lock to - * do the VM lookup, so just consider it a fatal trap so the - * kernel can print out a useful trap message and even get - * to the debugger. - * - * If we get a page fault while holding a non-sleepable - * lock, then it is most likely a fatal kernel page fault. - * If WITNESS is enabled, then it's going to whine about - * bogus LORs with various VM locks, so just skip to the - * fatal trap handling directly. - */ - if (td->td_critnest != 0 || - WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, - "Kernel page fault") != 0) - trap_fatal(frame, frame->tf_addr); - } if (ISPL(frame->tf_cs) == SEL_UPL) { /* user trap */ @@ -653,6 +633,50 @@ trap_pfault(frame, usermode) struct proc *p = td->td_proc; vm_offset_t eva = frame->tf_addr; + if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { + /* + * Due to both processor errata and lazy TLB invalidation when + * access restrictions are removed from virtual pages, memory + * accesses that are allowed by the physical mapping layer may + * nonetheless cause one spurious page fault per virtual page. + * When the thread is executing a "no faulting" section that + * is bracketed by vm_fault_{disable,enable}_pagefaults(), + * every page fault is treated as a spurious page fault, + * unless it accesses the same virtual address as the most + * recent page fault within the same "no faulting" section. + */ + if (td->td_md.md_spurflt_addr != eva || + (td->td_pflags & TDP_RESETSPUR) != 0) { + /* + * Do nothing to the TLB. A stale TLB entry is + * flushed automatically by a page fault. + */ + td->td_md.md_spurflt_addr = eva; + td->td_pflags &= ~TDP_RESETSPUR; + return (0); + } + } else { + /* + * If we get a page fault while in a critical section, then + * it is most likely a fatal kernel page fault. The kernel + * is already going to panic trying to get a sleep lock to + * do the VM lookup, so just consider it a fatal trap so the + * kernel can print out a useful trap message and even get + * to the debugger. + * + * If we get a page fault while holding a non-sleepable + * lock, then it is most likely a fatal kernel page fault. + * If WITNESS is enabled, then it's going to whine about + * bogus LORs with various VM locks, so just skip to the + * fatal trap handling directly. + */ + if (td->td_critnest != 0 || + WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, + "Kernel page fault") != 0) { + trap_fatal(frame, eva); + return (-1); + } + } va = trunc_page(eva); if (va >= VM_MIN_KERNEL_ADDRESS) { /* diff --git a/sys/amd64/include/proc.h b/sys/amd64/include/proc.h index 5207d89acede..14585fbffe1c 100644 --- a/sys/amd64/include/proc.h +++ b/sys/amd64/include/proc.h @@ -46,6 +46,7 @@ struct proc_ldt { struct mdthread { int md_spinlock_count; /* (k) */ register_t md_saved_flags; /* (k) */ + register_t md_spurflt_addr; /* (k) Spurious page fault address. */ }; struct mdproc { diff --git a/sys/i386/i386/trap.c b/sys/i386/i386/trap.c index 05572b97b7ee..005646dc17a3 100644 --- a/sys/i386/i386/trap.c +++ b/sys/i386/i386/trap.c @@ -330,28 +330,13 @@ trap(struct trapframe *frame) * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We - * are finally ready to read %cr2 and then must - * reenable interrupts. - * - * If we get a page fault while in a critical section, then - * it is most likely a fatal kernel page fault. The kernel - * is already going to panic trying to get a sleep lock to - * do the VM lookup, so just consider it a fatal trap so the - * kernel can print out a useful trap message and even get - * to the debugger. - * - * If we get a page fault while holding a non-sleepable - * lock, then it is most likely a fatal kernel page fault. - * If WITNESS is enabled, then it's going to whine about - * bogus LORs with various VM locks, so just skip to the - * fatal trap handling directly. + * are finally ready to read %cr2 and conditionally + * reenable interrupts. If we hold a spin lock, then + * we must not reenable interrupts. This might be a + * spurious page fault. */ eva = rcr2(); - if (td->td_critnest != 0 || - WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, - "Kernel page fault") != 0) - trap_fatal(frame, eva); - else + if (td->td_md.md_spinlock_count == 0) enable_intr(); } @@ -804,6 +789,50 @@ trap_pfault(frame, usermode, eva) struct thread *td = curthread; struct proc *p = td->td_proc; + if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { + /* + * Due to both processor errata and lazy TLB invalidation when + * access restrictions are removed from virtual pages, memory + * accesses that are allowed by the physical mapping layer may + * nonetheless cause one spurious page fault per virtual page. + * When the thread is executing a "no faulting" section that + * is bracketed by vm_fault_{disable,enable}_pagefaults(), + * every page fault is treated as a spurious page fault, + * unless it accesses the same virtual address as the most + * recent page fault within the same "no faulting" section. + */ + if (td->td_md.md_spurflt_addr != eva || + (td->td_pflags & TDP_RESETSPUR) != 0) { + /* + * Do nothing to the TLB. A stale TLB entry is + * flushed automatically by a page fault. + */ + td->td_md.md_spurflt_addr = eva; + td->td_pflags &= ~TDP_RESETSPUR; + return (0); + } + } else { + /* + * If we get a page fault while in a critical section, then + * it is most likely a fatal kernel page fault. The kernel + * is already going to panic trying to get a sleep lock to + * do the VM lookup, so just consider it a fatal trap so the + * kernel can print out a useful trap message and even get + * to the debugger. + * + * If we get a page fault while holding a non-sleepable + * lock, then it is most likely a fatal kernel page fault. + * If WITNESS is enabled, then it's going to whine about + * bogus LORs with various VM locks, so just skip to the + * fatal trap handling directly. + */ + if (td->td_critnest != 0 || + WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, + "Kernel page fault") != 0) { + trap_fatal(frame, eva); + return (-1); + } + } va = trunc_page(eva); if (va >= KERNBASE) { /* diff --git a/sys/i386/include/proc.h b/sys/i386/include/proc.h index 86e86020b99a..14f03b77825c 100644 --- a/sys/i386/include/proc.h +++ b/sys/i386/include/proc.h @@ -51,6 +51,7 @@ struct proc_ldt { struct mdthread { int md_spinlock_count; /* (k) */ register_t md_saved_flags; /* (k) */ + register_t md_spurflt_addr; /* (k) Spurious page fault address. */ }; struct mdproc { diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c index 97d03a32ea18..f294ce3b9a87 100644 --- a/sys/kern/kern_sysctl.c +++ b/sys/kern/kern_sysctl.c @@ -1294,8 +1294,8 @@ kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp, static int sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) { - int error = 0; size_t i, len, origidx; + int error; origidx = req->oldidx; req->oldidx += l; @@ -1316,10 +1316,14 @@ sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) else { if (i > len - origidx) i = len - origidx; - error = copyout(p, (char *)req->oldptr + origidx, i); + if (req->lock == REQ_WIRED) { + error = copyout_nofault(p, (char *)req->oldptr + + origidx, i); + } else + error = copyout(p, (char *)req->oldptr + origidx, i); + if (error != 0) + return (error); } - if (error) - return (error); if (i < l) return (ENOMEM); return (0); diff --git a/sys/kern/subr_uio.c b/sys/kern/subr_uio.c index 3c7688aaad86..b85e50b88df2 100644 --- a/sys/kern/subr_uio.c +++ b/sys/kern/subr_uio.c @@ -187,8 +187,12 @@ uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault) /* XXX does it make a sense to set TDP_DEADLKTREAT for UIO_SYSSPACE ? */ newflags = TDP_DEADLKTREAT; - if (uio->uio_segflg == UIO_USERSPACE && nofault) - newflags |= TDP_NOFAULTING; + if (uio->uio_segflg == UIO_USERSPACE && nofault) { + /* + * Fail if a non-spurious page fault occurs. + */ + newflags |= TDP_NOFAULTING | TDP_RESETSPUR; + } save = curthread_pflags_set(newflags); while (n > 0 && uio->uio_resid) { diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 78ad04c21910..6dad36b70817 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -417,6 +417,7 @@ do { \ #define TDP_IGNSUSP 0x00800000 /* Permission to ignore the MNTK_SUSPEND* */ #define TDP_AUDITREC 0x01000000 /* Audit record pending on thread */ #define TDP_RFPPWAIT 0x02000000 /* Handle RFPPWAIT on syscall exit */ +#define TDP_RESETSPUR 0x04000000 /* Reset spurious page fault history. */ /* * Reasons that the current thread can not be run yet. diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 8c98b26a0146..8477c3551e0d 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -1468,11 +1468,17 @@ vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage) return i; } +/* + * Block entry into the machine-independent layer's page fault handler by + * the calling thread. Subsequent calls to vm_fault() by that thread will + * return KERN_PROTECTION_FAILURE. Enable machine-dependent handling of + * spurious page faults. + */ int vm_fault_disable_pagefaults(void) { - return (curthread_pflags_set(TDP_NOFAULTING)); + return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR)); } void