Handle spurious page faults that may occur in no-fault sections of the kernel.

When access restrictions are added to a page table entry, we flush the
corresponding virtual address mapping from the TLB.  In contrast, when
access restrictions are removed from a page table entry, we do not
flush the virtual address mapping from the TLB.  This is exactly as
recommended in AMD's documentation.  In effect, when access
restrictions are removed from a page table entry, AMD's MMUs will
transparently refresh a stale TLB entry.  In short, this saves us from
having to perform potentially costly TLB flushes.  Intel's MMUs,
however, are allowed to generate a spurious page fault based upon
the stale TLB entry.  Usually, such spurious page faults are handled
by vm_fault() without incident.  However, when we are executing
no-fault sections of the kernel, we are not allowed to execute
vm_fault().  This change introduces special-case handling for spurious
page faults that occur in no-fault sections of the kernel.
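
For illustration, a minimal sketch of how a no-fault section is
bracketed, using vm_fault_disable_pagefaults() and
vm_fault_enable_pagefaults() from this change; the function name, lock,
and buffers are hypothetical:

/*
 * Sketch: copy data out to user space while holding a lock that must
 * not be held across vm_fault().  Within the bracketed section, a
 * spurious page fault is absorbed by the trap handler, while a genuine
 * fault makes copyout() return EFAULT instead of entering vm_fault().
 */
static int
example_copyout_locked(struct mtx *example_lock, const void *kbuf,
    void *ubuf, size_t len)
{
        int error, save;

        save = vm_fault_disable_pagefaults(); /* TDP_NOFAULTING | TDP_RESETSPUR */
        mtx_lock(example_lock);
        error = copyout(kbuf, ubuf, len);
        mtx_unlock(example_lock);
        vm_fault_enable_pagefaults(save);
        return (error);
}

This is essentially the pattern behind copyout_nofault(), which the
sysctl change below starts using for wired requests.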

In collaboration with:	kib
Tested by:		gibbs (an earlier version)

I would also like to acknowledge Hiroki Sato's assistance in
diagnosing this problem.

MFC after:	1 week
Committed by:	alc
Date:		2012-03-22 04:52:51 +00:00
Commit:		e02fd6b842 (parent 1c754fb497)

8 changed files with 117 additions and 47 deletions

@@ -301,26 +301,6 @@ trap(struct trapframe *frame)
         }
 
         code = frame->tf_err;
-        if (type == T_PAGEFLT) {
-                /*
-                 * If we get a page fault while in a critical section, then
-                 * it is most likely a fatal kernel page fault.  The kernel
-                 * is already going to panic trying to get a sleep lock to
-                 * do the VM lookup, so just consider it a fatal trap so the
-                 * kernel can print out a useful trap message and even get
-                 * to the debugger.
-                 *
-                 * If we get a page fault while holding a non-sleepable
-                 * lock, then it is most likely a fatal kernel page fault.
-                 * If WITNESS is enabled, then it's going to whine about
-                 * bogus LORs with various VM locks, so just skip to the
-                 * fatal trap handling directly.
-                 */
-                if (td->td_critnest != 0 ||
-                    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
-                    "Kernel page fault") != 0)
-                        trap_fatal(frame, frame->tf_addr);
-        }
 
         if (ISPL(frame->tf_cs) == SEL_UPL) {
                 /* user trap */
@@ -653,6 +633,50 @@ trap_pfault(frame, usermode)
         struct proc *p = td->td_proc;
         vm_offset_t eva = frame->tf_addr;
 
+        if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
+                /*
+                 * Due to both processor errata and lazy TLB invalidation when
+                 * access restrictions are removed from virtual pages, memory
+                 * accesses that are allowed by the physical mapping layer may
+                 * nonetheless cause one spurious page fault per virtual page.
+                 * When the thread is executing a "no faulting" section that
+                 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
+                 * every page fault is treated as a spurious page fault,
+                 * unless it accesses the same virtual address as the most
+                 * recent page fault within the same "no faulting" section.
+                 */
+                if (td->td_md.md_spurflt_addr != eva ||
+                    (td->td_pflags & TDP_RESETSPUR) != 0) {
+                        /*
+                         * Do nothing to the TLB.  A stale TLB entry is
+                         * flushed automatically by a page fault.
+                         */
+                        td->td_md.md_spurflt_addr = eva;
+                        td->td_pflags &= ~TDP_RESETSPUR;
+                        return (0);
+                }
+        } else {
+                /*
+                 * If we get a page fault while in a critical section, then
+                 * it is most likely a fatal kernel page fault.  The kernel
+                 * is already going to panic trying to get a sleep lock to
+                 * do the VM lookup, so just consider it a fatal trap so the
+                 * kernel can print out a useful trap message and even get
+                 * to the debugger.
+                 *
+                 * If we get a page fault while holding a non-sleepable
+                 * lock, then it is most likely a fatal kernel page fault.
+                 * If WITNESS is enabled, then it's going to whine about
+                 * bogus LORs with various VM locks, so just skip to the
+                 * fatal trap handling directly.
+                 */
+                if (td->td_critnest != 0 ||
+                    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
+                    "Kernel page fault") != 0) {
+                        trap_fatal(frame, eva);
+                        return (-1);
+                }
+        }
         va = trunc_page(eva);
         if (va >= VM_MIN_KERNEL_ADDRESS) {
                 /*

@@ -46,6 +46,7 @@ struct proc_ldt {
 struct mdthread {
         int     md_spinlock_count;      /* (k) */
         register_t md_saved_flags;      /* (k) */
+        register_t md_spurflt_addr;     /* (k) Spurious page fault address. */
 };
 
 struct mdproc {

@@ -330,28 +330,13 @@ trap(struct trapframe *frame)
                         /*
                          * For some Cyrix CPUs, %cr2 is clobbered by
                          * interrupts.  This problem is worked around by using
                          * an interrupt gate for the pagefault handler.  We
-                         * are finally ready to read %cr2 and then must
-                         * reenable interrupts.
-                         *
-                         * If we get a page fault while in a critical section, then
-                         * it is most likely a fatal kernel page fault.  The kernel
-                         * is already going to panic trying to get a sleep lock to
-                         * do the VM lookup, so just consider it a fatal trap so the
-                         * kernel can print out a useful trap message and even get
-                         * to the debugger.
-                         *
-                         * If we get a page fault while holding a non-sleepable
-                         * lock, then it is most likely a fatal kernel page fault.
-                         * If WITNESS is enabled, then it's going to whine about
-                         * bogus LORs with various VM locks, so just skip to the
-                         * fatal trap handling directly.
+                         * are finally ready to read %cr2 and conditionally
+                         * reenable interrupts.  If we hold a spin lock, then
+                         * we must not reenable interrupts.  This might be a
+                         * spurious page fault.
                          */
                         eva = rcr2();
-                        if (td->td_critnest != 0 ||
-                            WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
-                            "Kernel page fault") != 0)
-                                trap_fatal(frame, eva);
-                        else
+                        if (td->td_md.md_spinlock_count == 0)
                                 enable_intr();
                 }
@@ -804,6 +789,50 @@ trap_pfault(frame, usermode, eva)
         struct thread *td = curthread;
         struct proc *p = td->td_proc;
 
+        if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
+                /*
+                 * Due to both processor errata and lazy TLB invalidation when
+                 * access restrictions are removed from virtual pages, memory
+                 * accesses that are allowed by the physical mapping layer may
+                 * nonetheless cause one spurious page fault per virtual page.
+                 * When the thread is executing a "no faulting" section that
+                 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
+                 * every page fault is treated as a spurious page fault,
+                 * unless it accesses the same virtual address as the most
+                 * recent page fault within the same "no faulting" section.
+                 */
+                if (td->td_md.md_spurflt_addr != eva ||
+                    (td->td_pflags & TDP_RESETSPUR) != 0) {
+                        /*
+                         * Do nothing to the TLB.  A stale TLB entry is
+                         * flushed automatically by a page fault.
+                         */
+                        td->td_md.md_spurflt_addr = eva;
+                        td->td_pflags &= ~TDP_RESETSPUR;
+                        return (0);
+                }
+        } else {
+                /*
+                 * If we get a page fault while in a critical section, then
+                 * it is most likely a fatal kernel page fault.  The kernel
+                 * is already going to panic trying to get a sleep lock to
+                 * do the VM lookup, so just consider it a fatal trap so the
+                 * kernel can print out a useful trap message and even get
+                 * to the debugger.
+                 *
+                 * If we get a page fault while holding a non-sleepable
+                 * lock, then it is most likely a fatal kernel page fault.
+                 * If WITNESS is enabled, then it's going to whine about
+                 * bogus LORs with various VM locks, so just skip to the
+                 * fatal trap handling directly.
+                 */
+                if (td->td_critnest != 0 ||
+                    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
+                    "Kernel page fault") != 0) {
+                        trap_fatal(frame, eva);
+                        return (-1);
+                }
+        }
         va = trunc_page(eva);
         if (va >= KERNBASE) {
                 /*

@@ -51,6 +51,7 @@ struct proc_ldt {
 struct mdthread {
         int     md_spinlock_count;      /* (k) */
         register_t md_saved_flags;      /* (k) */
+        register_t md_spurflt_addr;     /* (k) Spurious page fault address. */
 };
 
 struct mdproc {

@@ -1294,8 +1294,8 @@ kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp,
 static int
 sysctl_old_user(struct sysctl_req *req, const void *p, size_t l)
 {
-        int error = 0;
         size_t i, len, origidx;
+        int error;
 
         origidx = req->oldidx;
         req->oldidx += l;
@@ -1316,10 +1316,14 @@ sysctl_old_user(struct sysctl_req *req, const void *p, size_t l)
         else {
                 if (i > len - origidx)
                         i = len - origidx;
-                error = copyout(p, (char *)req->oldptr + origidx, i);
+                if (req->lock == REQ_WIRED) {
+                        error = copyout_nofault(p, (char *)req->oldptr +
+                            origidx, i);
+                } else
+                        error = copyout(p, (char *)req->oldptr + origidx, i);
+                if (error != 0)
+                        return (error);
         }
-        if (error)
-                return (error);
         if (i < l)
                 return (ENOMEM);
         return (0);
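
The REQ_WIRED branch above pairs with sysctl_wire_old_buffer(), which
wires the user's old buffer and sets req->lock to REQ_WIRED.  A hedged
sketch of a handler that relies on this to copy out while holding a
mutex; the handler, mutex, and counter are hypothetical:

static struct mtx example_mtx;          /* hypothetical */
static int example_counter;             /* hypothetical */

static int
sysctl_example(SYSCTL_HANDLER_ARGS)
{
        int error, value;

        /*
         * Wire the old buffer first; SYSCTL_OUT() then goes through
         * copyout_nofault(), so the copy cannot sleep in vm_fault()
         * while example_mtx is held.
         */
        error = sysctl_wire_old_buffer(req, sizeof(value));
        if (error != 0)
                return (error);
        mtx_lock(&example_mtx);
        value = example_counter;
        error = SYSCTL_OUT(req, &value, sizeof(value));
        mtx_unlock(&example_mtx);
        return (error);
}

For a single int the copy could simply be done after unlocking; the
wired, no-fault copy matters when a consistent snapshot of a larger
structure must be exported under the lock.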

@@ -187,8 +187,12 @@ uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault)
 
         /* XXX does it make a sense to set TDP_DEADLKTREAT for UIO_SYSSPACE ? */
         newflags = TDP_DEADLKTREAT;
-        if (uio->uio_segflg == UIO_USERSPACE && nofault)
-                newflags |= TDP_NOFAULTING;
+        if (uio->uio_segflg == UIO_USERSPACE && nofault) {
+                /*
+                 * Fail if a non-spurious page fault occurs.
+                 */
+                newflags |= TDP_NOFAULTING | TDP_RESETSPUR;
+        }
         save = curthread_pflags_set(newflags);
 
         while (n > 0 && uio->uio_resid) {
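
The TDP_RESETSPUR addition above reaches uiomove_faultflag()'s nofault
callers such as uiomove_nofault().  A hedged sketch of the pattern such
a caller might use; the softc, lock, and buffer are hypothetical, and
whether a consumer retries or fails on EFAULT is its own choice:

struct example_softc {                  /* hypothetical */
        struct rwlock   sc_lock;
        void            *sc_buf;
        int             sc_buflen;
};

static int
example_read_locked(struct example_softc *sc, struct uio *uio)
{
        int error;

        rw_rlock(&sc->sc_lock);
        /*
         * uiomove_nofault() runs with TDP_NOFAULTING | TDP_RESETSPUR
         * set: a spurious page fault is retried transparently by the
         * trap handler, while a genuine fault returns EFAULT instead
         * of sleeping in vm_fault() with sc_lock held.
         */
        error = uiomove_nofault(sc->sc_buf, sc->sc_buflen, uio);
        rw_runlock(&sc->sc_lock);
        return (error);
}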

@@ -417,6 +417,7 @@ do { \
 #define TDP_IGNSUSP     0x00800000 /* Permission to ignore the MNTK_SUSPEND* */
 #define TDP_AUDITREC    0x01000000 /* Audit record pending on thread */
 #define TDP_RFPPWAIT    0x02000000 /* Handle RFPPWAIT on syscall exit */
+#define TDP_RESETSPUR   0x04000000 /* Reset spurious page fault history. */
 
 /*
  * Reasons that the current thread can not be run yet.

@@ -1468,11 +1468,17 @@ vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage)
         return i;
 }
 
+/*
+ * Block entry into the machine-independent layer's page fault handler by
+ * the calling thread.  Subsequent calls to vm_fault() by that thread will
+ * return KERN_PROTECTION_FAILURE.  Enable machine-dependent handling of
+ * spurious page faults.
+ */
 int
 vm_fault_disable_pagefaults(void)
 {
 
-        return (curthread_pflags_set(TDP_NOFAULTING));
+        return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR));
 }
 
 void
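
Because vm_fault_disable_pagefaults() returns a save token computed by
curthread_pflags_set(), no-fault sections nest: restoring the inner
token does not clear flags owned by an outer section.  A hedged sketch;
the function itself is hypothetical:

void
example_nested(void)
{
        int inner, outer;

        outer = vm_fault_disable_pagefaults();
        inner = vm_fault_disable_pagefaults();  /* flags already set */
        vm_fault_enable_pagefaults(inner);      /* TDP_NOFAULTING remains set */
        vm_fault_enable_pagefaults(outer);      /* flags now cleared */
}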