Handle spurious page faults that may occur in no-fault sections of the
kernel.

When access restrictions are added to a page table entry, we flush the
corresponding virtual address mapping from the TLB.  In contrast, when
access restrictions are removed from a page table entry, we do not flush
the mapping, exactly as recommended in AMD's documentation.  In effect,
AMD's MMUs transparently refresh the stale TLB entry, which saves us
from having to perform potentially costly TLB flushes.  Intel's MMUs,
however, are allowed to generate a spurious page fault based upon the
stale TLB entry.  Usually, such spurious page faults are handled by
vm_fault() without incident.  However, when we are executing no-fault
sections of the kernel, we are not allowed to execute vm_fault().  This
change introduces special-case handling for spurious page faults that
occur in no-fault sections of the kernel.

In collaboration with:	kib
Tested by:	gibbs (an earlier version)

I would also like to acknowledge Hiroki Sato's assistance in diagnosing
this problem.

MFC after:	1 week
commit e02fd6b842
parent 1c754fb497
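For context, the "no-fault" sections named above are the brackets used by the
kernel's nofault copy routines.  A minimal sketch of the pattern, modeled on
sys/kern/subr_uio.c's copyout_nofault() (the body is simplified here for
illustration):

	/*
	 * Sketch of a "no-fault" section.  Between disable and enable,
	 * vm_fault() refuses to run, so any page fault taken here must
	 * either be swallowed as spurious by the trap handler or fail
	 * the copy with an error.
	 */
	int
	copyout_nofault(const void *kaddr, void *udaddr, size_t len)
	{
		int error, save;

		save = vm_fault_disable_pagefaults();	/* sets TDP_NOFAULTING */
		error = copyout(kaddr, udaddr, len);
		vm_fault_enable_pagefaults(save);
		return (error);
	}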
sys/amd64/amd64/trap.c
@@ -301,26 +301,6 @@ trap(struct trapframe *frame)
 	}
 
 	code = frame->tf_err;
-	if (type == T_PAGEFLT) {
-		/*
-		 * If we get a page fault while in a critical section, then
-		 * it is most likely a fatal kernel page fault.  The kernel
-		 * is already going to panic trying to get a sleep lock to
-		 * do the VM lookup, so just consider it a fatal trap so the
-		 * kernel can print out a useful trap message and even get
-		 * to the debugger.
-		 *
-		 * If we get a page fault while holding a non-sleepable
-		 * lock, then it is most likely a fatal kernel page fault.
-		 * If WITNESS is enabled, then it's going to whine about
-		 * bogus LORs with various VM locks, so just skip to the
-		 * fatal trap handling directly.
-		 */
-		if (td->td_critnest != 0 ||
-		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
-		    "Kernel page fault") != 0)
-			trap_fatal(frame, frame->tf_addr);
-	}
 
 	if (ISPL(frame->tf_cs) == SEL_UPL) {
 		/* user trap */
@@ -653,6 +633,50 @@ trap_pfault(frame, usermode)
 	struct proc *p = td->td_proc;
 	vm_offset_t eva = frame->tf_addr;
 
+	if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
+		/*
+		 * Due to both processor errata and lazy TLB invalidation when
+		 * access restrictions are removed from virtual pages, memory
+		 * accesses that are allowed by the physical mapping layer may
+		 * nonetheless cause one spurious page fault per virtual page.
+		 * When the thread is executing a "no faulting" section that
+		 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
+		 * every page fault is treated as a spurious page fault,
+		 * unless it accesses the same virtual address as the most
+		 * recent page fault within the same "no faulting" section.
+		 */
+		if (td->td_md.md_spurflt_addr != eva ||
+		    (td->td_pflags & TDP_RESETSPUR) != 0) {
+			/*
+			 * Do nothing to the TLB.  A stale TLB entry is
+			 * flushed automatically by a page fault.
+			 */
+			td->td_md.md_spurflt_addr = eva;
+			td->td_pflags &= ~TDP_RESETSPUR;
+			return (0);
+		}
+	} else {
+		/*
+		 * If we get a page fault while in a critical section, then
+		 * it is most likely a fatal kernel page fault.  The kernel
+		 * is already going to panic trying to get a sleep lock to
+		 * do the VM lookup, so just consider it a fatal trap so the
+		 * kernel can print out a useful trap message and even get
+		 * to the debugger.
+		 *
+		 * If we get a page fault while holding a non-sleepable
+		 * lock, then it is most likely a fatal kernel page fault.
+		 * If WITNESS is enabled, then it's going to whine about
+		 * bogus LORs with various VM locks, so just skip to the
+		 * fatal trap handling directly.
+		 */
+		if (td->td_critnest != 0 ||
+		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
+		    "Kernel page fault") != 0) {
+			trap_fatal(frame, eva);
+			return (-1);
+		}
+	}
 	va = trunc_page(eva);
 	if (va >= VM_MIN_KERNEL_ADDRESS) {
 		/*
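The heuristic added to trap_pfault() above can be read as a small per-thread
state machine.  A standalone model, with illustrative names standing in for
md_spurflt_addr and TDP_RESETSPUR:

	#include <stdbool.h>
	#include <stdint.h>

	struct nofault_state {
		uintptr_t	spurflt_addr;	/* like md_spurflt_addr */
		bool		reset;		/* like TDP_RESETSPUR */
	};

	/*
	 * Returns true if the fault at 'addr' should be retried as
	 * spurious: the first fault on any given address within a
	 * no-fault section is presumed to be a stale TLB entry; a second
	 * fault on the same address proves the access is genuinely
	 * invalid.
	 */
	static bool
	fault_is_spurious(struct nofault_state *st, uintptr_t addr)
	{
		if (st->spurflt_addr != addr || st->reset) {
			st->spurflt_addr = addr;
			st->reset = false;
			return (true);
		}
		return (false);
	}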
sys/amd64/include/proc.h
@@ -46,6 +46,7 @@ struct proc_ldt {
 struct mdthread {
 	int	md_spinlock_count;	/* (k) */
 	register_t md_saved_flags;	/* (k) */
+	register_t md_spurflt_addr;	/* (k) Spurious page fault address. */
 };
 
 struct mdproc {
sys/i386/i386/trap.c
@@ -330,28 +330,13 @@ trap(struct trapframe *frame)
 			 * For some Cyrix CPUs, %cr2 is clobbered by
 			 * interrupts.  This problem is worked around by using
 			 * an interrupt gate for the pagefault handler.  We
-			 * are finally ready to read %cr2 and then must
-			 * reenable interrupts.
-			 *
-			 * If we get a page fault while in a critical section, then
-			 * it is most likely a fatal kernel page fault. The kernel
-			 * is already going to panic trying to get a sleep lock to
-			 * do the VM lookup, so just consider it a fatal trap so the
-			 * kernel can print out a useful trap message and even get
-			 * to the debugger.
-			 *
-			 * If we get a page fault while holding a non-sleepable
-			 * lock, then it is most likely a fatal kernel page fault.
-			 * If WITNESS is enabled, then it's going to whine about
-			 * bogus LORs with various VM locks, so just skip to the
-			 * fatal trap handling directly.
+			 * are finally ready to read %cr2 and conditionally
+			 * reenable interrupts.  If we hold a spin lock, then
+			 * we must not reenable interrupts.  This might be a
+			 * spurious page fault.
 			 */
 			eva = rcr2();
-			if (td->td_critnest != 0 ||
-			    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
-			    "Kernel page fault") != 0)
-				trap_fatal(frame, eva);
-			else
 			if (td->td_md.md_spinlock_count == 0)
 				enable_intr();
 		}
@@ -804,6 +789,50 @@ trap_pfault(frame, usermode, eva)
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 
+	if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
+		/*
+		 * Due to both processor errata and lazy TLB invalidation when
+		 * access restrictions are removed from virtual pages, memory
+		 * accesses that are allowed by the physical mapping layer may
+		 * nonetheless cause one spurious page fault per virtual page.
+		 * When the thread is executing a "no faulting" section that
+		 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
+		 * every page fault is treated as a spurious page fault,
+		 * unless it accesses the same virtual address as the most
+		 * recent page fault within the same "no faulting" section.
+		 */
+		if (td->td_md.md_spurflt_addr != eva ||
+		    (td->td_pflags & TDP_RESETSPUR) != 0) {
+			/*
+			 * Do nothing to the TLB.  A stale TLB entry is
+			 * flushed automatically by a page fault.
+			 */
+			td->td_md.md_spurflt_addr = eva;
+			td->td_pflags &= ~TDP_RESETSPUR;
+			return (0);
+		}
+	} else {
+		/*
+		 * If we get a page fault while in a critical section, then
+		 * it is most likely a fatal kernel page fault.  The kernel
+		 * is already going to panic trying to get a sleep lock to
+		 * do the VM lookup, so just consider it a fatal trap so the
+		 * kernel can print out a useful trap message and even get
+		 * to the debugger.
+		 *
+		 * If we get a page fault while holding a non-sleepable
+		 * lock, then it is most likely a fatal kernel page fault.
+		 * If WITNESS is enabled, then it's going to whine about
+		 * bogus LORs with various VM locks, so just skip to the
+		 * fatal trap handling directly.
+		 */
+		if (td->td_critnest != 0 ||
+		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
+		    "Kernel page fault") != 0) {
+			trap_fatal(frame, eva);
+			return (-1);
+		}
+	}
 	va = trunc_page(eva);
 	if (va >= KERNBASE) {
 		/*
sys/i386/include/proc.h
@@ -51,6 +51,7 @@ struct proc_ldt {
 struct mdthread {
 	int	md_spinlock_count;	/* (k) */
 	register_t md_saved_flags;	/* (k) */
+	register_t md_spurflt_addr;	/* (k) Spurious page fault address. */
 };
 
 struct mdproc {
sys/kern/kern_sysctl.c
@@ -1294,8 +1294,8 @@ kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp,
 static int
 sysctl_old_user(struct sysctl_req *req, const void *p, size_t l)
 {
-	int error = 0;
 	size_t i, len, origidx;
+	int error;
 
 	origidx = req->oldidx;
 	req->oldidx += l;
@@ -1316,10 +1316,14 @@ sysctl_old_user(struct sysctl_req *req, const void *p, size_t l)
 	else {
 		if (i > len - origidx)
 			i = len - origidx;
-		error = copyout(p, (char *)req->oldptr + origidx, i);
+		if (req->lock == REQ_WIRED) {
+			error = copyout_nofault(p, (char *)req->oldptr +
+			    origidx, i);
+		} else
+			error = copyout(p, (char *)req->oldptr + origidx, i);
+		if (error != 0)
+			return (error);
 	}
-	if (error)
-		return (error);
 	if (i < l)
 		return (ENOMEM);
 	return (0);
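The REQ_WIRED branch above is safe only because a handler wires the old buffer
before taking non-sleepable locks.  A hypothetical handler sketch;
sysctl_wire_old_buffer() and SYSCTL_OUT() are the real APIs, while
sysctl_example() itself is invented for illustration:

	/*
	 * Wiring the old buffer up front faults in (and wires) the user
	 * pages while sleeping is still permitted, so the later
	 * SYSCTL_OUT() -> sysctl_old_user() copy can take the
	 * copyout_nofault() path without ever entering vm_fault().
	 */
	static int
	sysctl_example(SYSCTL_HANDLER_ARGS)
	{
		int error, value;

		error = sysctl_wire_old_buffer(req, sizeof(value));
		if (error != 0)
			return (error);
		/* ... acquire a non-sleepable lock, compute 'value' ... */
		value = 42;
		error = SYSCTL_OUT(req, &value, sizeof(value));
		/* ... drop the lock ... */
		return (error);
	}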
sys/kern/subr_uio.c
@@ -187,8 +187,12 @@ uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault)
 
 	/* XXX does it make a sense to set TDP_DEADLKTREAT for UIO_SYSSPACE ? */
 	newflags = TDP_DEADLKTREAT;
-	if (uio->uio_segflg == UIO_USERSPACE && nofault)
-		newflags |= TDP_NOFAULTING;
+	if (uio->uio_segflg == UIO_USERSPACE && nofault) {
+		/*
+		 * Fail if a non-spurious page fault occurs.
+		 */
+		newflags |= TDP_NOFAULTING | TDP_RESETSPUR;
+	}
 	save = curthread_pflags_set(newflags);
 
 	while (n > 0 && uio->uio_resid) {
sys/sys/proc.h
@@ -417,6 +417,7 @@ do { \
 #define	TDP_IGNSUSP	0x00800000 /* Permission to ignore the MNTK_SUSPEND* */
 #define	TDP_AUDITREC	0x01000000 /* Audit record pending on thread */
 #define	TDP_RFPPWAIT	0x02000000 /* Handle RFPPWAIT on syscall exit */
+#define	TDP_RESETSPUR	0x04000000 /* Reset spurious page fault history. */
 
 /*
  * Reasons that the current thread can not be run yet.
sys/vm/vm_fault.c
@@ -1468,11 +1468,17 @@ vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage)
 	return i;
 }
 
+/*
+ * Block entry into the machine-independent layer's page fault handler by
+ * the calling thread.  Subsequent calls to vm_fault() by that thread will
+ * return KERN_PROTECTION_FAILURE.  Enable machine-dependent handling of
+ * spurious page faults.
+ */
 int
 vm_fault_disable_pagefaults(void)
 {
 
-	return (curthread_pflags_set(TDP_NOFAULTING));
+	return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR));
 }
 
 void
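Since vm_fault_disable_pagefaults() now also sets TDP_RESETSPUR, every newly
entered no-fault section starts with fresh spurious-fault history.  A sketch
of the caller-side pairing discipline (outer() is illustrative; only the
vm_fault_{disable,enable}_pagefaults() pair is the real API):

	/*
	 * The save token returned by disable records the caller's prior
	 * flag state, so sections can be balanced even when they nest,
	 * and a fault history left over from an earlier section can
	 * never suppress a real fault in a new one.
	 */
	void
	outer(void)
	{
		int save1, save2;

		save1 = vm_fault_disable_pagefaults();
		/* ... no-fault accesses ... */
		save2 = vm_fault_disable_pagefaults();	/* history reset */
		/* ... nested no-fault accesses ... */
		vm_fault_enable_pagefaults(save2);	/* still in outer section */
		vm_fault_enable_pagefaults(save1);	/* faulting allowed again */
	}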