From 5730afc9b6405f9325dfbbf7164dd6c818191ff5 Mon Sep 17 00:00:00 2001
From: Alan Cox <alc@FreeBSD.org>
Date: Thu, 22 Mar 2012 04:52:51 +0000
Subject: [PATCH] Handle spurious page faults that may occur in no-fault
 sections of the kernel.

When access restrictions are added to a page table entry, we flush the
corresponding virtual address mapping from the TLB.  In contrast, when
access restrictions are removed from a page table entry, we do not
flush the virtual address mapping from the TLB.  This is exactly as
recommended in AMD's documentation.  In effect, when access
restrictions are removed from a page table entry, AMD's MMUs will
transparently refresh a stale TLB entry.  In short, this saves us from
having to perform potentially costly TLB flushes.  In contrast,
Intel's MMUs are allowed to generate a spurious page fault based upon
the stale TLB entry.  Usually, such spurious page faults are handled
by vm_fault() without incident.  However, when we are executing
no-fault sections of the kernel, we are not allowed to execute
vm_fault().  This change introduces special-case handling for spurious
page faults that occur in no-fault sections of the kernel.

In collaboration with:	kib
Tested by:		gibbs (an earlier version)

I would also like to acknowledge Hiroki Sato's assistance in
diagnosing this problem.

MFC after:	1 week
---
 sys/amd64/amd64/trap.c   | 64 +++++++++++++++++++++++++------------
 sys/amd64/include/proc.h |  1 +
 sys/i386/i386/trap.c     | 69 ++++++++++++++++++++++++++++------------
 sys/i386/include/proc.h  |  1 +
 sys/kern/kern_sysctl.c   | 12 ++++---
 sys/kern/subr_uio.c      |  8 +++--
 sys/sys/proc.h           |  1 +
 sys/vm/vm_fault.c        |  8 ++++-
 8 files changed, 117 insertions(+), 47 deletions(-)

diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c
index 28bf79d77103..a685f1956453 100644
--- a/sys/amd64/amd64/trap.c
+++ b/sys/amd64/amd64/trap.c
@@ -301,26 +301,6 @@ trap(struct trapframe *frame)
 	}
 
 	code = frame->tf_err;
-	if (type == T_PAGEFLT) {
-		/*
-		 * If we get a page fault while in a critical section, then
-		 * it is most likely a fatal kernel page fault.  The kernel
-		 * is already going to panic trying to get a sleep lock to
-		 * do the VM lookup, so just consider it a fatal trap so the
-		 * kernel can print out a useful trap message and even get
-		 * to the debugger.
-		 *
-		 * If we get a page fault while holding a non-sleepable
-		 * lock, then it is most likely a fatal kernel page fault.
-		 * If WITNESS is enabled, then it's going to whine about
-		 * bogus LORs with various VM locks, so just skip to the
-		 * fatal trap handling directly.
-		 */
-		if (td->td_critnest != 0 ||
-		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
-		    "Kernel page fault") != 0)
-			trap_fatal(frame, frame->tf_addr);
-	}
 
         if (ISPL(frame->tf_cs) == SEL_UPL) {
 		/* user trap */
@@ -653,6 +633,50 @@ trap_pfault(frame, usermode)
 	struct proc *p = td->td_proc;
 	vm_offset_t eva = frame->tf_addr;
 
+	if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
+		/*
+		 * Due to both processor errata and lazy TLB invalidation when
+		 * access restrictions are removed from virtual pages, memory
+		 * accesses that are allowed by the physical mapping layer may
+		 * nonetheless cause one spurious page fault per virtual page. 
+		 * When the thread is executing a "no faulting" section that
+		 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
+		 * every page fault is treated as a spurious page fault,
+		 * unless it accesses the same virtual address as the most
+		 * recent page fault within the same "no faulting" section.
+		 */
+		if (td->td_md.md_spurflt_addr != eva ||
+		    (td->td_pflags & TDP_RESETSPUR) != 0) {
+			/*
+			 * Do nothing to the TLB.  A stale TLB entry is
+			 * flushed automatically by a page fault.
+			 */
+			td->td_md.md_spurflt_addr = eva;
+			td->td_pflags &= ~TDP_RESETSPUR;
+			return (0);
+		}
+	} else {
+		/*
+		 * If we get a page fault while in a critical section, then
+		 * it is most likely a fatal kernel page fault.  The kernel
+		 * is already going to panic trying to get a sleep lock to
+		 * do the VM lookup, so just consider it a fatal trap so the
+		 * kernel can print out a useful trap message and even get
+		 * to the debugger.
+		 *
+		 * If we get a page fault while holding a non-sleepable
+		 * lock, then it is most likely a fatal kernel page fault.
+		 * If WITNESS is enabled, then it's going to whine about
+		 * bogus LORs with various VM locks, so just skip to the
+		 * fatal trap handling directly.
+		 */
+		if (td->td_critnest != 0 ||
+		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
+		    "Kernel page fault") != 0) {
+			trap_fatal(frame, eva);
+			return (-1);
+		}
+	}
 	va = trunc_page(eva);
 	if (va >= VM_MIN_KERNEL_ADDRESS) {
 		/*
diff --git a/sys/amd64/include/proc.h b/sys/amd64/include/proc.h
index 5207d89acede..14585fbffe1c 100644
--- a/sys/amd64/include/proc.h
+++ b/sys/amd64/include/proc.h
@@ -46,6 +46,7 @@ struct proc_ldt {
 struct mdthread {
 	int	md_spinlock_count;	/* (k) */
 	register_t md_saved_flags;	/* (k) */
+	register_t md_spurflt_addr;	/* (k) Spurious page fault address. */
 };
 
 struct mdproc {
diff --git a/sys/i386/i386/trap.c b/sys/i386/i386/trap.c
index 05572b97b7ee..005646dc17a3 100644
--- a/sys/i386/i386/trap.c
+++ b/sys/i386/i386/trap.c
@@ -330,28 +330,13 @@ trap(struct trapframe *frame)
 		 * For some Cyrix CPUs, %cr2 is clobbered by
 		 * interrupts.  This problem is worked around by using
 		 * an interrupt gate for the pagefault handler.  We
-		 * are finally ready to read %cr2 and then must
-		 * reenable interrupts.
-		 *
-		 * If we get a page fault while in a critical section, then
-		 * it is most likely a fatal kernel page fault.  The kernel
-		 * is already going to panic trying to get a sleep lock to
-		 * do the VM lookup, so just consider it a fatal trap so the
-		 * kernel can print out a useful trap message and even get
-		 * to the debugger.
-		 *
-		 * If we get a page fault while holding a non-sleepable
-		 * lock, then it is most likely a fatal kernel page fault.
-		 * If WITNESS is enabled, then it's going to whine about
-		 * bogus LORs with various VM locks, so just skip to the
-		 * fatal trap handling directly.
+		 * are finally ready to read %cr2 and conditionally
+		 * reenable interrupts.  If we hold a spin lock, then
+		 * we must not reenable interrupts.  This might be a
+		 * spurious page fault.
 		 */
 		eva = rcr2();
-		if (td->td_critnest != 0 ||
-		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
-		    "Kernel page fault") != 0)
-			trap_fatal(frame, eva);
-		else
+		if (td->td_md.md_spinlock_count == 0)
 			enable_intr();
 	}
 
@@ -804,6 +789,50 @@ trap_pfault(frame, usermode, eva)
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 
+	if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
+		/*
+		 * Due to both processor errata and lazy TLB invalidation when
+		 * access restrictions are removed from virtual pages, memory
+		 * accesses that are allowed by the physical mapping layer may
+		 * nonetheless cause one spurious page fault per virtual page. 
+		 * When the thread is executing a "no faulting" section that
+		 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
+		 * every page fault is treated as a spurious page fault,
+		 * unless it accesses the same virtual address as the most
+		 * recent page fault within the same "no faulting" section.
+		 */
+		if (td->td_md.md_spurflt_addr != eva ||
+		    (td->td_pflags & TDP_RESETSPUR) != 0) {
+			/*
+			 * Do nothing to the TLB.  A stale TLB entry is
+			 * flushed automatically by a page fault.
+			 */
+			td->td_md.md_spurflt_addr = eva;
+			td->td_pflags &= ~TDP_RESETSPUR;
+			return (0);
+		}
+	} else {
+		/*
+		 * If we get a page fault while in a critical section, then
+		 * it is most likely a fatal kernel page fault.  The kernel
+		 * is already going to panic trying to get a sleep lock to
+		 * do the VM lookup, so just consider it a fatal trap so the
+		 * kernel can print out a useful trap message and even get
+		 * to the debugger.
+		 *
+		 * If we get a page fault while holding a non-sleepable
+		 * lock, then it is most likely a fatal kernel page fault.
+		 * If WITNESS is enabled, then it's going to whine about
+		 * bogus LORs with various VM locks, so just skip to the
+		 * fatal trap handling directly.
+		 */
+		if (td->td_critnest != 0 ||
+		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
+		    "Kernel page fault") != 0) {
+			trap_fatal(frame, eva);
+			return (-1);
+		}
+	}
 	va = trunc_page(eva);
 	if (va >= KERNBASE) {
 		/*
diff --git a/sys/i386/include/proc.h b/sys/i386/include/proc.h
index 86e86020b99a..14f03b77825c 100644
--- a/sys/i386/include/proc.h
+++ b/sys/i386/include/proc.h
@@ -51,6 +51,7 @@ struct proc_ldt {
 struct mdthread {
 	int	md_spinlock_count;	/* (k) */
 	register_t md_saved_flags;	/* (k) */
+	register_t md_spurflt_addr;	/* (k) Spurious page fault address. */
 };
 
 struct mdproc {
diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c
index 97d03a32ea18..f294ce3b9a87 100644
--- a/sys/kern/kern_sysctl.c
+++ b/sys/kern/kern_sysctl.c
@@ -1294,8 +1294,8 @@ kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp,
 static int
 sysctl_old_user(struct sysctl_req *req, const void *p, size_t l)
 {
-	int error = 0;
 	size_t i, len, origidx;
+	int error;
 
 	origidx = req->oldidx;
 	req->oldidx += l;
@@ -1316,10 +1316,14 @@ sysctl_old_user(struct sysctl_req *req, const void *p, size_t l)
 	else {
 		if (i > len - origidx)
 			i = len - origidx;
-		error = copyout(p, (char *)req->oldptr + origidx, i);
+		if (req->lock == REQ_WIRED) {
+			error = copyout_nofault(p, (char *)req->oldptr +
+			    origidx, i);
+		} else
+			error = copyout(p, (char *)req->oldptr + origidx, i);
+		if (error != 0)
+			return (error);
 	}
-	if (error)
-		return (error);
 	if (i < l)
 		return (ENOMEM);
 	return (0);
diff --git a/sys/kern/subr_uio.c b/sys/kern/subr_uio.c
index 3c7688aaad86..b85e50b88df2 100644
--- a/sys/kern/subr_uio.c
+++ b/sys/kern/subr_uio.c
@@ -187,8 +187,12 @@ uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault)
 
 	/* XXX does it make a sense to set TDP_DEADLKTREAT for UIO_SYSSPACE ? */
 	newflags = TDP_DEADLKTREAT;
-	if (uio->uio_segflg == UIO_USERSPACE && nofault)
-		newflags |= TDP_NOFAULTING;
+	if (uio->uio_segflg == UIO_USERSPACE && nofault) {
+		/*
+		 * Fail if a non-spurious page fault occurs.
+		 */
+		newflags |= TDP_NOFAULTING | TDP_RESETSPUR;
+	}
 	save = curthread_pflags_set(newflags);
 
 	while (n > 0 && uio->uio_resid) {
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 78ad04c21910..6dad36b70817 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -417,6 +417,7 @@ do {									\
 #define	TDP_IGNSUSP	0x00800000 /* Permission to ignore the MNTK_SUSPEND* */
 #define	TDP_AUDITREC	0x01000000 /* Audit record pending on thread */
 #define	TDP_RFPPWAIT	0x02000000 /* Handle RFPPWAIT on syscall exit */
+#define	TDP_RESETSPUR	0x04000000 /* Reset spurious page fault history. */
 
 /*
  * Reasons that the current thread can not be run yet.
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 8c98b26a0146..8477c3551e0d 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -1468,11 +1468,17 @@ vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage)
 	return i;
 }
 
+/*
+ * Block entry into the machine-independent layer's page fault handler by
+ * the calling thread.  Subsequent calls to vm_fault() by that thread will
+ * return KERN_PROTECTION_FAILURE.  Enable machine-dependent handling of
+ * spurious page faults. 
+ */
 int
 vm_fault_disable_pagefaults(void)
 {
 
-	return (curthread_pflags_set(TDP_NOFAULTING));
+	return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR));
 }
 
 void