diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index 08f8fe8291b3..57294e1e4b2c 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -545,6 +545,9 @@ isa/syscons_isa.c		optional	sc
 isa/vga_isa.c			optional	vga
 kern/kern_clocksource.c		standard
 kern/link_elf_obj.c		standard
+libkern/x86/crc32_sse42.c	standard
+libkern/memmove.c		standard
+libkern/memset.c		standard
 #
 # IA32 binary support
 #
@@ -602,14 +605,6 @@ compat/ndis/subr_pe.c		optional	ndisapi pci
 compat/ndis/subr_usbd.c		optional	ndisapi pci
 compat/ndis/winx64_wrap.S	optional	ndisapi pci
 #
-crc32_sse42.o			standard				\
-	dependency	"$S/libkern/x86/crc32_sse42.c"			\
-	compile-with	"${CC} -c ${CFLAGS:N-nostdinc} ${WERROR} ${PROF} -msse4 ${.IMPSRC}" \
-	no-implicit-rule						\
-	clean		"crc32_sse42.o"
-libkern/memmove.c		standard
-libkern/memset.c		standard
-#
 # x86 real mode BIOS emulator, required by dpms/pci/vesa
 #
 compat/x86bios/x86bios.c	optional x86bios | dpms | pci | vesa
diff --git a/sys/conf/files.i386 b/sys/conf/files.i386
index 2fc64ef6b6ef..a248c22f0be8 100644
--- a/sys/conf/files.i386
+++ b/sys/conf/files.i386
@@ -524,11 +524,6 @@ kern/kern_clocksource.c		standard
 kern/imgact_aout.c		optional compat_aout
 kern/imgact_gzip.c		optional gzip
 kern/subr_sfbuf.c		standard
-crc32_sse42.o			standard				\
-	dependency	"$S/libkern/x86/crc32_sse42.c"			\
-	compile-with	"${CC} -c ${CFLAGS:N-nostdinc} ${WERROR} ${PROF} -msse4 ${.IMPSRC}" \
-	no-implicit-rule						\
-	clean		"crc32_sse42.o"
 libkern/divdi3.c		standard
 libkern/ffsll.c			standard
 libkern/flsll.c			standard
@@ -539,6 +534,7 @@ libkern/qdivrem.c		standard
 libkern/ucmpdi2.c		standard
 libkern/udivdi3.c		standard
 libkern/umoddi3.c		standard
+libkern/x86/crc32_sse42.c	standard
 i386/xbox/xbox.c		optional xbox
 i386/xbox/xboxfb.c		optional xboxfb
 dev/fb/boot_font.c		optional xboxfb
diff --git a/sys/libkern/x86/crc32_sse42.c b/sys/libkern/x86/crc32_sse42.c
index afd8f19cbdf4..f56a10e4542c 100644
--- a/sys/libkern/x86/crc32_sse42.c
+++ b/sys/libkern/x86/crc32_sse42.c
@@ -31,14 +31,40 @@ __FBSDID("$FreeBSD$");
  */
 #ifdef USERSPACE_TESTING
 #include <stdint.h>
+#include <stdlib.h>
 #else
 #include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/libkern.h>
 #include <sys/systm.h>
+#include <sys/kernel.h>
 #endif
 
-#include <nmmintrin.h>
+static __inline uint32_t
+_mm_crc32_u8(uint32_t x, uint8_t y)
+{
+	/*
+	 * Fold byte y into crc x.  clang (at least 3.9.[0-1]) pessimizes
+	 * "rm" (y) and "m" (y) significantly and "r" (y) a lot by copying
+	 * y to a different local variable (on the stack or in a register),
+	 * so use only "r" (y).  This costs a register and an instruction
+	 * but not a uop.
+	 */
+	__asm("crc32b %1,%0" : "+r" (x) : "r" (y));
+	return (x);
+}
+
+static __inline uint32_t
+_mm_crc32_u32(uint32_t x, uint32_t y)
+{
+	__asm("crc32l %1,%0" : "+r" (x) : "r" (y)); /* fold 4-byte y into x */
+	return (x);
+}
+
+static __inline uint64_t
+_mm_crc32_u64(uint64_t x, uint64_t y)
+{
+	__asm("crc32q %1,%0" : "+r" (x) : "r" (y)); /* crc32q needs 64-bit x */
+	return (x);
+}
 
 /* CRC-32C (iSCSI) polynomial in reversed bit order. */
 #define POLY	0x82f63b78
@@ -47,12 +73,18 @@ __FBSDID("$FreeBSD$");
  * Block sizes for three-way parallel crc computation.  LONG and SHORT must
  * both be powers of two.
  */
-#define LONG	8192
-#define SHORT	256
+#define LONG	128
+#define SHORT	64
 
-/* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
+/*
+ * Tables for updating a crc for LONG, 2 * LONG, SHORT and 2 * SHORT bytes
+ * of value 0 later in the input stream, in the same way that the hardware
+ * would, but in software without calculating intermediate steps.
+ */
 static uint32_t crc32c_long[4][256];
+static uint32_t crc32c_2long[4][256];
 static uint32_t crc32c_short[4][256];
+static uint32_t crc32c_2short[4][256];
 
 /*
  * Multiply a matrix times a vector over the Galois field of two elements,
@@ -171,7 +203,9 @@ __attribute__((__constructor__))
 crc32c_init_hw(void)
 {
 	crc32c_zeros(crc32c_long, LONG);
+	crc32c_zeros(crc32c_2long, 2 * LONG);
 	crc32c_zeros(crc32c_short, SHORT);
+	crc32c_zeros(crc32c_2short, 2 * SHORT);
 }
 #ifdef _KERNEL
 SYSINIT(crc32c_sse42, SI_SUB_LOCK, SI_ORDER_ANY, crc32c_init_hw, NULL);
@@ -190,7 +224,11 @@ sse42_crc32c(uint32_t crc, const unsigned char *buf, unsigned len)
 	const size_t align = 4;
 #endif
 	const unsigned char *next, *end;
-	uint64_t crc0, crc1, crc2;      /* need to be 64 bits for crc32q */
+#ifdef __amd64__
+	uint64_t crc0, crc1, crc2;
+#else
+	uint32_t crc0, crc1, crc2;
+#endif
 
 	next = buf;
 	crc0 = crc;
@@ -202,6 +240,7 @@ sse42_crc32c(uint32_t crc, const unsigned char *buf, unsigned len)
 		len--;
 	}
 
+#if LONG > SHORT
 	/*
 	 * Compute the crc on sets of LONG*3 bytes, executing three independent
 	 * crc instructions, each on LONG bytes -- this is optimized for the
@@ -209,6 +248,7 @@ sse42_crc32c(uint32_t crc, const unsigned char *buf, unsigned len)
 	 * have a throughput of one crc per cycle, but a latency of three
 	 * cycles.
 	 */
+	crc = 0;
 	while (len >= LONG * 3) {
 		crc1 = 0;
 		crc2 = 0;
@@ -229,16 +269,64 @@ sse42_crc32c(uint32_t crc, const unsigned char *buf, unsigned len)
 #endif
 			next += align;
 		} while (next < end);
-		crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
-		crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
+		/*-
+		 * Update the crc.  Try to do it in parallel with the inner
+		 * loop.  'crc' is used to accumulate crc0 and crc1
+		 * produced by the inner loop so that the next iteration
+		 * of the loop doesn't depend on anything except crc2.
+		 *
+		 * The full expression for the update is:
+		 *     crc = S*S*S*crc + S*S*crc0 + S*crc1
+		 * where the terms are polynomials modulo the CRC polynomial.
+		 * We regroup this subtly as:
+		 *     crc = S*S * (S*crc + crc0) + S*crc1.
+		 * This has an extra dependency which reduces possible
+		 * parallelism for the expression, but it turns out to be
+		 * best to intentionally delay evaluation of this expression
+		 * so that it competes less with the inner loop.
+		 *
+		 * We also intentionally reduce parallelism by feeding back
+		 * crc2 to the inner loop as crc0 instead of accumulating
+		 * it in crc.  This synchronizes the loop with crc update.
+		 * CPU and/or compiler schedulers produced bad order without
+		 * this.
+		 *
+		 * Shifts take about 12 cycles each, so 3 here with 2
+		 * parallelizable take about 24 cycles and the crc update
+		 * takes slightly longer.  8 dependent crc32 instructions
+		 * can run in 24 cycles, so the 3-way blocking is worse
+		 * than useless for sizes less than 8 * <word size> = 64
+		 * on amd64.  In practice, SHORT = 32 confirms these
+		 * timing calculations by giving a small improvement
+		 * starting at size 96.  Then the inner loop takes about
+		 * 12 cycles and the crc update about 24, but these are
+		 * partly in parallel so the total time is less than the
+		 * 36 cycles that 12 dependent crc32 instructions would
+		 * take.
+		 *
+		 * To have a chance of completely hiding the overhead for
+		 * the crc update, the inner loop must take considerably
+		 * longer than 24 cycles.  LONG = 64 makes the inner loop
+		 * take about 24 cycles, so is not quite large enough.
+		 * LONG = 128 works OK.  Unhideable overheads are about
+		 * 12 cycles per inner loop.  All assuming timing like
+		 * Haswell.
+		 */
+		crc = crc32c_shift(crc32c_long, crc) ^ crc0;
+		crc1 = crc32c_shift(crc32c_long, crc1);
+		crc = crc32c_shift(crc32c_2long, crc) ^ crc1;
+		crc0 = crc2;
 		next += LONG * 2;
 		len -= LONG * 3;
 	}
+	crc0 ^= crc;
+#endif /* LONG > SHORT */
 
 	/*
 	 * Do the same thing, but now on SHORT*3 blocks for the remaining data
 	 * less than a LONG*3 block
 	 */
+	crc = 0;
 	while (len >= SHORT * 3) {
 		crc1 = 0;
 		crc2 = 0;
@@ -259,11 +347,14 @@ sse42_crc32c(uint32_t crc, const unsigned char *buf, unsigned len)
 #endif
 			next += align;
 		} while (next < end);
-		crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
-		crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
+		crc = crc32c_shift(crc32c_short, crc) ^ crc0;
+		crc1 = crc32c_shift(crc32c_short, crc1);
+		crc = crc32c_shift(crc32c_2short, crc) ^ crc1;
+		crc0 = crc2;
 		next += SHORT * 2;
 		len -= SHORT * 3;
 	}
+	crc0 ^= crc;
 
 	/* Compute the crc on the remaining bytes at native word size. */
 	end = next + (len - (len & (align - 1)));