aesni(4): Add support for x86 SHA intrinsics

Some x86-class CPUs have instructions that accelerate SHA1 and SHA256
computation.  Provide this functionality on CPUs that support it.

This implements CRYPTO_SHA1, CRYPTO_SHA1_HMAC, and CRYPTO_SHA2_256_HMAC.
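
For reference, the two HMAC session types compute the standard RFC 2104
two-pass construction, which is what hmac_internal() in the driver change
below implements on top of the intrinsic-backed update functions.  A minimal
userland sketch of the same construction (hypothetical helper name, assuming
FreeBSD's libmd <sha256.h> interface and a key of at most one 64-byte block;
link with -lmd):

/*
 * Sketch of HMAC-SHA256 (RFC 2104) over libmd; the key is assumed to fit
 * in one 64-byte block, matching the driver's fixed-size hmac_key field.
 */
#include <sys/types.h>
#include <sha256.h>
#include <stdint.h>
#include <string.h>

#define	HMAC_BLOCK_LEN	64	/* SHA-1/SHA-256 block size in bytes */

static void
hmac_sha256(const uint8_t *key, size_t keylen, const void *data,
    size_t datalen, uint8_t digest[SHA256_DIGEST_LENGTH])
{
	SHA256_CTX ctx;
	uint8_t pad[HMAC_BLOCK_LEN], ihash[SHA256_DIGEST_LENGTH];
	size_t i;

	/* Inner hash: H((K ^ ipad) || data) */
	memset(pad, 0, sizeof(pad));
	memcpy(pad, key, keylen);
	for (i = 0; i < sizeof(pad); i++)
		pad[i] ^= 0x36;
	SHA256_Init(&ctx);
	SHA256_Update(&ctx, pad, sizeof(pad));
	SHA256_Update(&ctx, data, datalen);
	SHA256_Final(ihash, &ctx);

	/* Outer hash: H((K ^ opad) || inner hash) */
	memset(pad, 0, sizeof(pad));
	memcpy(pad, key, keylen);
	for (i = 0; i < sizeof(pad); i++)
		pad[i] ^= 0x5c;
	SHA256_Init(&ctx);
	SHA256_Update(&ctx, pad, sizeof(pad));
	SHA256_Update(&ctx, ihash, sizeof(ihash));
	SHA256_Final(digest, &ctx);
}

The kernel path has the same shape, except the update step is backed by the
SHA intrinsics and the key is kept zero-padded in the session's 64-byte
hmac_key buffer.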

Correctness: The cryptotest.py suite in tests/sys/opencrypto has been
enhanced to verify SHA1 and SHA256 HMAC using standard NIST test vectors.
The test passes on this driver.  Additionally, jhb's cryptocheck tool has
been used to compare various random inputs against OpenSSL.  This test also
passes.
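
Those vectors are driven through the /dev/crypto ioctl interface.  A minimal
C sketch of that path, requesting a SHA-256 HMAC from whichever registered
driver the opencrypto framework picks (key and buffer contents are
placeholders; error handling omitted):

/*
 * Sketch: compute an HMAC-SHA256 over a buffer via /dev/crypto.
 */
#include <sys/ioctl.h>
#include <crypto/cryptodev.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct session_op sop;
	struct crypt_op cop;
	char key[32] = "placeholder hmac key";
	char buf[4096] = "placeholder payload";
	char mac[32];
	int cfd, fd;

	fd = open("/dev/crypto", O_RDWR);
	ioctl(fd, CRIOGET, &cfd);		/* private crypto fd */

	memset(&sop, 0, sizeof(sop));
	sop.mac = CRYPTO_SHA2_256_HMAC;
	sop.mackey = key;
	sop.mackeylen = sizeof(key);
	ioctl(cfd, CIOCGSESSION, &sop);		/* sop.ses gets the session id */

	memset(&cop, 0, sizeof(cop));
	cop.ses = sop.ses;
	cop.op = COP_ENCRYPT;
	cop.len = sizeof(buf);
	cop.src = buf;
	cop.mac = mac;				/* receives the 32-byte digest */
	ioctl(cfd, CIOCCRYPT, &cop);

	ioctl(cfd, CIOCFSESSION, &sop.ses);
	close(cfd);
	close(fd);
	return (0);
}

With aesni(4) attached and registered for CRYPTO_SHA2_256_HMAC, the framework
can dispatch such a session to it; otherwise cryptosoft handles it.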

Rough performance averages on AMD Ryzen 1950X (4kB buffer):
aesni:      SHA1: ~8300 Mb/s    SHA256: ~8000 Mb/s
cryptosoft: SHA1: ~1800 Mb/s    SHA256: ~1800 Mb/s

So ~4.4-4.6x speedup depending on algorithm choice.  This is consistent with
the results the Linux folks saw for 4kB buffers.

The driver borrows the SHA update code from the sys/crypto sha1 and sha256
implementations.  The intrinsic step functions come from Intel under a
3-clause BSDL.[0]  The intel_sha_extensions_sha<foo>_intrinsic.c files were
renamed and lightly modified (const added, a warning or two resolved, and
the sha_sse header included to declare the functions).

[0]: https://software.intel.com/en-us/articles/intel-sha-extensions-implementations

Reviewed by:	jhb
Sponsored by:	Dell EMC Isilon
Differential Revision:	https://reviews.freebsd.org/D12452
Conrad Meyer, 2017-09-26 23:12:32 +00:00
commit fe182ba1d0 (parent 119bdf3b3a)
10 changed files with 1028 additions and 114 deletions


@ -24,12 +24,12 @@
.\"
.\" $FreeBSD$
.\"
.Dd December 14, 2015
.Dd September 26, 2017
.Dt AESNI 4
.Os
.Sh NAME
.Nm aesni
.Nd "driver for the AES accelerator on Intel CPUs"
.Nd "driver for the AES and SHA accelerator on x86 CPUs"
.Sh SYNOPSIS
To compile this driver into the kernel,
place the following lines in your
@ -47,8 +47,8 @@ module at boot time, place the following line in
aesni_load="YES"
.Ed
.Sh DESCRIPTION
Starting with some models of Core i5/i7, Intel processors implement
a new set of instructions called AESNI.
Starting with Intel Westmere and AMD Bulldozer, some x86 processors implement a
new set of instructions called AESNI.
The set of six instructions accelerates the calculation of the key
schedule for key lengths of 128, 192, and 256 of the Advanced
Encryption Standard (AES) symmetric cipher, and provides a hardware
@ -56,13 +56,24 @@ implementation of the regular and the last encryption and decryption
rounds.
.Pp
The processor capability is reported as AESNI in the Features2 line at boot.
The
.Nm
driver does not attach on systems that lack the required CPU capability.
.Pp
Starting with the Intel Goldmont and AMD Ryzen microarchitectures, some x86
processors implement a new set of SHA instructions.
The set of seven instructions accelerates the calculation of SHA1 and SHA256
hashes.
.Pp
The processor capability is reported as SHA in the Structured Extended Features
line at boot.
.Pp
The
.Nm
driver registers itself to accelerate AES operations for
driver does not attach on systems that lack both CPU capabilities.
On systems that support only one of AESNI or SHA extensions, the driver will
attach and support that one function.
.Pp
The
.Nm
driver registers itself to accelerate AES and SHA operations for
.Xr crypto 4 .
Besides speed, the advantage of using the
.Nm
@ -83,13 +94,18 @@ The
.Nm
driver first appeared in
.Fx 9.0 .
SHA support was added in
.Fx 12.0 .
.Sh AUTHORS
.An -nosplit
The
.Nm
driver was written by
.An Konstantin Belousov Aq Mt kib@FreeBSD.org .
.An Konstantin Belousov Aq Mt kib@FreeBSD.org
and
.An Conrad Meyer Aq Mt cem@FreeBSD.org .
The key schedule calculation code was adopted from the sample provided
by Intel and used in the analogous
.Ox
driver.
The hash step intrinsics implementations were supplied by Intel.


@ -182,6 +182,16 @@ aesni_wrap.o optional aesni \
crypto/blowfish/bf_enc.c optional crypto | ipsec | ipsec_support
crypto/des/des_enc.c optional crypto | ipsec | \
ipsec_support | netsmb
intel_sha1.o optional aesni \
dependency "$S/crypto/aesni/intel_sha1.c" \
compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
no-implicit-rule \
clean "intel_sha1.o"
intel_sha256.o optional aesni \
dependency "$S/crypto/aesni/intel_sha256.c" \
compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
no-implicit-rule \
clean "intel_sha256.o"
crypto/via/padlock.c optional padlock
crypto/via/padlock_cipher.c optional padlock
crypto/via/padlock_hash.c optional padlock


@ -132,6 +132,16 @@ aesni_wrap.o optional aesni \
no-implicit-rule \
clean "aesni_wrap.o"
crypto/des/arch/i386/des_enc.S optional crypto | ipsec | ipsec_support | netsmb
intel_sha1.o optional aesni \
dependency "$S/crypto/aesni/intel_sha1.c" \
compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
no-implicit-rule \
clean "intel_sha1.o"
intel_sha256.o optional aesni \
dependency "$S/crypto/aesni/intel_sha256.c" \
compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
no-implicit-rule \
clean "intel_sha256.o"
crypto/via/padlock.c optional padlock
crypto/via/padlock_cipher.c optional padlock
crypto/via/padlock_hash.c optional padlock


@ -2,6 +2,7 @@
* Copyright (c) 2005-2008 Pawel Jakub Dawidek <pjd@FreeBSD.org>
* Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
* Copyright (c) 2014 The FreeBSD Foundation
* Copyright (c) 2017 Conrad Meyer <cem@FreeBSD.org>
* All rights reserved.
*
* Portions of this software were developed by John-Mark Gurney
@ -46,9 +47,23 @@ __FBSDID("$FreeBSD$");
#include <sys/uio.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <crypto/aesni/aesni.h>
#include <cryptodev_if.h>
#include <crypto/aesni/sha_sse.h>
#include <crypto/sha1.h>
#include <crypto/sha2/sha256.h>
#include <opencrypto/cryptodev.h>
#include <opencrypto/gmac.h>
#include <cryptodev_if.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#if defined(__i386__)
#include <machine/npx.h>
#elif defined(__amd64__)
#include <machine/fpu.h>
#endif
static struct mtx_padalign *ctx_mtx;
static struct fpu_kern_ctx **ctx_fpu;
@ -57,6 +72,8 @@ struct aesni_softc {
int dieing;
int32_t cid;
uint32_t sid;
bool has_aes;
bool has_sha;
TAILQ_HEAD(aesni_sessions_head, aesni_session) sessions;
struct rwlock lock;
};
@ -79,9 +96,13 @@ static int aesni_freesession(device_t, uint64_t tid);
static void aesni_freesession_locked(struct aesni_softc *sc,
struct aesni_session *ses);
static int aesni_cipher_setup(struct aesni_session *ses,
struct cryptoini *encini);
struct cryptoini *encini, struct cryptoini *authini);
static int aesni_cipher_process(struct aesni_session *ses,
struct cryptodesc *enccrd, struct cryptodesc *authcrd, struct cryptop *crp);
static int aesni_cipher_crypt(struct aesni_session *ses,
struct cryptodesc *enccrd, struct cryptodesc *authcrd, struct cryptop *crp);
static int aesni_cipher_mac(struct aesni_session *ses, struct cryptodesc *crd,
struct cryptop *crp);
MALLOC_DEFINE(M_AESNI, "aesni_data", "AESNI Data");
@ -95,21 +116,33 @@ aesni_identify(driver_t *drv, device_t parent)
panic("aesni: could not attach");
}
static void
detect_cpu_features(bool *has_aes, bool *has_sha)
{
*has_aes = ((cpu_feature2 & CPUID2_AESNI) != 0 &&
(cpu_feature2 & CPUID2_SSE41) != 0);
*has_sha = ((cpu_stdext_feature & CPUID_STDEXT_SHA) != 0 &&
(cpu_feature2 & CPUID2_SSSE3) != 0);
}
static int
aesni_probe(device_t dev)
{
bool has_aes, has_sha;
if ((cpu_feature2 & CPUID2_AESNI) == 0) {
device_printf(dev, "No AESNI support.\n");
detect_cpu_features(&has_aes, &has_sha);
if (!has_aes && !has_sha) {
device_printf(dev, "No AES or SHA support.\n");
return (EINVAL);
}
} else if (has_aes && has_sha)
device_set_desc(dev,
"AES-CBC,AES-XTS,AES-GCM,AES-ICM,SHA1,SHA256");
else if (has_aes)
device_set_desc(dev, "AES-CBC,AES-XTS,AES-GCM,AES-ICM");
else
device_set_desc(dev, "SHA1,SHA256");
if ((cpu_feature2 & CPUID2_SSE41) == 0) {
device_printf(dev, "No SSE4.1 support.\n");
return (EINVAL);
}
device_set_desc_copy(dev, "AES-CBC,AES-XTS,AES-GCM,AES-ICM");
return (0);
}
@ -161,13 +194,22 @@ aesni_attach(device_t dev)
}
rw_init(&sc->lock, "aesni_lock");
crypto_register(sc->cid, CRYPTO_AES_CBC, 0, 0);
crypto_register(sc->cid, CRYPTO_AES_ICM, 0, 0);
crypto_register(sc->cid, CRYPTO_AES_NIST_GCM_16, 0, 0);
crypto_register(sc->cid, CRYPTO_AES_128_NIST_GMAC, 0, 0);
crypto_register(sc->cid, CRYPTO_AES_192_NIST_GMAC, 0, 0);
crypto_register(sc->cid, CRYPTO_AES_256_NIST_GMAC, 0, 0);
crypto_register(sc->cid, CRYPTO_AES_XTS, 0, 0);
detect_cpu_features(&sc->has_aes, &sc->has_sha);
if (sc->has_aes) {
crypto_register(sc->cid, CRYPTO_AES_CBC, 0, 0);
crypto_register(sc->cid, CRYPTO_AES_ICM, 0, 0);
crypto_register(sc->cid, CRYPTO_AES_NIST_GCM_16, 0, 0);
crypto_register(sc->cid, CRYPTO_AES_128_NIST_GMAC, 0, 0);
crypto_register(sc->cid, CRYPTO_AES_192_NIST_GMAC, 0, 0);
crypto_register(sc->cid, CRYPTO_AES_256_NIST_GMAC, 0, 0);
crypto_register(sc->cid, CRYPTO_AES_XTS, 0, 0);
}
if (sc->has_sha) {
crypto_register(sc->cid, CRYPTO_SHA1, 0, 0);
crypto_register(sc->cid, CRYPTO_SHA1_HMAC, 0, 0);
crypto_register(sc->cid, CRYPTO_SHA2_256_HMAC, 0, 0);
}
return (0);
}
@ -208,7 +250,8 @@ aesni_newsession(device_t dev, uint32_t *sidp, struct cryptoini *cri)
{
struct aesni_softc *sc;
struct aesni_session *ses;
struct cryptoini *encini;
struct cryptoini *encini, *authini;
bool gcm_hash, gcm;
int error;
if (sidp == NULL || cri == NULL) {
@ -221,13 +264,20 @@ aesni_newsession(device_t dev, uint32_t *sidp, struct cryptoini *cri)
return (EINVAL);
ses = NULL;
authini = NULL;
encini = NULL;
gcm = false;
gcm_hash = false;
for (; cri != NULL; cri = cri->cri_next) {
switch (cri->cri_alg) {
case CRYPTO_AES_NIST_GCM_16:
gcm = true;
/* FALLTHROUGH */
case CRYPTO_AES_CBC:
case CRYPTO_AES_ICM:
case CRYPTO_AES_XTS:
case CRYPTO_AES_NIST_GCM_16:
if (!sc->has_aes)
goto unhandled;
if (encini != NULL) {
CRYPTDEB("encini already set");
return (EINVAL);
@ -241,16 +291,35 @@ aesni_newsession(device_t dev, uint32_t *sidp, struct cryptoini *cri)
* nothing to do here, maybe in the future cache some
* values for GHASH
*/
gcm_hash = true;
break;
case CRYPTO_SHA1:
case CRYPTO_SHA1_HMAC:
case CRYPTO_SHA2_256_HMAC:
if (!sc->has_sha)
goto unhandled;
if (authini != NULL) {
CRYPTDEB("authini already set");
return (EINVAL);
}
authini = cri;
break;
default:
unhandled:
CRYPTDEB("unhandled algorithm");
return (EINVAL);
}
}
if (encini == NULL) {
if (encini == NULL && authini == NULL) {
CRYPTDEB("no cipher");
return (EINVAL);
}
/*
* GMAC algorithms are only supported with simultaneous GCM. Likewise
* GCM is not supported without GMAC.
*/
if (gcm_hash != gcm)
return (EINVAL);
rw_wlock(&sc->lock);
if (sc->dieing) {
@ -275,9 +344,13 @@ aesni_newsession(device_t dev, uint32_t *sidp, struct cryptoini *cri)
ses->used = 1;
TAILQ_INSERT_TAIL(&sc->sessions, ses, next);
rw_wunlock(&sc->lock);
ses->algo = encini->cri_alg;
error = aesni_cipher_setup(ses, encini);
if (encini != NULL)
ses->algo = encini->cri_alg;
if (authini != NULL)
ses->auth_algo = authini->cri_alg;
error = aesni_cipher_setup(ses, encini, authini);
if (error != 0) {
CRYPTDEB("setup failed");
rw_wlock(&sc->lock);
@ -299,7 +372,7 @@ aesni_freesession_locked(struct aesni_softc *sc, struct aesni_session *ses)
sid = ses->id;
TAILQ_REMOVE(&sc->sessions, ses, next);
*ses = (struct aesni_session){};
explicit_bzero(ses, sizeof(*ses));
ses->id = sid;
TAILQ_INSERT_HEAD(&sc->sessions, ses, next);
}
@ -351,6 +424,9 @@ aesni_process(device_t dev, struct cryptop *crp, int hint __unused)
for (crd = crp->crp_desc; crd != NULL; crd = crd->crd_next) {
switch (crd->crd_alg) {
case CRYPTO_AES_NIST_GCM_16:
needauth = 1;
/* FALLTHROUGH */
case CRYPTO_AES_CBC:
case CRYPTO_AES_ICM:
case CRYPTO_AES_XTS:
@ -361,24 +437,17 @@ aesni_process(device_t dev, struct cryptop *crp, int hint __unused)
enccrd = crd;
break;
case CRYPTO_AES_NIST_GCM_16:
if (enccrd != NULL) {
error = EINVAL;
goto out;
}
enccrd = crd;
needauth = 1;
break;
case CRYPTO_AES_128_NIST_GMAC:
case CRYPTO_AES_192_NIST_GMAC:
case CRYPTO_AES_256_NIST_GMAC:
case CRYPTO_SHA1:
case CRYPTO_SHA1_HMAC:
case CRYPTO_SHA2_256_HMAC:
if (authcrd != NULL) {
error = EINVAL;
goto out;
}
authcrd = crd;
needauth = 1;
break;
default:
@ -387,14 +456,16 @@ aesni_process(device_t dev, struct cryptop *crp, int hint __unused)
}
}
if (enccrd == NULL || (needauth && authcrd == NULL)) {
if ((enccrd == NULL && authcrd == NULL) ||
(needauth && authcrd == NULL)) {
error = EINVAL;
goto out;
}
/* CBC & XTS can only handle full blocks for now */
if ((enccrd->crd_alg == CRYPTO_AES_CBC || enccrd->crd_alg ==
CRYPTO_AES_XTS) && (enccrd->crd_len % AES_BLOCK_LEN) != 0) {
if (enccrd != NULL && (enccrd->crd_alg == CRYPTO_AES_CBC ||
enccrd->crd_alg == CRYPTO_AES_XTS) &&
(enccrd->crd_len % AES_BLOCK_LEN) != 0) {
error = EINVAL;
goto out;
}
@ -420,9 +491,9 @@ out:
return (error);
}
uint8_t *
static uint8_t *
aesni_cipher_alloc(struct cryptodesc *enccrd, struct cryptop *crp,
int *allocated)
bool *allocated)
{
struct mbuf *m;
struct uio *uio;
@ -442,18 +513,18 @@ aesni_cipher_alloc(struct cryptodesc *enccrd, struct cryptop *crp,
addr = (uint8_t *)iov->iov_base;
} else
addr = (uint8_t *)crp->crp_buf;
*allocated = 0;
*allocated = false;
addr += enccrd->crd_skip;
return (addr);
alloc:
addr = malloc(enccrd->crd_len, M_AESNI, M_NOWAIT);
if (addr != NULL) {
*allocated = 1;
*allocated = true;
crypto_copydata(crp->crp_flags, crp->crp_buf, enccrd->crd_skip,
enccrd->crd_len, addr);
} else
*allocated = 0;
*allocated = false;
return (addr);
}
@ -482,13 +553,28 @@ MODULE_VERSION(aesni, 1);
MODULE_DEPEND(aesni, crypto, 1, 1, 1);
static int
aesni_cipher_setup(struct aesni_session *ses, struct cryptoini *encini)
aesni_cipher_setup(struct aesni_session *ses, struct cryptoini *encini,
struct cryptoini *authini)
{
struct fpu_kern_ctx *ctx;
int error;
int kt, ctxidx;
int kt, ctxidx, keylen, error;
kt = is_fpu_kern_thread(0);
switch (ses->auth_algo) {
case CRYPTO_SHA1:
case CRYPTO_SHA1_HMAC:
case CRYPTO_SHA2_256_HMAC:
if (authini->cri_klen % 8 != 0)
return (EINVAL);
keylen = authini->cri_klen / 8;
if (keylen > sizeof(ses->hmac_key))
return (EINVAL);
if (ses->auth_algo == CRYPTO_SHA1 && keylen > 0)
return (EINVAL);
memcpy(ses->hmac_key, authini->cri_key, keylen);
ses->mlen = authini->cri_mlen;
}
kt = is_fpu_kern_thread(0) || (encini == NULL);
if (!kt) {
ACQUIRE_CTX(ctxidx, ctx);
error = fpu_kern_enter(curthread, ctx,
@ -497,8 +583,10 @@ aesni_cipher_setup(struct aesni_session *ses, struct cryptoini *encini)
goto out;
}
error = aesni_cipher_setup_common(ses, encini->cri_key,
encini->cri_klen);
error = 0;
if (encini != NULL)
error = aesni_cipher_setup_common(ses, encini->cri_key,
encini->cri_klen);
if (!kt) {
fpu_kern_leave(curthread, ctx);
@ -508,52 +596,198 @@ out:
return (error);
}
static int
intel_sha1_update(void *vctx, const void *vdata, u_int datalen)
{
struct sha1_ctxt *ctx = vctx;
const char *data = vdata;
size_t gaplen;
size_t gapstart;
size_t off;
size_t copysiz;
u_int blocks;
off = 0;
/* Do any aligned blocks without redundant copying. */
if (datalen >= 64 && ctx->count % 64 == 0) {
blocks = datalen / 64;
ctx->c.b64[0] += blocks * 64 * 8;
intel_sha1_step(ctx->h.b32, data + off, blocks);
off += blocks * 64;
}
while (off < datalen) {
gapstart = ctx->count % 64;
gaplen = 64 - gapstart;
copysiz = (gaplen < datalen - off) ? gaplen : datalen - off;
bcopy(&data[off], &ctx->m.b8[gapstart], copysiz);
ctx->count += copysiz;
ctx->count %= 64;
ctx->c.b64[0] += copysiz * 8;
if (ctx->count % 64 == 0)
intel_sha1_step(ctx->h.b32, (void *)ctx->m.b8, 1);
off += copysiz;
}
return (0);
}
static void
SHA1_Finalize_fn(void *digest, void *ctx)
{
sha1_result(ctx, digest);
}
static int
intel_sha256_update(void *vctx, const void *vdata, u_int len)
{
SHA256_CTX *ctx = vctx;
uint64_t bitlen;
uint32_t r;
u_int blocks;
const unsigned char *src = vdata;
/* Number of bytes left in the buffer from previous updates */
r = (ctx->count >> 3) & 0x3f;
/* Convert the length into a number of bits */
bitlen = len << 3;
/* Update number of bits */
ctx->count += bitlen;
/* Handle the case where we don't need to perform any transforms */
if (len < 64 - r) {
memcpy(&ctx->buf[r], src, len);
return (0);
}
/* Finish the current block */
memcpy(&ctx->buf[r], src, 64 - r);
intel_sha256_step(ctx->state, ctx->buf, 1);
src += 64 - r;
len -= 64 - r;
/* Perform complete blocks */
if (len >= 64) {
blocks = len / 64;
intel_sha256_step(ctx->state, src, blocks);
src += blocks * 64;
len -= blocks * 64;
}
/* Copy left over data into buffer */
memcpy(ctx->buf, src, len);
return (0);
}
static void
SHA256_Finalize_fn(void *digest, void *ctx)
{
SHA256_Final(digest, ctx);
}
/*
* authcrd contains the associated data.
* Compute the HASH( (key ^ xorbyte) || buf )
*/
static void
hmac_internal(void *ctx, uint32_t *res,
int (*update)(void *, const void *, u_int),
void (*finalize)(void *, void *), uint8_t *key, uint8_t xorbyte,
const void *buf, size_t off, size_t buflen, int crpflags)
{
size_t i;
for (i = 0; i < 64; i++)
key[i] ^= xorbyte;
update(ctx, key, 64);
for (i = 0; i < 64; i++)
key[i] ^= xorbyte;
crypto_apply(crpflags, __DECONST(void *, buf), off, buflen,
__DECONST(int (*)(void *, void *, u_int), update), ctx);
finalize(res, ctx);
}
static int
aesni_cipher_process(struct aesni_session *ses, struct cryptodesc *enccrd,
struct cryptodesc *authcrd, struct cryptop *crp)
{
struct fpu_kern_ctx *ctx;
uint8_t iv[AES_BLOCK_LEN];
uint8_t tag[GMAC_DIGEST_LEN];
uint8_t *buf, *authbuf;
int error, allocated, authallocated;
int ivlen, encflag;
int kt, ctxidx;
int error, ctxidx;
bool kt;
encflag = (enccrd->crd_flags & CRD_F_ENCRYPT) == CRD_F_ENCRYPT;
if (enccrd != NULL) {
if ((enccrd->crd_alg == CRYPTO_AES_ICM ||
enccrd->crd_alg == CRYPTO_AES_NIST_GCM_16) &&
(enccrd->crd_flags & CRD_F_IV_EXPLICIT) == 0)
return (EINVAL);
}
if ((enccrd->crd_alg == CRYPTO_AES_ICM ||
enccrd->crd_alg == CRYPTO_AES_NIST_GCM_16) &&
(enccrd->crd_flags & CRD_F_IV_EXPLICIT) == 0)
return (EINVAL);
error = 0;
kt = is_fpu_kern_thread(0);
if (!kt) {
ACQUIRE_CTX(ctxidx, ctx);
error = fpu_kern_enter(curthread, ctx,
FPU_KERN_NORMAL | FPU_KERN_KTHR);
if (error != 0)
goto out2;
}
/* Do work */
if (enccrd != NULL && authcrd != NULL) {
/* Perform the first operation */
if (crp->crp_desc == enccrd)
error = aesni_cipher_crypt(ses, enccrd, authcrd, crp);
else
error = aesni_cipher_mac(ses, authcrd, crp);
if (error != 0)
goto out;
/* Perform the second operation */
if (crp->crp_desc == enccrd)
error = aesni_cipher_mac(ses, authcrd, crp);
else
error = aesni_cipher_crypt(ses, enccrd, authcrd, crp);
} else if (enccrd != NULL)
error = aesni_cipher_crypt(ses, enccrd, authcrd, crp);
else
error = aesni_cipher_mac(ses, authcrd, crp);
if (error != 0)
goto out;
out:
if (!kt) {
fpu_kern_leave(curthread, ctx);
out2:
RELEASE_CTX(ctxidx, ctx);
}
return (error);
}
static int
aesni_cipher_crypt(struct aesni_session *ses, struct cryptodesc *enccrd,
struct cryptodesc *authcrd, struct cryptop *crp)
{
uint8_t iv[AES_BLOCK_LEN], tag[GMAC_DIGEST_LEN], *buf, *authbuf;
int error, ivlen;
bool encflag, allocated, authallocated;
buf = aesni_cipher_alloc(enccrd, crp, &allocated);
if (buf == NULL)
return (ENOMEM);
error = 0;
authbuf = NULL;
authallocated = 0;
if (authcrd != NULL) {
authallocated = false;
if (ses->algo == CRYPTO_AES_NIST_GCM_16 && authcrd != NULL) {
authbuf = aesni_cipher_alloc(authcrd, crp, &authallocated);
if (authbuf == NULL) {
error = ENOMEM;
goto out1;
goto out;
}
}
kt = is_fpu_kern_thread(0);
if (!kt) {
ACQUIRE_CTX(ctxidx, ctx);
error = fpu_kern_enter(curthread, ctx,
FPU_KERN_NORMAL|FPU_KERN_KTHR);
if (error != 0)
goto out2;
}
error = 0;
encflag = (enccrd->crd_flags & CRD_F_ENCRYPT) == CRD_F_ENCRYPT;
if ((enccrd->crd_flags & CRD_F_KEY_EXPLICIT) != 0) {
error = aesni_cipher_setup_common(ses, enccrd->crd_key,
enccrd->crd_klen);
@ -561,7 +795,6 @@ aesni_cipher_process(struct aesni_session *ses, struct cryptodesc *enccrd,
goto out;
}
/* XXX - validate that enccrd and authcrd have/use same key? */
switch (enccrd->crd_alg) {
case CRYPTO_AES_CBC:
case CRYPTO_AES_ICM:
@ -593,13 +826,6 @@ aesni_cipher_process(struct aesni_session *ses, struct cryptodesc *enccrd,
enccrd->crd_inject, ivlen, iv);
}
if (authcrd != NULL && !encflag)
crypto_copydata(crp->crp_flags, crp->crp_buf,
authcrd->crd_inject, GMAC_DIGEST_LEN, tag);
else
bzero(tag, sizeof tag);
/* Do work */
switch (ses->algo) {
case CRYPTO_AES_CBC:
if (encflag)
@ -625,11 +851,21 @@ aesni_cipher_process(struct aesni_session *ses, struct cryptodesc *enccrd,
iv);
break;
case CRYPTO_AES_NIST_GCM_16:
if (encflag)
if (authcrd != NULL && !encflag)
crypto_copydata(crp->crp_flags, crp->crp_buf,
authcrd->crd_inject, GMAC_DIGEST_LEN, tag);
else
bzero(tag, sizeof tag);
if (encflag) {
AES_GCM_encrypt(buf, buf, authbuf, iv, tag,
enccrd->crd_len, authcrd->crd_len, ivlen,
ses->enc_schedule, ses->rounds);
else {
if (authcrd != NULL)
crypto_copyback(crp->crp_flags, crp->crp_buf,
authcrd->crd_inject, GMAC_DIGEST_LEN, tag);
} else {
if (!AES_GCM_decrypt(buf, buf, authbuf, iv, tag,
enccrd->crd_len, authcrd->crd_len, ivlen,
ses->enc_schedule, ses->rounds))
@ -638,28 +874,78 @@ aesni_cipher_process(struct aesni_session *ses, struct cryptodesc *enccrd,
break;
}
if (allocated)
crypto_copyback(crp->crp_flags, crp->crp_buf, enccrd->crd_skip,
enccrd->crd_len, buf);
if (!error && authcrd != NULL) {
crypto_copyback(crp->crp_flags, crp->crp_buf,
authcrd->crd_inject, GMAC_DIGEST_LEN, tag);
}
out:
if (!kt) {
fpu_kern_leave(curthread, ctx);
out2:
RELEASE_CTX(ctxidx, ctx);
}
out1:
if (allocated) {
bzero(buf, enccrd->crd_len);
explicit_bzero(buf, enccrd->crd_len);
free(buf, M_AESNI);
}
if (authallocated)
if (authallocated) {
explicit_bzero(authbuf, authcrd->crd_len);
free(authbuf, M_AESNI);
}
return (error);
}
static int
aesni_cipher_mac(struct aesni_session *ses, struct cryptodesc *crd,
struct cryptop *crp)
{
union {
struct SHA256Context sha2 __aligned(16);
struct sha1_ctxt sha1 __aligned(16);
} sctx;
uint32_t res[SHA2_256_HASH_LEN / sizeof(uint32_t)];
int hashlen;
if (crd->crd_flags != 0)
return (EINVAL);
switch (ses->auth_algo) {
case CRYPTO_SHA1_HMAC:
hashlen = SHA1_HASH_LEN;
/* Inner hash: (K ^ IPAD) || data */
sha1_init(&sctx.sha1);
hmac_internal(&sctx.sha1, res, intel_sha1_update,
SHA1_Finalize_fn, ses->hmac_key, 0x36, crp->crp_buf,
crd->crd_skip, crd->crd_len, crp->crp_flags);
/* Outer hash: (K ^ OPAD) || inner hash */
sha1_init(&sctx.sha1);
hmac_internal(&sctx.sha1, res, intel_sha1_update,
SHA1_Finalize_fn, ses->hmac_key, 0x5C, res, 0, hashlen, 0);
break;
case CRYPTO_SHA1:
hashlen = SHA1_HASH_LEN;
sha1_init(&sctx.sha1);
crypto_apply(crp->crp_flags, crp->crp_buf, crd->crd_skip,
crd->crd_len, __DECONST(int (*)(void *, void *, u_int),
intel_sha1_update), &sctx.sha1);
sha1_result(&sctx.sha1, (void *)res);
break;
case CRYPTO_SHA2_256_HMAC:
hashlen = SHA2_256_HASH_LEN;
/* Inner hash: (K ^ IPAD) || data */
SHA256_Init(&sctx.sha2);
hmac_internal(&sctx.sha2, res, intel_sha256_update,
SHA256_Finalize_fn, ses->hmac_key, 0x36, crp->crp_buf,
crd->crd_skip, crd->crd_len, crp->crp_flags);
/* Outer hash: (K ^ OPAD) || inner hash */
SHA256_Init(&sctx.sha2);
hmac_internal(&sctx.sha2, res, intel_sha256_update,
SHA256_Finalize_fn, ses->hmac_key, 0x5C, res, 0, hashlen,
0);
break;
default:
/*
* AES-GMAC authentication is verified while processing the
* enccrd
*/
return (0);
}
if (ses->mlen != 0 && ses->mlen < hashlen)
hashlen = ses->mlen;
crypto_copyback(crp->crp_flags, crp->crp_buf, crd->crd_inject, hashlen,
(void *)res);
return (0);
}


@ -56,12 +56,16 @@ struct aesni_session {
uint8_t enc_schedule[AES_SCHED_LEN] __aligned(16);
uint8_t dec_schedule[AES_SCHED_LEN] __aligned(16);
uint8_t xts_schedule[AES_SCHED_LEN] __aligned(16);
/* Same as the SHA256 Blocksize. */
uint8_t hmac_key[SHA1_HMAC_BLOCK_LEN] __aligned(16);
int algo;
int rounds;
/* uint8_t *ses_ictx; */
/* uint8_t *ses_octx; */
/* int ses_mlen; */
int used;
int auth_algo;
int mlen;
uint32_t id;
TAILQ_ENTRY(aesni_session) next;
};
@ -111,7 +115,5 @@ int AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
int aesni_cipher_setup_common(struct aesni_session *ses, const uint8_t *key,
int keylen);
uint8_t *aesni_cipher_alloc(struct cryptodesc *enccrd, struct cryptop *crp,
int *allocated);
#endif /* _AESNI_H_ */


@ -0,0 +1,261 @@
/*******************************************************************************
* Copyright (c) 2013, Intel Corporation
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the
* distribution.
*
* * Neither the name of the Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
*
* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
*
* Intel SHA Extensions optimized implementation of a SHA-1 update function
*
* The function takes a pointer to the current hash values, a pointer to the
* input data, and a number of 64 byte blocks to process. Once all blocks have
* been processed, the digest pointer is updated with the resulting hash value.
* The function only processes complete blocks, there is no functionality to
* store partial blocks. All message padding and hash value initialization must
* be done outside the update function.
*
* The indented lines in the loop are instructions related to rounds processing.
* The non-indented lines are instructions related to the message schedule.
*
* Author: Sean Gulley <sean.m.gulley@intel.com>
* Date: July 2013
*
********************************************************************************
*
* Example compiler command line:
* icc intel_sha_extensions_sha1_intrinsic.c
* gcc -msha -msse4 intel_sha_extensions_sha1_intrinsic.c
*
*******************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <immintrin.h>
#include <crypto/aesni/sha_sse.h>
void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks) {
__m128i abcd, e0, e1;
__m128i abcd_save, e_save;
__m128i msg0, msg1, msg2, msg3;
__m128i shuf_mask, e_mask;
#if 0
e_mask = _mm_set_epi64x(0xFFFFFFFF00000000ull, 0x0000000000000000ull);
#else
(void)e_mask;
e0 = _mm_set_epi64x(0, 0);
#endif
shuf_mask = _mm_set_epi64x(0x0001020304050607ull, 0x08090a0b0c0d0e0full);
// Load initial hash values
abcd = _mm_loadu_si128((__m128i*) digest);
e0 = _mm_insert_epi32(e0, *(digest+4), 3);
abcd = _mm_shuffle_epi32(abcd, 0x1B);
#if 0
e0 = _mm_and_si128(e0, e_mask);
#endif
while (num_blks > 0) {
// Save hash values for addition after rounds
abcd_save = abcd;
e_save = e0;
// Rounds 0-3
msg0 = _mm_loadu_si128((const __m128i*) data);
msg0 = _mm_shuffle_epi8(msg0, shuf_mask);
e0 = _mm_add_epi32(e0, msg0);
e1 = abcd;
abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
// Rounds 4-7
msg1 = _mm_loadu_si128((const __m128i*) (data+16));
msg1 = _mm_shuffle_epi8(msg1, shuf_mask);
e1 = _mm_sha1nexte_epu32(e1, msg1);
e0 = abcd;
abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
msg0 = _mm_sha1msg1_epu32(msg0, msg1);
// Rounds 8-11
msg2 = _mm_loadu_si128((const __m128i*) (data+32));
msg2 = _mm_shuffle_epi8(msg2, shuf_mask);
e0 = _mm_sha1nexte_epu32(e0, msg2);
e1 = abcd;
abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
msg1 = _mm_sha1msg1_epu32(msg1, msg2);
msg0 = _mm_xor_si128(msg0, msg2);
// Rounds 12-15
msg3 = _mm_loadu_si128((const __m128i*) (data+48));
msg3 = _mm_shuffle_epi8(msg3, shuf_mask);
e1 = _mm_sha1nexte_epu32(e1, msg3);
e0 = abcd;
msg0 = _mm_sha1msg2_epu32(msg0, msg3);
abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
msg2 = _mm_sha1msg1_epu32(msg2, msg3);
msg1 = _mm_xor_si128(msg1, msg3);
// Rounds 16-19
e0 = _mm_sha1nexte_epu32(e0, msg0);
e1 = abcd;
msg1 = _mm_sha1msg2_epu32(msg1, msg0);
abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
msg3 = _mm_sha1msg1_epu32(msg3, msg0);
msg2 = _mm_xor_si128(msg2, msg0);
// Rounds 20-23
e1 = _mm_sha1nexte_epu32(e1, msg1);
e0 = abcd;
msg2 = _mm_sha1msg2_epu32(msg2, msg1);
abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
msg0 = _mm_sha1msg1_epu32(msg0, msg1);
msg3 = _mm_xor_si128(msg3, msg1);
// Rounds 24-27
e0 = _mm_sha1nexte_epu32(e0, msg2);
e1 = abcd;
msg3 = _mm_sha1msg2_epu32(msg3, msg2);
abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
msg1 = _mm_sha1msg1_epu32(msg1, msg2);
msg0 = _mm_xor_si128(msg0, msg2);
// Rounds 28-31
e1 = _mm_sha1nexte_epu32(e1, msg3);
e0 = abcd;
msg0 = _mm_sha1msg2_epu32(msg0, msg3);
abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
msg2 = _mm_sha1msg1_epu32(msg2, msg3);
msg1 = _mm_xor_si128(msg1, msg3);
// Rounds 32-35
e0 = _mm_sha1nexte_epu32(e0, msg0);
e1 = abcd;
msg1 = _mm_sha1msg2_epu32(msg1, msg0);
abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
msg3 = _mm_sha1msg1_epu32(msg3, msg0);
msg2 = _mm_xor_si128(msg2, msg0);
// Rounds 36-39
e1 = _mm_sha1nexte_epu32(e1, msg1);
e0 = abcd;
msg2 = _mm_sha1msg2_epu32(msg2, msg1);
abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
msg0 = _mm_sha1msg1_epu32(msg0, msg1);
msg3 = _mm_xor_si128(msg3, msg1);
// Rounds 40-43
e0 = _mm_sha1nexte_epu32(e0, msg2);
e1 = abcd;
msg3 = _mm_sha1msg2_epu32(msg3, msg2);
abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
msg1 = _mm_sha1msg1_epu32(msg1, msg2);
msg0 = _mm_xor_si128(msg0, msg2);
// Rounds 44-47
e1 = _mm_sha1nexte_epu32(e1, msg3);
e0 = abcd;
msg0 = _mm_sha1msg2_epu32(msg0, msg3);
abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
msg2 = _mm_sha1msg1_epu32(msg2, msg3);
msg1 = _mm_xor_si128(msg1, msg3);
// Rounds 48-51
e0 = _mm_sha1nexte_epu32(e0, msg0);
e1 = abcd;
msg1 = _mm_sha1msg2_epu32(msg1, msg0);
abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
msg3 = _mm_sha1msg1_epu32(msg3, msg0);
msg2 = _mm_xor_si128(msg2, msg0);
// Rounds 52-55
e1 = _mm_sha1nexte_epu32(e1, msg1);
e0 = abcd;
msg2 = _mm_sha1msg2_epu32(msg2, msg1);
abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
msg0 = _mm_sha1msg1_epu32(msg0, msg1);
msg3 = _mm_xor_si128(msg3, msg1);
// Rounds 56-59
e0 = _mm_sha1nexte_epu32(e0, msg2);
e1 = abcd;
msg3 = _mm_sha1msg2_epu32(msg3, msg2);
abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
msg1 = _mm_sha1msg1_epu32(msg1, msg2);
msg0 = _mm_xor_si128(msg0, msg2);
// Rounds 60-63
e1 = _mm_sha1nexte_epu32(e1, msg3);
e0 = abcd;
msg0 = _mm_sha1msg2_epu32(msg0, msg3);
abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
msg2 = _mm_sha1msg1_epu32(msg2, msg3);
msg1 = _mm_xor_si128(msg1, msg3);
// Rounds 64-67
e0 = _mm_sha1nexte_epu32(e0, msg0);
e1 = abcd;
msg1 = _mm_sha1msg2_epu32(msg1, msg0);
abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);
msg3 = _mm_sha1msg1_epu32(msg3, msg0);
msg2 = _mm_xor_si128(msg2, msg0);
// Rounds 68-71
e1 = _mm_sha1nexte_epu32(e1, msg1);
e0 = abcd;
msg2 = _mm_sha1msg2_epu32(msg2, msg1);
abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
msg3 = _mm_xor_si128(msg3, msg1);
// Rounds 72-75
e0 = _mm_sha1nexte_epu32(e0, msg2);
e1 = abcd;
msg3 = _mm_sha1msg2_epu32(msg3, msg2);
abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);
// Rounds 76-79
e1 = _mm_sha1nexte_epu32(e1, msg3);
e0 = abcd;
abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
// Add current hash values with previously saved
e0 = _mm_sha1nexte_epu32(e0, e_save);
abcd = _mm_add_epi32(abcd, abcd_save);
data += 64;
num_blks--;
}
abcd = _mm_shuffle_epi32(abcd, 0x1B);
_mm_store_si128((__m128i*) digest, abcd);
*(digest+4) = _mm_extract_epi32(e0, 3);
}


@ -0,0 +1,277 @@
/*******************************************************************************
* Copyright (c) 2013, Intel Corporation
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the
* distribution.
*
* * Neither the name of the Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
*
* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
*
* Intel SHA Extensions optimized implementation of a SHA-256 update function
*
* The function takes a pointer to the current hash values, a pointer to the
* input data, and a number of 64 byte blocks to process. Once all blocks have
* been processed, the digest pointer is updated with the resulting hash value.
* The function only processes complete blocks, there is no functionality to
* store partial blocks. All message padding and hash value initialization must
* be done outside the update function.
*
* The indented lines in the loop are instructions related to rounds processing.
* The non-indented lines are instructions related to the message schedule.
*
* Author: Sean Gulley <sean.m.gulley@intel.com>
* Date: July 2013
*
********************************************************************************
*
* Example compiler command line:
* icc intel_sha_extensions_sha256_intrinsic.c
* gcc -msha -msse4 intel_sha_extensions_sha256_intrinsic.c
*
*******************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <immintrin.h>
#include <crypto/aesni/sha_sse.h>
void intel_sha256_step(uint32_t *digest, const char *data, uint32_t num_blks) {
__m128i state0, state1;
__m128i msg;
__m128i msgtmp0, msgtmp1, msgtmp2, msgtmp3;
__m128i tmp;
__m128i shuf_mask;
__m128i abef_save, cdgh_save;
// Load initial hash values
// Need to reorder these appropriately
// DCBA, HGFE -> ABEF, CDGH
tmp = _mm_loadu_si128((__m128i*) digest);
state1 = _mm_loadu_si128((__m128i*) (digest+4));
tmp = _mm_shuffle_epi32(tmp, 0xB1); // CDAB
state1 = _mm_shuffle_epi32(state1, 0x1B); // EFGH
state0 = _mm_alignr_epi8(tmp, state1, 8); // ABEF
state1 = _mm_blend_epi16(state1, tmp, 0xF0); // CDGH
shuf_mask = _mm_set_epi64x(0x0c0d0e0f08090a0bull, 0x0405060700010203ull);
while (num_blks > 0) {
// Save hash values for addition after rounds
abef_save = state0;
cdgh_save = state1;
// Rounds 0-3
msg = _mm_loadu_si128((const __m128i*) data);
msgtmp0 = _mm_shuffle_epi8(msg, shuf_mask);
msg = _mm_add_epi32(msgtmp0,
_mm_set_epi64x(0xE9B5DBA5B5C0FBCFull, 0x71374491428A2F98ull));
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
msg = _mm_shuffle_epi32(msg, 0x0E);
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
// Rounds 4-7
msgtmp1 = _mm_loadu_si128((const __m128i*) (data+16));
msgtmp1 = _mm_shuffle_epi8(msgtmp1, shuf_mask);
msg = _mm_add_epi32(msgtmp1,
_mm_set_epi64x(0xAB1C5ED5923F82A4ull, 0x59F111F13956C25Bull));
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
msg = _mm_shuffle_epi32(msg, 0x0E);
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1);
// Rounds 8-11
msgtmp2 = _mm_loadu_si128((const __m128i*) (data+32));
msgtmp2 = _mm_shuffle_epi8(msgtmp2, shuf_mask);
msg = _mm_add_epi32(msgtmp2,
_mm_set_epi64x(0x550C7DC3243185BEull, 0x12835B01D807AA98ull));
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
msg = _mm_shuffle_epi32(msg, 0x0E);
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2);
// Rounds 12-15
msgtmp3 = _mm_loadu_si128((const __m128i*) (data+48));
msgtmp3 = _mm_shuffle_epi8(msgtmp3, shuf_mask);
msg = _mm_add_epi32(msgtmp3,
_mm_set_epi64x(0xC19BF1749BDC06A7ull, 0x80DEB1FE72BE5D74ull));
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4);
msgtmp0 = _mm_add_epi32(msgtmp0, tmp);
msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3);
msg = _mm_shuffle_epi32(msg, 0x0E);
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3);
// Rounds 16-19
msg = _mm_add_epi32(msgtmp0,
_mm_set_epi64x(0x240CA1CC0FC19DC6ull, 0xEFBE4786E49B69C1ull));
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4);
msgtmp1 = _mm_add_epi32(msgtmp1, tmp);
msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0);
msg = _mm_shuffle_epi32(msg, 0x0E);
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0);
// Rounds 20-23
msg = _mm_add_epi32(msgtmp1,
_mm_set_epi64x(0x76F988DA5CB0A9DCull, 0x4A7484AA2DE92C6Full));
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4);
msgtmp2 = _mm_add_epi32(msgtmp2, tmp);
msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1);
msg = _mm_shuffle_epi32(msg, 0x0E);
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1);
// Rounds 24-27
msg = _mm_add_epi32(msgtmp2,
_mm_set_epi64x(0xBF597FC7B00327C8ull, 0xA831C66D983E5152ull));
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4);
msgtmp3 = _mm_add_epi32(msgtmp3, tmp);
msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2);
msg = _mm_shuffle_epi32(msg, 0x0E);
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2);
// Rounds 28-31
msg = _mm_add_epi32(msgtmp3,
_mm_set_epi64x(0x1429296706CA6351ull, 0xD5A79147C6E00BF3ull));
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4);
msgtmp0 = _mm_add_epi32(msgtmp0, tmp);
msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3);
msg = _mm_shuffle_epi32(msg, 0x0E);
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3);
// Rounds 32-35
msg = _mm_add_epi32(msgtmp0,
_mm_set_epi64x(0x53380D134D2C6DFCull, 0x2E1B213827B70A85ull));
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4);
msgtmp1 = _mm_add_epi32(msgtmp1, tmp);
msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0);
msg = _mm_shuffle_epi32(msg, 0x0E);
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0);
// Rounds 36-39
msg = _mm_add_epi32(msgtmp1,
_mm_set_epi64x(0x92722C8581C2C92Eull, 0x766A0ABB650A7354ull));
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4);
msgtmp2 = _mm_add_epi32(msgtmp2, tmp);
msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1);
msg = _mm_shuffle_epi32(msg, 0x0E);
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1);
// Rounds 40-43
msg = _mm_add_epi32(msgtmp2,
_mm_set_epi64x(0xC76C51A3C24B8B70ull, 0xA81A664BA2BFE8A1ull));
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4);
msgtmp3 = _mm_add_epi32(msgtmp3, tmp);
msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2);
msg = _mm_shuffle_epi32(msg, 0x0E);
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2);
// Rounds 44-47
msg = _mm_add_epi32(msgtmp3,
_mm_set_epi64x(0x106AA070F40E3585ull, 0xD6990624D192E819ull));
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4);
msgtmp0 = _mm_add_epi32(msgtmp0, tmp);
msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3);
msg = _mm_shuffle_epi32(msg, 0x0E);
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3);
// Rounds 48-51
msg = _mm_add_epi32(msgtmp0,
_mm_set_epi64x(0x34B0BCB52748774Cull, 0x1E376C0819A4C116ull));
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4);
msgtmp1 = _mm_add_epi32(msgtmp1, tmp);
msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0);
msg = _mm_shuffle_epi32(msg, 0x0E);
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0);
// Rounds 52-55
msg = _mm_add_epi32(msgtmp1,
_mm_set_epi64x(0x682E6FF35B9CCA4Full, 0x4ED8AA4A391C0CB3ull));
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4);
msgtmp2 = _mm_add_epi32(msgtmp2, tmp);
msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1);
msg = _mm_shuffle_epi32(msg, 0x0E);
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
// Rounds 56-59
msg = _mm_add_epi32(msgtmp2,
_mm_set_epi64x(0x8CC7020884C87814ull, 0x78A5636F748F82EEull));
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4);
msgtmp3 = _mm_add_epi32(msgtmp3, tmp);
msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2);
msg = _mm_shuffle_epi32(msg, 0x0E);
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
// Rounds 60-63
msg = _mm_add_epi32(msgtmp3,
_mm_set_epi64x(0xC67178F2BEF9A3F7ull, 0xA4506CEB90BEFFFAull));
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
msg = _mm_shuffle_epi32(msg, 0x0E);
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
// Add current hash values with previously saved
state0 = _mm_add_epi32(state0, abef_save);
state1 = _mm_add_epi32(state1, cdgh_save);
data += 64;
num_blks--;
}
// Write hash values back in the correct order
tmp = _mm_shuffle_epi32(state0, 0x1B); // FEBA
state1 = _mm_shuffle_epi32(state1, 0xB1); // DCHG
state0 = _mm_blend_epi16(tmp, state1, 0xF0); // DCBA
state1 = _mm_alignr_epi8(state1, tmp, 8); // ABEF
_mm_store_si128((__m128i*) digest, state0);
_mm_store_si128((__m128i*) (digest+4), state1);
}


@ -0,0 +1,38 @@
/*-
* Copyright (c) 2017 Conrad Meyer <cem@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _CRYPTO__SHA_SSE_H_
#define _CRYPTO__SHA_SSE_H_
/*
* Internal functions, implemented in intrinsics.
*/
void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks);
void intel_sha256_step(uint32_t *digest, const char *data, uint32_t num_blks);
#endif /* _CRYPTO__SHA_SSE_H_ */


@ -1,6 +1,7 @@
# $FreeBSD$
.PATH: ${SRCTOP}/sys/crypto/aesni
.PATH: ${SRCTOP}/contrib/llvm/tools/clang/lib/Headers
KMOD= aesni
SRCS= aesni.c
@ -8,6 +9,7 @@ SRCS+= aeskeys_${MACHINE_CPUARCH}.S
SRCS+= device_if.h bus_if.h opt_bus.h cryptodev_if.h
OBJS+= aesni_ghash.o aesni_wrap.o
OBJS+= intel_sha1.o intel_sha256.o
# Remove -nostdinc so we can get the intrinsics.
aesni_ghash.o: aesni_ghash.c
@ -21,8 +23,20 @@ aesni_wrap.o: aesni_wrap.c
-mmmx -msse -msse4 -maes ${.IMPSRC}
${CTFCONVERT_CMD}
intel_sha1.o: intel_sha1.c
${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} \
-mmmx -msse -msse4 -msha ${.IMPSRC}
${CTFCONVERT_CMD}
intel_sha256.o: intel_sha256.c
${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} \
-mmmx -msse -msse4 -msha ${.IMPSRC}
${CTFCONVERT_CMD}
aesni_ghash.o: aesni.h
aesni_wrap.o: aesni.h
intel_sha1.o: sha_sse.h immintrin.h shaintrin.h tmmintrin.h xmmintrin.h
intel_sha256.o: sha_sse.h immintrin.h shaintrin.h tmmintrin.h xmmintrin.h
.include <bsd.kmod.mk>


@ -47,7 +47,7 @@ def katg(base, glob):
aesmodules = [ 'cryptosoft0', 'aesni0', 'ccr0' ]
desmodules = [ 'cryptosoft0', ]
shamodules = [ 'cryptosoft0', 'ccr0' ]
shamodules = [ 'cryptosoft0', 'aesni0', 'ccr0' ]
def GenTestCase(cname):
try: