Unroll the loop slightly. This improves performance enough to
justify the change, especially for CBC mode, where we can't
pipeline. I don't happen to have my measurements handy, though.

Sponsored by:	Netflix, Inc.
This commit is contained in:
John-Mark Gurney 2015-07-07 20:31:09 +00:00
parent 754f368cda
commit a13589bc47
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=285254

View File

@ -1,5 +1,6 @@
/*-
* Copyright 2013 John-Mark Gurney <jmg@FreeBSD.org>
* Copyright 2015 Netflix, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -27,6 +28,9 @@
*
*/
#ifndef _AESENCDEC_H_
#define _AESENCDEC_H_
#include <crypto/aesni/aesni_os.h>
#include <wmmintrin.h>
@ -105,6 +109,7 @@ aesni_dec8(int rounds, const __m128i *keysched, __m128i a,
out[7] = _mm_aesdeclast_si128(h, keysched[i + 1]);
}
/* rounds is passed in as rounds - 1 */
/*
 * Encrypt one 128-bit block with the expanded key in keysched.
 *
 * rounds is passed in as the AES round count minus one (see comment
 * above), i.e. 9/11/13 for AES-128/192/256, so it is always odd.
 * keysched[0] is the initial whitening key; keysched[1..rounds] feed
 * AESENC and keysched[rounds + 1] feeds the final AESENCLAST.
 *
 * The loop is unrolled two rounds per iteration; because rounds is
 * odd, the loop covers keysched[1..rounds - 1] and the remaining
 * middle round is done explicitly before the last round.
 */
static inline __m128i
aesni_enc(int rounds, const __m128i *keysched, const __m128i from)
{
	__m128i tmp;
	int i;

	tmp = from ^ keysched[0];
	/* rounds - 1 is even: process two rounds per iteration. */
	for (i = 1; i < rounds; i += 2) {
		tmp = _mm_aesenc_si128(tmp, keysched[i]);
		tmp = _mm_aesenc_si128(tmp, keysched[i + 1]);
	}
	tmp = _mm_aesenc_si128(tmp, keysched[rounds]);
	return (_mm_aesenclast_si128(tmp, keysched[rounds + 1]));
}
/*
 * Decrypt one 128-bit block with the expanded decryption key in
 * keysched.
 *
 * As with aesni_enc(), rounds is the AES round count minus one
 * (9/11/13, always odd); keysched[0] is the initial whitening key,
 * keysched[1..rounds] feed AESDEC, and keysched[rounds + 1] feeds the
 * final AESDECLAST.  The loop is unrolled two rounds per iteration,
 * with the odd middle round handled explicitly before the last round.
 */
static inline __m128i
aesni_dec(int rounds, const __m128i *keysched, const __m128i from)
{
	__m128i tmp;
	int i;

	tmp = from ^ keysched[0];
	/* rounds - 1 is even: process two rounds per iteration. */
	for (i = 1; i < rounds; i += 2) {
		tmp = _mm_aesdec_si128(tmp, keysched[i]);
		tmp = _mm_aesdec_si128(tmp, keysched[i + 1]);
	}
	tmp = _mm_aesdec_si128(tmp, keysched[rounds]);
	return (_mm_aesdeclast_si128(tmp, keysched[rounds + 1]));
}
#endif /* _AESENCDEC_H_ */