freebsd-nq/sys/crypto/aesni/aesni_ghash.c
John-Mark Gurney 08fca7a56b Add some new modes to OpenCrypto. These modes are AES-ICM (can be used
for counter mode), and AES-GCM.  Both of these modes have been added to
the aesni module.

Included is a set of tests to validate that the software and aesni
module calculate the correct values.  These use the NIST KAT test
vectors.  To run the test, you will need to install a soon to be
committed port, nist-kat that will install the vectors.  Using a port
is necessary as the test vectors are around 25MB.

All the man pages were updated.  I have added a new man page, crypto.7,
which includes a description of how to use each mode.  All the new modes
and some other AES modes are present.  It would be good for someone
else to go through and document the other modes.

A new ioctl was added to support AEAD modes, of which AES-GCM is one.
Without this ioctl, it is not possible to test AEAD modes from userland.

Add a timing safe bcmp for use to compare MACs.  Previously we were using
bcmp which could leak timing info and result in the ability to forge
messages.

Add a minor optimization to the aesni module so that single segment
mbufs don't get copied and instead are updated in place.  The aesni
module needs to be updated to support blocked IO so segmented mbufs
don't have to be copied.

We require that the IV be specified for all calls for both GCM and ICM.
This is to ensure proper use of these functions.

Obtained from:	p4: //depot/projects/opencrypto
Relnotes:	yes
Sponsored by:	FreeBSD Foundation
Sponsored by:	NetGate
2014-12-12 19:56:36 +00:00

804 lines
26 KiB
C

/*-
* Copyright (c) 2014 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed by John-Mark Gurney under
* the sponsorship of the FreeBSD Foundation and
* Rubicon Communications, LLC (Netgate).
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*
* $FreeBSD$
*
*/
/*
* Figure 5, 8 and 12 are copied from the Intel white paper:
* Intel® Carry-Less Multiplication Instruction and its Usage for
* Computing the GCM Mode
*
* and as such are:
* Copyright © 2010 Intel Corporation.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#else
#include <stdint.h>
#endif
#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
/*
 * Return 1 when all 128 bits of a and b are identical, 0 otherwise.
 *
 * The lanes are compared with a vector equality test and the resulting
 * byte mask is checked as a whole, so there is no data-dependent early exit.
 */
static inline int
m128icmp(__m128i a, __m128i b)
{
	/* One mask bit per byte; 0xffff means every byte compared equal. */
	const int equal_mask = _mm_movemask_epi8(_mm_cmpeq_epi32(a, b));

	return (equal_mask == 0xffff);
}
#ifdef __i386__
/*
 * Replacement for _mm_insert_epi64 on i386 builds (presumably because the
 * compiler headers only provide it for 64-bit targets -- confirm).
 *
 * Writes the 64-bit value b into one half of a as two 32-bit inserts:
 * ndx == 0 selects the low half (lanes 0 and 1), any other value selects
 * the high half (lanes 2 and 3).  The other half of a is left untouched.
 */
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{
	if (ndx) {
		a = _mm_insert_epi32(a, b, 2);
		a = _mm_insert_epi32(a, b >> 32, 3);
		return a;
	}

	a = _mm_insert_epi32(a, b, 0);
	a = _mm_insert_epi32(a, b >> 32, 1);
	return a;
}
#endif
/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */
/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
/*
 * Carry-less multiply of a and b followed by the reduction from Figure 5
 * of the Intel white paper; the 128-bit result is stored in *res.
 *
 * Callers in this file pass byte-reversed (BSWAP_MASK) blocks and use this
 * routine as the GHASH field multiplication.  res may alias a caller
 * variable that was also passed as a or b, since both are taken by value.
 */
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
__m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
/*
 * Four 64x64 carry-less partial products: the immediate's bit 0 picks the
 * half of a and bit 4 the half of b (0x00 low*low, 0x11 high*high,
 * 0x10 and 0x01 the two cross terms).
 */
tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
/*
 * Add the cross terms and fold them into the middle of the 256-bit
 * product, leaving the low 128 bits in tmp3 and the high 128 bits in tmp6.
 */
tmp4 = _mm_xor_si128(tmp4, tmp5);
tmp5 = _mm_slli_si128(tmp4, 8);
tmp4 = _mm_srli_si128(tmp4, 8);
tmp3 = _mm_xor_si128(tmp3, tmp5);
tmp6 = _mm_xor_si128(tmp6, tmp4);
/*
 * Shift the 256-bit value tmp6:tmp3 left by one bit.  The per-lane shifts
 * lose each lane's top bit, so those bits are collected (tmp7/tmp8), moved
 * up one 32-bit lane and OR-ed back in; tmp9 carries the top bit of tmp3
 * into the bottom of tmp6.
 */
tmp7 = _mm_srli_epi32(tmp3, 31);
tmp8 = _mm_srli_epi32(tmp6, 31);
tmp3 = _mm_slli_epi32(tmp3, 1);
tmp6 = _mm_slli_epi32(tmp6, 1);
tmp9 = _mm_srli_si128(tmp7, 12);
tmp8 = _mm_slli_si128(tmp8, 4);
tmp7 = _mm_slli_si128(tmp7, 4);
tmp3 = _mm_or_si128(tmp3, tmp7);
tmp6 = _mm_or_si128(tmp6, tmp8);
tmp6 = _mm_or_si128(tmp6, tmp9);
/*
 * Reduction, first phase: XOR of tmp3 shifted left by 31, 30 and 25 bits
 * (per lane); the part that falls into the top lane is folded into tmp3 and
 * the remainder is kept in tmp8 for the second phase.
 */
tmp7 = _mm_slli_epi32(tmp3, 31);
tmp8 = _mm_slli_epi32(tmp3, 30);
tmp9 = _mm_slli_epi32(tmp3, 25);
tmp7 = _mm_xor_si128(tmp7, tmp8);
tmp7 = _mm_xor_si128(tmp7, tmp9);
tmp8 = _mm_srli_si128(tmp7, 4);
tmp7 = _mm_slli_si128(tmp7, 12);
tmp3 = _mm_xor_si128(tmp3, tmp7);
/*
 * Reduction, second phase: XOR of tmp3 shifted right by 1, 2 and 7 bits,
 * combined with the bits saved in tmp8, then folded into the high half.
 */
tmp2 = _mm_srli_epi32(tmp3, 1);
tmp4 = _mm_srli_epi32(tmp3, 2);
tmp5 = _mm_srli_epi32(tmp3, 7);
tmp2 = _mm_xor_si128(tmp2, tmp4);
tmp2 = _mm_xor_si128(tmp2, tmp5);
tmp2 = _mm_xor_si128(tmp2, tmp8);
tmp3 = _mm_xor_si128(tmp3, tmp2);
tmp6 = _mm_xor_si128(tmp6, tmp3);
*res = tmp6;
}
/*
* Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
* Method */
/*
 * Compute H1*X1 ^ H2*X2 ^ H3*X3 ^ H4*X4 (carry-less products) and apply a
 * single reduction to the accumulated 256-bit sum, storing the 128-bit
 * result in *res.
 *
 * The callers in this file pass (H, H^2, H^3, H^4) together with four data
 * blocks in reverse order, the oldest block already XOR-ed with the running
 * hash, so one call advances the hash by four blocks.
 */
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
__m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
/*algorithm by Krzysztof Jankowski, Pierre Laurent - Intel*/
__m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
__m128i tmp0, tmp1, tmp2, tmp3;
__m128i tmp4, tmp5, tmp6, tmp7;
__m128i tmp8, tmp9;
/* Low-half products (low 64 bits of H times low 64 bits of X), summed. */
H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);
lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
lo = _mm_xor_si128(lo, H3_X3_lo);
lo = _mm_xor_si128(lo, H4_X4_lo);
/* High-half products (high 64 bits times high 64 bits), summed. */
H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);
hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
hi = _mm_xor_si128(hi, H3_X3_hi);
hi = _mm_xor_si128(hi, H4_X4_hi);
/*
 * Middle terms, Karatsuba style: shuffle control 78 swaps the two 64-bit
 * halves, so after the XOR the low 64 bits of each tmp hold (high ^ low)
 * of the corresponding operand.
 */
tmp0 = _mm_shuffle_epi32(H1, 78);
tmp4 = _mm_shuffle_epi32(X1, 78);
tmp0 = _mm_xor_si128(tmp0, H1);
tmp4 = _mm_xor_si128(tmp4, X1);
tmp1 = _mm_shuffle_epi32(H2, 78);
tmp5 = _mm_shuffle_epi32(X2, 78);
tmp1 = _mm_xor_si128(tmp1, H2);
tmp5 = _mm_xor_si128(tmp5, X2);
tmp2 = _mm_shuffle_epi32(H3, 78);
tmp6 = _mm_shuffle_epi32(X3, 78);
tmp2 = _mm_xor_si128(tmp2, H3);
tmp6 = _mm_xor_si128(tmp6, X3);
tmp3 = _mm_shuffle_epi32(H4, 78);
tmp7 = _mm_shuffle_epi32(X4, 78);
tmp3 = _mm_xor_si128(tmp3, H4);
tmp7 = _mm_xor_si128(tmp7, X4);
tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);
/*
 * Sum the four (hi^lo)*(hi^lo) products and remove lo and hi to obtain the
 * combined cross term, then fold its two halves into lo and hi.
 */
tmp0 = _mm_xor_si128(tmp0, lo);
tmp0 = _mm_xor_si128(tmp0, hi);
tmp0 = _mm_xor_si128(tmp1, tmp0);
tmp0 = _mm_xor_si128(tmp2, tmp0);
tmp0 = _mm_xor_si128(tmp3, tmp0);
tmp4 = _mm_slli_si128(tmp0, 8);
tmp0 = _mm_srli_si128(tmp0, 8);
lo = _mm_xor_si128(tmp4, lo);
hi = _mm_xor_si128(tmp0, hi);
/*
 * From here on the sequence is the same as the tail of gfmul(): shift the
 * 256-bit value hi:lo left by one bit, carrying lane top bits upward...
 */
tmp3 = lo;
tmp6 = hi;
tmp7 = _mm_srli_epi32(tmp3, 31);
tmp8 = _mm_srli_epi32(tmp6, 31);
tmp3 = _mm_slli_epi32(tmp3, 1);
tmp6 = _mm_slli_epi32(tmp6, 1);
tmp9 = _mm_srli_si128(tmp7, 12);
tmp8 = _mm_slli_si128(tmp8, 4);
tmp7 = _mm_slli_si128(tmp7, 4);
tmp3 = _mm_or_si128(tmp3, tmp7);
tmp6 = _mm_or_si128(tmp6, tmp8);
tmp6 = _mm_or_si128(tmp6, tmp9);
/* ...then the two-phase reduction (left shifts 31/30/25, right 1/2/7). */
tmp7 = _mm_slli_epi32(tmp3, 31);
tmp8 = _mm_slli_epi32(tmp3, 30);
tmp9 = _mm_slli_epi32(tmp3, 25);
tmp7 = _mm_xor_si128(tmp7, tmp8);
tmp7 = _mm_xor_si128(tmp7, tmp9);
tmp8 = _mm_srli_si128(tmp7, 4);
tmp7 = _mm_slli_si128(tmp7, 12);
tmp3 = _mm_xor_si128(tmp3, tmp7);
tmp2 = _mm_srli_epi32(tmp3, 1);
tmp4 = _mm_srli_epi32(tmp3, 2);
tmp5 = _mm_srli_epi32(tmp3, 7);
tmp2 = _mm_xor_si128(tmp2, tmp4);
tmp2 = _mm_xor_si128(tmp2, tmp5);
tmp2 = _mm_xor_si128(tmp2, tmp8);
tmp3 = _mm_xor_si128(tmp3, tmp2);
tmp6 = _mm_xor_si128(tmp6, tmp3);
*res = tmp6;
}
/*
* Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
* Every Four Blocks
*/
/*
* per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), or
* 2^36-32 bytes.
*/
/*
 * AES-GCM encryption with tag generation.
 *
 *   in     - plaintext, nbytes bytes
 *   out    - receives nbytes bytes of ciphertext (each block of in is read
 *            before the matching block of out is written)
 *   addt   - additional authenticated data, abytes bytes
 *   ivec   - IV, ibytes bytes; 12 bytes (96 bits) takes the fast path, any
 *            other length is hashed with GHASH to derive the first counter
 *   tag    - receives the 16-byte authentication tag
 *   key    - expanded AES key schedule holding nr + 1 round keys; it is read
 *            through a plain __m128i pointer, so it must be 16-byte aligned
 *   nr     - number of AES rounds; several round loops step two rounds at a
 *            time and therefore assume nr is even
 *
 * No more bytes than the stated lengths are read from in, addt or ivec:
 * short IVs and trailing partial blocks are copied byte-wise into a
 * zero-padded block instead of being fetched with a full 16-byte load.
 */
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
	const unsigned char *addt, const unsigned char *ivec,
	unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
	const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();
	/*
	 * Byte offsets of the trailing partial blocks.  Kept in uint32_t so
	 * the arithmetic cannot overflow the way int "index * 16" would for
	 * lengths of 2 GiB and above.
	 */
	const uint32_t aad_tail = abytes - abytes % 16;
	const uint32_t txt_tail = nbytes - nbytes % 16;

	if (ibytes == 96/8) {
		/*
		 * Y0 = IV || 0x00000001.  Only the 12 IV bytes are copied; a
		 * 16-byte load here would read 4 bytes beyond the IV.
		 */
		Y = _mm_setzero_si128();
		for (j = 0; j < 96/8; j++)
			((unsigned char *)&Y)[j] = ivec[j];
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);
		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		/* H = E(K, 0), byte-reversed for the GHASH arithmetic. */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		/* Y0 = GHASH over the IV, zero padded, plus its bit length. */
		Y = _mm_setzero_si128();
		for (i = 0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			/* last_block is still all zero from its initializer. */
			for (j = 0; j < ibytes%16; j++)
				((unsigned char *)&last_block)[j] =
				    ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);
		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK);
		/* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	/* Powers of H for the four-block aggregated reduction. */
	gfmul(H, H, &H2);
	gfmul(H, H2, &H3);
	gfmul(H, H3, &H4);

	/* Hash the additional data: four blocks at a time, then singly. */
	for (i = 0; i < abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);
		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i < abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j = 0; j < abytes%16; j++)
			((unsigned char *)&last_block)[j] = addt[aad_tail + j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/*
	 * Counter blocks are kept with each 64-bit half byte-swapped so the
	 * block counter can be advanced with an integer add.
	 *
	 * NOTE(review): the add is 64 bits wide, so a wrap of the low 32
	 * counter bits would carry into the upper half of the lane rather
	 * than wrapping as a 32-bit counter.  With a 96-bit IV the counter
	 * starts at 1 and nbytes is 32 bits, so no wrap occurs; for other IV
	 * lengths confirm against the inc32 definition in SP 800-38D.
	 */
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	/* Main loop: encrypt and hash eight blocks per iteration. */
	for (i = 0; i < nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);
		for (j = 1; j < nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));
		_mm_storeu_si128(&((__m128i *)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i *)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i *)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i *)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i *)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i *)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i *)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i *)out)[i*8+7], tmp8);
		/* Fold the eight ciphertext blocks into the hash. */
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);
		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}

	/* Remaining whole blocks, one at a time. */
	for (k = i*8; k < nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i *)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* If one incomplete block remains */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		/*
		 * Gather only the bytes that exist; a 16-byte load would read
		 * past the end of the input buffer.
		 */
		last_block = _mm_setzero_si128();
		for (j = 0; j < nbytes%16; j++)
			((unsigned char *)&last_block)[j] = in[txt_tail + j];
		last_block = _mm_xor_si128(last_block, tmp1);
		for (j = 0; j < nbytes%16; j++)
			out[txt_tail + j] = ((unsigned char *)&last_block)[j];
		/* Clear the keystream bytes beyond the message before hashing. */
		for ((void)j; j < 16; j++)
			((unsigned char *)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* Final GHASH block: bit lengths of the text and of the AAD. */
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);
	X = _mm_xor_si128(X, tmp1);
	gfmul(X, H, &X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	/* tag = GHASH ^ E(K, Y0) */
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i *)tag, T);
}
/* My modification of _encrypt to be _decrypt */
/*
 * AES-GCM decryption with tag verification.
 *
 *   in     - ciphertext, nbytes bytes
 *   out    - receives nbytes bytes of plaintext, but only if the tag matches
 *   addt   - additional authenticated data, abytes bytes
 *   ivec   - IV, ibytes bytes; 12 bytes (96 bits) takes the fast path, any
 *            other length is hashed with GHASH to derive the first counter
 *   tag    - the 16-byte authentication tag to check (read only)
 *   key    - expanded AES key schedule holding nr + 1 round keys; it is read
 *            through a plain __m128i pointer, so it must be 16-byte aligned
 *   nr     - number of AES rounds; several round loops step two rounds at a
 *            time and therefore assume nr is even
 *
 * The whole ciphertext is authenticated first.  Returns 0, without writing
 * anything to out, when the computed tag differs from tag; returns 1 after
 * a successful decryption.
 *
 * No more bytes than the stated lengths are read from in, addt or ivec:
 * short IVs and trailing partial blocks are copied byte-wise into a
 * zero-padded block instead of being fetched with a full 16-byte load.
 */
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
	const unsigned char *addt, const unsigned char *ivec,
	unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
	const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();
	/*
	 * Byte offsets of the trailing partial blocks.  Kept in uint32_t so
	 * the arithmetic cannot overflow the way int "index * 16" would for
	 * lengths of 2 GiB and above.
	 */
	const uint32_t aad_tail = abytes - abytes % 16;
	const uint32_t txt_tail = nbytes - nbytes % 16;

	if (ibytes == 96/8) {
		/*
		 * Y0 = IV || 0x00000001.  Only the 12 IV bytes are copied; a
		 * 16-byte load here would read 4 bytes beyond the IV.
		 */
		Y = _mm_setzero_si128();
		for (j = 0; j < 96/8; j++)
			((unsigned char *)&Y)[j] = ivec[j];
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);
		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		/* H = E(K, 0), byte-reversed for the GHASH arithmetic. */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		/* Y0 = GHASH over the IV, zero padded, plus its bit length. */
		Y = _mm_setzero_si128();
		for (i = 0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			/* last_block is still all zero from its initializer. */
			for (j = 0; j < ibytes%16; j++)
				((unsigned char *)&last_block)[j] =
				    ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);
		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK);
		/* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	/* Powers of H for the four-block aggregated reduction. */
	gfmul(H, H, &H2);
	gfmul(H, H2, &H3);
	gfmul(H, H3, &H4);

	/* Hash the additional data: four blocks at a time, then singly. */
	for (i = 0; i < abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);
		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i < abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j = 0; j < abytes%16; j++)
			((unsigned char *)&last_block)[j] = addt[aad_tail + j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* This is where we validate the cipher text before decrypt */
	for (i = 0; i < nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);
		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i < nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (nbytes%16) {
		last_block = _mm_setzero_si128();
		for (j = 0; j < nbytes%16; j++)
			((unsigned char *)&last_block)[j] = in[txt_tail + j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* Final GHASH block: bit lengths of the text and of the AAD. */
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);
	X = _mm_xor_si128(X, tmp1);
	gfmul(X, H, &X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	/* expected tag = GHASH ^ E(K, Y0) */
	T = _mm_xor_si128(X, T);
	if (!m128icmp(T, _mm_loadu_si128((const __m128i *)tag)))
		return 0;	/* authentication failed; out is untouched */

	/*
	 * Counter blocks are kept with each 64-bit half byte-swapped so the
	 * block counter can be advanced with an integer add.
	 *
	 * NOTE(review): the add is 64 bits wide, so a wrap of the low 32
	 * counter bits would carry into the upper half of the lane rather
	 * than wrapping as a 32-bit counter.  With a 96-bit IV the counter
	 * starts at 1 and nbytes is 32 bits, so no wrap occurs; for other IV
	 * lengths confirm against the inc32 definition in SP 800-38D.
	 */
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	/* Main loop: decrypt eight blocks per iteration. */
	for (i = 0; i < nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);
		for (j = 1; j < nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));
		_mm_storeu_si128(&((__m128i *)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i *)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i *)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i *)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i *)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i *)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i *)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i *)out)[i*8+7], tmp8);
	}

	/* Remaining whole blocks, one at a time. */
	for (k = i*8; k < nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i *)out)[k], tmp1);
	}

	/* If one incomplete block remains */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		/*
		 * Gather only the bytes that exist; a 16-byte load would read
		 * past the end of the input buffer.
		 */
		last_block = _mm_setzero_si128();
		for (j = 0; j < nbytes%16; j++)
			((unsigned char *)&last_block)[j] = in[txt_tail + j];
		last_block = _mm_xor_si128(last_block, tmp1);
		for (j = 0; j < nbytes%16; j++)
			out[txt_tail + j] = ((unsigned char *)&last_block)[j];
	}
	return 1;	/* successful: tag verified and out filled */
}