800be1b6f9
the compiler in svn r242182: #if __STDC_HOSTED__ #include <mm_malloc.h> #endif A similar change was done to clang in the FreeBSD tree in svn r218893. However, for external gcc toolchains, this patch is not in the compiler's header file. This patch to FreeBSD's aesni code allows compilation with an external gcc toolchain. Differential Revision: https://reviews.freebsd.org/D2285 Reviewed by: jmg, dim Approved by: dim
805 lines
26 KiB
C
805 lines
26 KiB
C
/*-
|
|
* Copyright (c) 2014 The FreeBSD Foundation
|
|
* All rights reserved.
|
|
*
|
|
* This software was developed by John-Mark Gurney under
|
|
* the sponsorship of the FreeBSD Foundation and
|
|
* Rubicon Communications, LLC (Netgate).
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*
|
|
*
|
|
* $FreeBSD$
|
|
*
|
|
*/
|
|
|
|
/*
|
|
* Figure 5, 8 and 12 are copied from the Intel white paper:
|
|
* Intel® Carry-Less Multiplication Instruction and its Usage for
|
|
* Computing the GCM Mode
|
|
*
|
|
* and as such are:
|
|
* Copyright © 2010 Intel Corporation.
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of Intel Corporation nor the
|
|
* names of its contributors may be used to endorse or promote products
|
|
* derived from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#ifdef _KERNEL
|
|
#include <crypto/aesni/aesni.h>
|
|
#include <crypto/aesni/aesni_os.h>
|
|
#else
|
|
#include <stdint.h>
|
|
#endif
|
|
|
|
#include <wmmintrin.h>
|
|
#include <emmintrin.h>
|
|
#include <smmintrin.h>
|
|
|
|
/*
 * Report whether two 128-bit vectors hold identical bits.
 *
 * Returns 1 when a and b are equal in all sixteen bytes and 0 otherwise.
 * The answer is derived from a lane-wise compare followed by a byte mask,
 * so the source contains no per-byte loop or early exit.
 */
static inline int
m128icmp(__m128i a, __m128i b)
{
	const __m128i lanes_equal = _mm_cmpeq_epi32(a, b);
	const int byte_mask = _mm_movemask_epi8(lanes_equal);

	/* One mask bit per byte: all sixteen set means all four lanes matched. */
	return (byte_mask == 0xffff);
}
|
|
|
|
#ifdef __i386__
/*
 * Fallback for 32-bit builds: store the 64-bit value b into one half of a,
 * using two 32-bit lane inserts.
 *
 * ndx == 0 selects the low half (32-bit lanes 0 and 1); any other value
 * selects the high half (lanes 2 and 3).  The untouched half of a is
 * returned unchanged.
 */
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{
	const int lo32 = (int)b;
	const int hi32 = (int)(b >> 32);

	/* The lane numbers must stay literal: they are instruction immediates. */
	if (ndx)
		return _mm_insert_epi32(_mm_insert_epi32(a, lo32, 2), hi32, 3);

	return _mm_insert_epi32(_mm_insert_epi32(a, lo32, 0), hi32, 1);
}
#endif
|
|
|
|
/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */
|
|
|
|
/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
/*
 * Multiply a by b as GHASH field elements and store the 128-bit product in
 * *res.  Both operands are taken by value, so res may point at the variable
 * that was passed as a or b.
 *
 * The steps are: a 128x128 carry-less multiply built from four 64x64
 * products, a one-bit left shift of the 256-bit result, and a two-phase
 * fold of the low half into the high half.
 */
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
	__m128i lo, hi, mid;
	__m128i carry_lo, carry_hi, carry_cross;
	__m128i fold, fold_spill, tail;

	/* Four partial products: low*low, the two cross terms, high*high. */
	lo = _mm_clmulepi64_si128(a, b, 0x00);
	mid = _mm_xor_si128(_mm_clmulepi64_si128(a, b, 0x10),
	    _mm_clmulepi64_si128(a, b, 0x01));
	hi = _mm_clmulepi64_si128(a, b, 0x11);

	/* The cross term straddles the middle of the 256-bit product. */
	lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
	hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));

	/*
	 * Shift [hi:lo] left by one bit.  The per-lane shifts drop each lane's
	 * top bit, so those bits are collected first and re-inserted one lane
	 * up, including the bit that crosses from lo into hi.
	 */
	carry_lo = _mm_srli_epi32(lo, 31);
	carry_hi = _mm_srli_epi32(hi, 31);
	lo = _mm_slli_epi32(lo, 1);
	hi = _mm_slli_epi32(hi, 1);

	carry_cross = _mm_srli_si128(carry_lo, 12);
	carry_hi = _mm_slli_si128(carry_hi, 4);
	carry_lo = _mm_slli_si128(carry_lo, 4);
	lo = _mm_or_si128(lo, carry_lo);
	hi = _mm_or_si128(hi, carry_hi);
	hi = _mm_or_si128(hi, carry_cross);

	/* Reduction, first phase: left shifts by 31, 30 and 25. */
	fold = _mm_xor_si128(_mm_slli_epi32(lo, 31), _mm_slli_epi32(lo, 30));
	fold = _mm_xor_si128(fold, _mm_slli_epi32(lo, 25));
	fold_spill = _mm_srli_si128(fold, 4);
	fold = _mm_slli_si128(fold, 12);
	lo = _mm_xor_si128(lo, fold);

	/* Reduction, second phase: right shifts by 1, 2 and 7. */
	tail = _mm_xor_si128(_mm_srli_epi32(lo, 1), _mm_srli_epi32(lo, 2));
	tail = _mm_xor_si128(tail, _mm_srli_epi32(lo, 7));
	tail = _mm_xor_si128(tail, fold_spill);
	lo = _mm_xor_si128(lo, tail);

	*res = _mm_xor_si128(hi, lo);
}
|
|
|
|
/*
 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 * Method
 */
/*
 * Compute H1*X1 ^ H2*X2 ^ H3*X3 ^ H4*X4 and store it in *res.
 *
 * The four carry-less products are accumulated unreduced (each one formed
 * from three 64x64 multiplies), and the one-bit shift plus two-phase fold
 * is then applied a single time to the 256-bit sum.
 */
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
	/*algorithm by Krzysztof Jankowski, Pierre Laurent - Intel*/
	__m128i lo, hi, mid;
	__m128i h1_sum, h2_sum, h3_sum, h4_sum;
	__m128i x1_sum, x2_sum, x3_sum, x4_sum;
	__m128i carry_lo, carry_hi, carry_cross;
	__m128i fold, fold_spill, tail;

	/* Accumulated low-half products. */
	lo = _mm_clmulepi64_si128(H1, X1, 0x00);
	lo = _mm_xor_si128(lo, _mm_clmulepi64_si128(H2, X2, 0x00));
	lo = _mm_xor_si128(lo, _mm_clmulepi64_si128(H3, X3, 0x00));
	lo = _mm_xor_si128(lo, _mm_clmulepi64_si128(H4, X4, 0x00));

	/* Accumulated high-half products. */
	hi = _mm_clmulepi64_si128(H1, X1, 0x11);
	hi = _mm_xor_si128(hi, _mm_clmulepi64_si128(H2, X2, 0x11));
	hi = _mm_xor_si128(hi, _mm_clmulepi64_si128(H3, X3, 0x11));
	hi = _mm_xor_si128(hi, _mm_clmulepi64_si128(H4, X4, 0x11));

	/*
	 * Shuffle control 78 swaps the two 64-bit halves, so after the XOR
	 * each vector holds (high ^ low) of the corresponding operand.
	 */
	h1_sum = _mm_xor_si128(_mm_shuffle_epi32(H1, 78), H1);
	x1_sum = _mm_xor_si128(_mm_shuffle_epi32(X1, 78), X1);
	h2_sum = _mm_xor_si128(_mm_shuffle_epi32(H2, 78), H2);
	x2_sum = _mm_xor_si128(_mm_shuffle_epi32(X2, 78), X2);
	h3_sum = _mm_xor_si128(_mm_shuffle_epi32(H3, 78), H3);
	x3_sum = _mm_xor_si128(_mm_shuffle_epi32(X3, 78), X3);
	h4_sum = _mm_xor_si128(_mm_shuffle_epi32(H4, 78), H4);
	x4_sum = _mm_xor_si128(_mm_shuffle_epi32(X4, 78), X4);

	/* Middle term: products of the half-sums, minus (xor) lo and hi. */
	mid = _mm_clmulepi64_si128(h1_sum, x1_sum, 0x00);
	mid = _mm_xor_si128(mid, lo);
	mid = _mm_xor_si128(mid, hi);
	mid = _mm_xor_si128(mid, _mm_clmulepi64_si128(h2_sum, x2_sum, 0x00));
	mid = _mm_xor_si128(mid, _mm_clmulepi64_si128(h3_sum, x3_sum, 0x00));
	mid = _mm_xor_si128(mid, _mm_clmulepi64_si128(h4_sum, x4_sum, 0x00));

	/* The middle term straddles the two halves of the 256-bit sum. */
	lo = _mm_xor_si128(_mm_slli_si128(mid, 8), lo);
	hi = _mm_xor_si128(_mm_srli_si128(mid, 8), hi);

	/* Shift [hi:lo] left by one bit, carrying lane top bits upward. */
	carry_lo = _mm_srli_epi32(lo, 31);
	carry_hi = _mm_srli_epi32(hi, 31);
	lo = _mm_slli_epi32(lo, 1);
	hi = _mm_slli_epi32(hi, 1);

	carry_cross = _mm_srli_si128(carry_lo, 12);
	carry_hi = _mm_slli_si128(carry_hi, 4);
	carry_lo = _mm_slli_si128(carry_lo, 4);
	lo = _mm_or_si128(lo, carry_lo);
	hi = _mm_or_si128(hi, carry_hi);
	hi = _mm_or_si128(hi, carry_cross);

	/* Reduction, first phase: left shifts by 31, 30 and 25. */
	fold = _mm_xor_si128(_mm_slli_epi32(lo, 31), _mm_slli_epi32(lo, 30));
	fold = _mm_xor_si128(fold, _mm_slli_epi32(lo, 25));
	fold_spill = _mm_srli_si128(fold, 4);
	fold = _mm_slli_si128(fold, 12);
	lo = _mm_xor_si128(lo, fold);

	/* Reduction, second phase: right shifts by 1, 2 and 7. */
	tail = _mm_xor_si128(_mm_srli_epi32(lo, 1), _mm_srli_epi32(lo, 2));
	tail = _mm_xor_si128(tail, _mm_srli_epi32(lo, 7));
	tail = _mm_xor_si128(tail, fold_spill);
	lo = _mm_xor_si128(lo, tail);

	*res = _mm_xor_si128(hi, lo);
}
|
|
|
|
/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Every Four Blocks
 */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), or
 * 2^36-32 bytes.  nbytes is a 32-bit quantity, so every value it can hold
 * is below that bound.
 */
/*
 * Encrypt nbytes of in into out and write the 16-byte authentication tag
 * to tag.
 *
 *   in, out  source and destination buffers, nbytes each
 *   addt     additional authenticated data, abytes long
 *   ivec     initialization vector, ibytes long; a 12-byte IV is used
 *            directly with a 32-bit counter of 1 appended, any other
 *            length is run through the hash first
 *   tag      receives 16 bytes
 *   key      expanded key schedule; round keys 0..nr are read through a
 *            __m128i pointer, so this presumably has to be 16-byte
 *            aligned -- confirm against the callers
 *   nr       number of AES rounds; the loops that step two rounds at a
 *            time assume it is even
 *
 * Only the bytes described by nbytes, abytes and ibytes are read: partial
 * trailing blocks and the 12-byte IV are assembled byte by byte instead of
 * being fetched with a full 16-byte load.
 *
 * NOTE(review): the counters are advanced with 64-bit lane adds, so an
 * overflow of the low 32 bits would carry into the neighbouring bytes
 * rather than wrap.  That looks reachable only when the IV is not 12 bytes
 * long -- confirm against the inc32 definition in SP 800-38D.
 */
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
	const unsigned char *addt, const unsigned char *ivec,
	unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
	const unsigned char *key, int nr)
{
	int i, j, k;
	uint32_t tail;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i iv_block = _mm_setzero_si128();
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		/*
		 * Copy exactly 12 bytes: a 16-byte load would read four bytes
		 * beyond the end of the IV.
		 */
		for (j = 0; j < 96/8; j++)
			((unsigned char *)&iv_block)[j] = ivec[j];
		Y = _mm_insert_epi32(iv_block, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		/* H = E(K, 0) */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		/* Hash the IV: whole blocks, zero-padded tail, then its length. */
		for (i = 0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j = 0; j < ibytes%16; j++)
				((unsigned char *)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	/* Powers of H for the four-block aggregated hash. */
	gfmul(H, H, &H2);
	gfmul(H, H2, &H3);
	gfmul(H, H3, &H4);

	/* Hash the additional data: four blocks at a time, then singles. */
	for (i = 0; i < abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i < abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		/* Offset kept unsigned: i*16 as an int can overflow. */
		tail = abytes - abytes%16;
		last_block = _mm_setzero_si128();
		for (j = 0; j < abytes%16; j++)
			((unsigned char *)&last_block)[j] = addt[tail+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* Eight consecutive counter blocks, kept in byte-swapped form. */
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	/* Main loop: encrypt and hash eight blocks per iteration. */
	for (i = 0; i < nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j = 1; j < nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i *)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i *)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i *)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i *)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i *)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i *)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i *)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i *)out)[i*8+7], tmp8);

		/* The hash runs over the ciphertext that was just produced. */
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}
	/* Remaining whole blocks, one at a time. */
	for (k = i*8; k < nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i *)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	/* If one incomplete block remains */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);

		/*
		 * Gather only the bytes that exist; a 16-byte load here would
		 * read past the end of the input buffer.
		 */
		tail = nbytes - nbytes%16;
		last_block = _mm_setzero_si128();
		for (j = 0; j < nbytes%16; j++)
			((unsigned char *)&last_block)[j] = in[tail+j];
		last_block = _mm_xor_si128(last_block, tmp1);
		for (j = 0; j < nbytes%16; j++)
			out[tail+j] = ((unsigned char *)&last_block)[j];
		/* Clear the leftover keystream bytes before hashing the block. */
		for ((void)j; j < 16; j++)
			((unsigned char *)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	/* Final hash block: the bit lengths of the data and of the AAD. */
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X, H, &X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i *)tag, T);
}
|
|
|
|
/* My modification of _encrypt to be _decrypt */
/*
 * Verify the tag over addt and in, then decrypt nbytes of in into out.
 *
 * Returns 1 when the computed tag matches the 16 bytes at tag and the data
 * has been decrypted, 0 when the tags differ.  The tag is checked before
 * any byte is written, so on a mismatch out is left untouched.
 *
 *   in, out  source and destination buffers, nbytes each
 *   addt     additional authenticated data, abytes long
 *   ivec     initialization vector, ibytes long; a 12-byte IV is used
 *            directly with a 32-bit counter of 1 appended, any other
 *            length is run through the hash first
 *   tag      the 16-byte tag to verify; it is only read
 *   key      expanded key schedule; round keys 0..nr are read through a
 *            __m128i pointer, so this presumably has to be 16-byte
 *            aligned -- confirm against the callers
 *   nr       number of AES rounds; the loops that step two rounds at a
 *            time assume it is even
 *
 * Only the bytes described by nbytes, abytes and ibytes are read: partial
 * trailing blocks and the 12-byte IV are assembled byte by byte instead of
 * being fetched with a full 16-byte load.
 *
 * NOTE(review): the counters are advanced with 64-bit lane adds, so an
 * overflow of the low 32 bits would carry into the neighbouring bytes
 * rather than wrap.  That looks reachable only when the IV is not 12 bytes
 * long -- confirm against the inc32 definition in SP 800-38D.
 */
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
	const unsigned char *addt, const unsigned char *ivec,
	unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
	const unsigned char *key, int nr)
{
	int i, j, k;
	uint32_t tail;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i iv_block = _mm_setzero_si128();
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		/*
		 * Copy exactly 12 bytes: a 16-byte load would read four bytes
		 * beyond the end of the IV.
		 */
		for (j = 0; j < 96/8; j++)
			((unsigned char *)&iv_block)[j] = ivec[j];
		Y = _mm_insert_epi32(iv_block, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		/* H = E(K, 0) */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		/* Hash the IV: whole blocks, zero-padded tail, then its length. */
		for (i = 0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j = 0; j < ibytes%16; j++)
				((unsigned char *)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	/* Powers of H for the four-block aggregated hash. */
	gfmul(H, H, &H2);
	gfmul(H, H2, &H3);
	gfmul(H, H3, &H4);

	/* Hash the additional data: four blocks at a time, then singles. */
	for (i = 0; i < abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i < abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		/* Offset kept unsigned: i*16 as an int can overflow. */
		tail = abytes - abytes%16;
		last_block = _mm_setzero_si128();
		for (j = 0; j < abytes%16; j++)
			((unsigned char *)&last_block)[j] = addt[tail+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* This is where we validate the cipher text before decrypt */
	for (i = 0; i < nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i < nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (nbytes%16) {
		tail = nbytes - nbytes%16;
		last_block = _mm_setzero_si128();
		for (j = 0; j < nbytes%16; j++)
			((unsigned char *)&last_block)[j] = in[tail+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* Final hash block: the bit lengths of the data and of the AAD. */
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X, H, &X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);

	if (!m128icmp(T, _mm_loadu_si128((const __m128i *)tag)))
		return 0; /* authentication failed; nothing has been written */

	/* Eight consecutive counter blocks, kept in byte-swapped form. */
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	/* Main loop: decrypt eight blocks per iteration. */
	for (i = 0; i < nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j = 1; j < nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		/*
		 * The hash was already computed over the ciphertext above, so
		 * the plaintext blocks are simply stored.
		 */
		_mm_storeu_si128(&((__m128i *)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i *)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i *)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i *)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i *)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i *)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i *)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i *)out)[i*8+7], tmp8);
	}
	/* Remaining whole blocks, one at a time. */
	for (k = i*8; k < nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i *)out)[k], tmp1);
	}
	/* If one incomplete block remains */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);

		/*
		 * Gather only the bytes that exist; a 16-byte load here would
		 * read past the end of the input buffer.
		 */
		tail = nbytes - nbytes%16;
		last_block = _mm_setzero_si128();
		for (j = 0; j < nbytes%16; j++)
			((unsigned char *)&last_block)[j] = in[tail+j];
		last_block = _mm_xor_si128(last_block, tmp1);
		for (j = 0; j < nbytes%16; j++)
			out[tail+j] = ((unsigned char *)&last_block)[j];
	}
	return 1; /* returns 1 when successful */
}
|