Build OpenSSL assembly sources for aarch64. Tested with ThunderX by andrew.
parent 726f4773ec
commit 7518a9bd2b
@@ -42,7 +42,7 @@ $code=<<___;
 #if __ARM_MAX_ARCH__>=7
 .text
 ___
-$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
+# $code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
 $code.=".arch	armv7-a\n.fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
 #^^^^^^ this is done to simplify adoption by not depending
 #	on latest binutils.
@@ -1,7 +1,6 @@
 #include "arm_arch.h"
 
 .text
-.arch	armv8-a+crypto
 
 .align	5
 .global	_armv7_neon_probe
@@ -49,7 +49,7 @@ $code=<<___;
 
 .text
 ___
-$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
+# $code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
 $code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
 
 ################################################################################
@@ -22,7 +22,10 @@ MAN+=	config.5 des_modes.7
 # base sources
 SRCS=	cpt_err.c cryptlib.c cversion.c ex_data.c mem.c mem_dbg.c o_dir.c \
 	o_fips.c o_init.c o_str.c o_time.c uid.c
-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+SRCS+=	arm64cpuid.S armcap.c mem_clr.c
+CFLAGS.arm64cpuid.S=	-march=armv8-a+crypto
+.elif defined(ASM_amd64)
 SRCS+=	x86_64cpuid.S
 .elif defined(ASM_arm)
 SRCS+=	armcap.c armv4cpuid.S
@@ -35,7 +38,10 @@ INCS+=	crypto.h ebcdic.h opensslv.h ossl_typ.h symhacks.h ../e_os2.h
 
 # aes
 SRCS+=	aes_cfb.c aes_ctr.c aes_ecb.c aes_ige.c aes_misc.c aes_ofb.c aes_wrap.c
-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+SRCS+=	aes_cbc.c aes_core.c aesv8-armx.S
+CFLAGS.aesv8-armx.S=	-march=armv8-a+crypto
+.elif defined(ASM_amd64)
 SRCS+=	aes-x86_64.S aesni-mb-x86_64.S aesni-sha1-x86_64.S \
 	aesni-sha256-x86_64.S aesni-x86_64.S bsaes-x86_64.S vpaes-x86_64.S
 .elif defined(ASM_arm)
@@ -238,7 +244,10 @@ INCS+=	mdc2.h
 # modes
 SRCS+=	cbc128.c ccm128.c cfb128.c ctr128.c cts128.c gcm128.c ofb128.c \
 	wrap128.c xts128.c
-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+SRCS+=	ghashv8-armx.S
+CFLAGS.ghashv8-armx.S=	-march=armv8-a+crypto
+.elif defined(ASM_amd64)
 SRCS+=	aesni-gcm-x86_64.S ghash-x86_64.S
 .elif defined(ASM_arm)
 SRCS+=	ghash-armv4.S ghashv8-armx.S
@@ -324,7 +333,9 @@ INCS+=	seed.h
 
 # sha
 SRCS+=	sha1_one.c sha1dgst.c sha256.c sha512.c sha_dgst.c sha_one.c
-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+SRCS+=	sha1-armv8.S sha256-armv8.S sha512-armv8.S
+.elif defined(ASM_amd64)
 SRCS+=	sha1-mb-x86_64.S sha1-x86_64.S sha256-mb-x86_64.S sha256-x86_64.S \
 	sha512-x86_64.S
 .elif defined(ASM_arm)
@@ -6,7 +6,44 @@
 
 .include "Makefile.inc"
 
-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+
+.PATH:	${LCRYPTO_SRC}/crypto \
+	${LCRYPTO_SRC}/crypto/aes/asm \
+	${LCRYPTO_SRC}/crypto/modes/asm \
+	${LCRYPTO_SRC}/crypto/sha/asm
+
+PERLPATH=	-I${LCRYPTO_SRC}/crypto/perlasm
+
+# aes
+SRCS=	aesv8-armx.pl
+
+# modes
+SRCS+=	ghashv8-armx.pl
+
+# sha
+SRCS+=	sha1-armv8.pl sha512-armv8.pl
+
+ASM=	${SRCS:R:S/$/.S/} sha256-armv8.S
+
+all:	${ASM}
+
+CLEANFILES=	${ASM} ${SRCS:R:S/$/.s/} sha256-armv8.s
+.SUFFIXES:	.pl
+
+sha256-armv8.S:	sha512-armv8.pl
+	env CC=cc perl ${.ALLSRC} 64 ${.TARGET:R:S/$/.s/}
+	( echo '/* $$'FreeBSD'$$ */' ;\
+	echo '/* Do not modify. This file is auto-generated from ${.ALLSRC:T:R:S/$/.pl/}. */' ;\
+	cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}
+
+.pl.S:
+	env CC=cc perl ${.IMPSRC} 64 ${.TARGET:R:S/$/.s/}
+	( echo '/* $$'FreeBSD'$$ */' ;\
+	echo '/* Do not modify. This file is auto-generated from ${.IMPSRC:T:R:S/$/.pl/}. */' ;\
+	cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}
+
+.elif defined(ASM_amd64)
+
 .PATH:	${LCRYPTO_SRC}/crypto \
 	${LCRYPTO_SRC}/crypto/aes/asm \
@@ -21,7 +21,9 @@ CFLAGS+=-DL_ENDIAN
 CFLAGS+=-DB_ENDIAN
 .endif
 
-.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
+.if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "arm"
+ASM_${MACHINE_CPUARCH}=
+.elif ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
 _ASM_AVX!=	{ \
 	    echo vzeroall | \
 	    ${CC} -x assembler -o /dev/null -c - 2> /dev/null; \
@@ -29,11 +31,11 @@ _ASM_AVX!=	{ \
 .if ${_ASM_AVX} == yes
 ASM_${MACHINE_CPUARCH}=
 .endif
-.elif ${MACHINE_CPUARCH} == "arm"
-ASM_arm=
 .endif
 
-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+CFLAGS+=-DSHA1_ASM -DSHA256_ASM -DSHA512_ASM
+.elif defined(ASM_amd64)
 CFLAGS+=-DOPENSSL_IA32_SSE2
 CFLAGS+=-DAES_ASM -DBSAES_ASM -DVPAES_ASM
 CFLAGS+=-DECP_NISTZ256_ASM
secure/lib/libcrypto/aarch64/aesv8-armx.S (new file, 748 lines)
@@ -0,0 +1,748 @@
/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from aesv8-armx.pl. */
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
.align	5
rcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	aes_v8_set_encrypt_key
.type	aes_v8_set_encrypt_key,%function
.align	5
aes_v8_set_encrypt_key:
.Lenc_key:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x3,#-1
	cmp	x0,#0
	b.eq	.Lenc_key_abort
	cmp	x2,#0
	b.eq	.Lenc_key_abort
	mov	x3,#-2
	cmp	w1,#128
	b.lt	.Lenc_key_abort
	cmp	w1,#256
	b.gt	.Lenc_key_abort
	tst	w1,#0x3f
	b.ne	.Lenc_key_abort

	adr	x3,rcon
	cmp	w1,#192

	eor	v0.16b,v0.16b,v0.16b
	ld1	{v3.16b},[x0],#16
	mov	w1,#8		// reuse w1
	ld1	{v1.4s,v2.4s},[x3],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b
	b.ne	.Loop128

	ld1	{v1.4s},[x3]

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2]
	add	x2,x2,#0x50

	mov	w12,#10
	b	.Ldone

.align	4
.L192:
	ld1	{v4.8b},[x0],#8
	movi	v6.16b,#8		// borrow v6.16b
	st1	{v3.4s},[x2],#16
	sub	v2.16b,v2.16b,v6.16b	// adjust the mask

.Loop192:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v4.8b},[x2],#8
	aese	v6.16b,v0.16b
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b

	dup	v5.4s,v3.s[3]
	eor	v5.16b,v5.16b,v4.16b
	eor	v6.16b,v6.16b,v1.16b
	ext	v4.16b,v0.16b,v4.16b,#12
	shl	v1.16b,v1.16b,#1
	eor	v4.16b,v4.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	eor	v4.16b,v4.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.ne	.Loop192

	mov	w12,#12
	add	x2,x2,#0x20
	b	.Ldone

.align	4
.L256:
	ld1	{v4.16b},[x0]
	mov	w1,#7
	mov	w12,#14
	st1	{v3.4s},[x2],#16

.Loop256:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v4.4s},[x2],#16
	aese	v6.16b,v0.16b
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.eq	.Ldone

	dup	v6.4s,v3.s[3]		// just splat
	ext	v5.16b,v0.16b,v4.16b,#12
	aese	v6.16b,v0.16b

	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b

	eor	v4.16b,v4.16b,v6.16b
	b	.Loop256

.Ldone:
	str	w12,[x2]
	mov	x3,#0

.Lenc_key_abort:
	mov	x0,x3			// return value
	ldr	x29,[sp],#16
	ret
.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key

.globl	aes_v8_set_decrypt_key
.type	aes_v8_set_decrypt_key,%function
.align	5
aes_v8_set_decrypt_key:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	x2,x2,#240		// restore original x2
	mov	x4,#-16
	add	x0,x2,x12,lsl#4		// end of key schedule

	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16

.Loop_imc:
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16
	cmp	x0,x2
	b.hi	.Loop_imc

	ld1	{v0.4s},[x2]
	aesimc	v0.16b,v0.16b
	st1	{v0.4s},[x0]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
	ldp	x29,x30,[sp],#16
	ret
.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
.globl	aes_v8_encrypt
.type	aes_v8_encrypt,%function
.align	5
aes_v8_encrypt:
	ldr	w3,[x2,#240]
	ld1	{v0.4s},[x2],#16
	ld1	{v2.16b},[x0]
	sub	w3,w3,#2
	ld1	{v1.4s},[x2],#16

.Loop_enc:
	aese	v2.16b,v0.16b
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16
	subs	w3,w3,#2
	aese	v2.16b,v1.16b
	aesmc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16
	b.gt	.Loop_enc

	aese	v2.16b,v0.16b
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]
	aese	v2.16b,v1.16b
	eor	v2.16b,v2.16b,v0.16b

	st1	{v2.16b},[x1]
	ret
.size	aes_v8_encrypt,.-aes_v8_encrypt
.globl	aes_v8_decrypt
.type	aes_v8_decrypt,%function
.align	5
aes_v8_decrypt:
	ldr	w3,[x2,#240]
	ld1	{v0.4s},[x2],#16
	ld1	{v2.16b},[x0]
	sub	w3,w3,#2
	ld1	{v1.4s},[x2],#16

.Loop_dec:
	aesd	v2.16b,v0.16b
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16
	subs	w3,w3,#2
	aesd	v2.16b,v1.16b
	aesimc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16
	b.gt	.Loop_dec

	aesd	v2.16b,v0.16b
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]
	aesd	v2.16b,v1.16b
	eor	v2.16b,v2.16b,v0.16b

	st1	{v2.16b},[x1]
	ret
.size	aes_v8_decrypt,.-aes_v8_decrypt
.globl	aes_v8_cbc_encrypt
.type	aes_v8_cbc_encrypt,%function
.align	5
aes_v8_cbc_encrypt:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	subs	x2,x2,#16
	mov	x8,#16
	b.lo	.Lcbc_abort
	csel	x8,xzr,x8,eq

	cmp	w5,#0			// en- or decrypting?
	ldr	w5,[x3,#240]
	and	x2,x2,#-16
	ld1	{v6.16b},[x4]
	ld1	{v0.16b},[x0],x8

	ld1	{v16.4s-v17.4s},[x3]	// load key schedule...
	sub	w5,w5,#6
	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
	sub	w5,w5,#2
	ld1	{v18.4s-v19.4s},[x7],#32
	ld1	{v20.4s-v21.4s},[x7],#32
	ld1	{v22.4s-v23.4s},[x7],#32
	ld1	{v7.4s},[x7]

	add	x7,x3,#32
	mov	w6,w5
	b.eq	.Lcbc_dec

	cmp	w5,#2
	eor	v0.16b,v0.16b,v6.16b
	eor	v5.16b,v16.16b,v7.16b
	b.eq	.Lcbc_enc128

	ld1	{v2.4s-v3.4s},[x7]
	add	x7,x3,#16
	add	x6,x3,#16*4
	add	x12,x3,#16*5
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	add	x14,x3,#16*6
	add	x3,x3,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	st1	{v6.16b},[x1],#16
.Lenter_cbc_enc:
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v2.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x6]
	cmp	w5,#4
	aese	v0.16b,v3.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x12]
	b.eq	.Lcbc_enc192

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x14]
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x3]
	nop

.Lcbc_enc192:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	subs	x2,x2,#16
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	csel	x8,xzr,x8,eq
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.16b},[x0],x8
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	eor	v16.16b,v16.16b,v5.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v23.16b
	eor	v6.16b,v0.16b,v7.16b
	b.hs	.Loop_cbc_enc

	st1	{v6.16b},[x1],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	ld1	{v2.4s-v3.4s},[x7]
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	st1	{v6.16b},[x1],#16
.Lenter_cbc_enc128:
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	subs	x2,x2,#16
	aese	v0.16b,v2.16b
	aesmc	v0.16b,v0.16b
	csel	x8,xzr,x8,eq
	aese	v0.16b,v3.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.16b},[x0],x8
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	eor	v16.16b,v16.16b,v5.16b
	aese	v0.16b,v23.16b
	eor	v6.16b,v0.16b,v7.16b
	b.hs	.Loop_cbc_enc128

	st1	{v6.16b},[x1],#16
	b	.Lcbc_done
.align	5
.Lcbc_dec:
	ld1	{v18.16b},[x0],#16
	subs	x2,x2,#32		// bias
	add	w6,w5,#2
	orr	v3.16b,v0.16b,v0.16b
	orr	v1.16b,v0.16b,v0.16b
	orr	v19.16b,v18.16b,v18.16b
	b.lo	.Lcbc_dec_tail

	orr	v1.16b,v18.16b,v18.16b
	ld1	{v18.16b},[x0],#16
	orr	v2.16b,v0.16b,v0.16b
	orr	v3.16b,v1.16b,v1.16b
	orr	v19.16b,v18.16b,v18.16b

.Loop3x_cbc_dec:
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop3x_cbc_dec

	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	eor	v4.16b,v6.16b,v7.16b
	subs	x2,x2,#0x30
	eor	v5.16b,v2.16b,v7.16b
	csel	x6,x2,x6,lo		// x6, w6, is zero at this point
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	eor	v17.16b,v3.16b,v7.16b
	add	x0,x0,x6		// x0 is adjusted in such way that
					// at exit from the loop v1.16b-v18.16b
					// are loaded with last "words"
	orr	v6.16b,v19.16b,v19.16b
	mov	x7,x3
	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v20.16b
	aesimc	v18.16b,v18.16b
	ld1	{v2.16b},[x0],#16
	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v21.16b
	aesimc	v18.16b,v18.16b
	ld1	{v3.16b},[x0],#16
	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v22.16b
	aesimc	v18.16b,v18.16b
	ld1	{v19.16b},[x0],#16
	aesd	v0.16b,v23.16b
	aesd	v1.16b,v23.16b
	aesd	v18.16b,v23.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	add	w6,w5,#2
	eor	v4.16b,v4.16b,v0.16b
	eor	v5.16b,v5.16b,v1.16b
	eor	v18.16b,v18.16b,v17.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v4.16b},[x1],#16
	orr	v0.16b,v2.16b,v2.16b
	st1	{v5.16b},[x1],#16
	orr	v1.16b,v3.16b,v3.16b
	st1	{v18.16b},[x1],#16
	orr	v18.16b,v19.16b,v19.16b
	b.hs	.Loop3x_cbc_dec

	cmn	x2,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lcbc_dec_tail

	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v20.16b
	aesimc	v18.16b,v18.16b
	cmn	x2,#0x20
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v21.16b
	aesimc	v18.16b,v18.16b
	eor	v5.16b,v6.16b,v7.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v22.16b
	aesimc	v18.16b,v18.16b
	eor	v17.16b,v3.16b,v7.16b
	aesd	v1.16b,v23.16b
	aesd	v18.16b,v23.16b
	b.eq	.Lcbc_dec_one
	eor	v5.16b,v5.16b,v1.16b
	eor	v17.16b,v17.16b,v18.16b
	orr	v6.16b,v19.16b,v19.16b
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	eor	v5.16b,v5.16b,v18.16b
	orr	v6.16b,v19.16b,v19.16b
	st1	{v5.16b},[x1],#16

.Lcbc_done:
	st1	{v6.16b},[x4]
.Lcbc_abort:
	ldr	x29,[sp],#16
	ret
.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
.globl	aes_v8_ctr32_encrypt_blocks
.type	aes_v8_ctr32_encrypt_blocks,%function
.align	5
aes_v8_ctr32_encrypt_blocks:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	ldr	w5,[x3,#240]

	ldr	w8, [x4, #12]
	ld1	{v0.4s},[x4]

	ld1	{v16.4s-v17.4s},[x3]	// load key schedule...
	sub	w5,w5,#4
	mov	x12,#16
	cmp	x2,#2
	add	x7,x3,x5,lsl#4		// pointer to last 5 round keys
	sub	w5,w5,#2
	ld1	{v20.4s-v21.4s},[x7],#32
	ld1	{v22.4s-v23.4s},[x7],#32
	ld1	{v7.4s},[x7]
	add	x7,x3,#32
	mov	w6,w5
	csel	x12,xzr,x12,lo
#ifndef __ARMEB__
	rev	w8, w8
#endif
	orr	v1.16b,v0.16b,v0.16b
	add	w10, w8, #1
	orr	v18.16b,v0.16b,v0.16b
	add	w8, w8, #2
	orr	v6.16b,v0.16b,v0.16b
	rev	w10, w10
	mov	v1.s[3],w10
	b.ls	.Lctr32_tail
	rev	w12, w8
	sub	x2,x2,#3		// bias
	mov	v18.s[3],w12
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v17.16b
	aesmc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop3x_ctr32

	aese	v0.16b,v16.16b
	aesmc	v4.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v5.16b,v1.16b
	ld1	{v2.16b},[x0],#16
	orr	v0.16b,v6.16b,v6.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v3.16b},[x0],#16
	orr	v1.16b,v6.16b,v6.16b
	aese	v4.16b,v17.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v17.16b
	aesmc	v5.16b,v5.16b
	ld1	{v19.16b},[x0],#16
	mov	x7,x3
	aese	v18.16b,v17.16b
	aesmc	v17.16b,v18.16b
	orr	v18.16b,v6.16b,v6.16b
	add	w9,w8,#1
	aese	v4.16b,v20.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v20.16b
	aesmc	v5.16b,v5.16b
	eor	v2.16b,v2.16b,v7.16b
	add	w10,w8,#2
	aese	v17.16b,v20.16b
	aesmc	v17.16b,v17.16b
	eor	v3.16b,v3.16b,v7.16b
	add	w8,w8,#3
	aese	v4.16b,v21.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v21.16b
	aesmc	v5.16b,v5.16b
	eor	v19.16b,v19.16b,v7.16b
	rev	w9,w9
	aese	v17.16b,v21.16b
	aesmc	v17.16b,v17.16b
	mov	v0.s[3], w9
	rev	w10,w10
	aese	v4.16b,v22.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v22.16b
	aesmc	v5.16b,v5.16b
	mov	v1.s[3], w10
	rev	w12,w8
	aese	v17.16b,v22.16b
	aesmc	v17.16b,v17.16b
	mov	v18.s[3], w12
	subs	x2,x2,#3
	aese	v4.16b,v23.16b
	aese	v5.16b,v23.16b
	aese	v17.16b,v23.16b

	eor	v2.16b,v2.16b,v4.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	st1	{v2.16b},[x1],#16
	eor	v3.16b,v3.16b,v5.16b
	mov	w6,w5
	st1	{v3.16b},[x1],#16
	eor	v19.16b,v19.16b,v17.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v19.16b},[x1],#16
	b.hs	.Loop3x_ctr32

	adds	x2,x2,#3
	b.eq	.Lctr32_done
	cmp	x2,#1
	mov	x12,#16
	csel	x12,xzr,x12,eq

.Lctr32_tail:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lctr32_tail

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v2.16b},[x0],x12
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	ld1	{v3.16b},[x0]
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	eor	v2.16b,v2.16b,v7.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	eor	v3.16b,v3.16b,v7.16b
	aese	v0.16b,v23.16b
	aese	v1.16b,v23.16b

	cmp	x2,#1
	eor	v2.16b,v2.16b,v0.16b
	eor	v3.16b,v3.16b,v1.16b
	st1	{v2.16b},[x1],#16
	b.eq	.Lctr32_done
	st1	{v3.16b},[x1]

.Lctr32_done:
	ldr	x29,[sp],#16
	ret
.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
#endif
secure/lib/libcrypto/aarch64/ghashv8-armx.S (new file, 228 lines)
@@ -0,0 +1,228 @@
/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
#include "arm_arch.h"

.text
.global	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57	//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8	//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31	//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b	//H<<<=1
	eor	v20.16b,v3.16b,v16.16b	//twisted H
	st1	{v20.2d},[x0],#16	//store Htable[0]

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8	//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v21.2d-v22.2d},[x0]	//store Htable[1..2]

	ret
.size	gcm_init_v8,.-gcm_init_v8
.global	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d-v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d	//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d	//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d	//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
.global	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	ld1	{v0.2d},[x0]		//load [rotated] Xi
					//"[rotated]" means that
					//loaded value would have
					//to be rotated in order to
					//make it appear as in
					//alorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16			//x12 is used as post-
					//increment for input pointer;
					//as loop is modulo-scheduled
					//x12 is zeroed just in time
					//to preclude oversteping
					//inp[len], which means that
					//last block[s] are actually
					//loaded twice, but last
					//copy is not processed
	ld1	{v20.2d-v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq		//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8	//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57	//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8	//rotate I[0]
	b.lo	.Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b	//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d	//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d	//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo		//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d	//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b	//accumulate
	pmull2	v1.1q,v21.2d,v18.2d	//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq		//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d	//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b	//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8	//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b	//re-construct v0.16b
	b.eq	.Ldone_v8		//is x3 zero?
.Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b	//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b	//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d	//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d	//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d	//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
.asciz	"GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align	2
secure/lib/libcrypto/aarch64/sha1-armv8.S (new file, 1213 lines; diff suppressed because it is too large)
secure/lib/libcrypto/aarch64/sha256-armv8.S (new file, 1143 lines; diff suppressed because it is too large)
secure/lib/libcrypto/aarch64/sha512-armv8.S (new file, 1023 lines; diff suppressed because it is too large)