Build OpenSSL assembly sources for aarch64. Tested with ThunderX by andrew.

This commit is contained in:
Jung-uk Kim 2016-10-26 20:02:22 +00:00
parent 726f4773ec
commit 7518a9bd2b
11 changed files with 4416 additions and 12 deletions

View File

@ -42,7 +42,7 @@ $code=<<___;
#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
# $code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/);
#^^^^^^ this is done to simplify adoption by not depending
# on latest binutils.

View File

@ -1,7 +1,6 @@
#include "arm_arch.h"
.text
.arch armv8-a+crypto
.align 5
.global _armv7_neon_probe

View File

@ -49,7 +49,7 @@ $code=<<___;
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
# $code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
################################################################################

View File

@ -22,7 +22,10 @@ MAN+= config.5 des_modes.7
# base sources
SRCS= cpt_err.c cryptlib.c cversion.c ex_data.c mem.c mem_dbg.c o_dir.c \
o_fips.c o_init.c o_str.c o_time.c uid.c
.if defined(ASM_amd64)
.if defined(ASM_aarch64)
SRCS+= arm64cpuid.S armcap.c mem_clr.c
CFLAGS.arm64cpuid.S= -march=armv8-a+crypto
.elif defined(ASM_amd64)
SRCS+= x86_64cpuid.S
.elif defined(ASM_arm)
SRCS+= armcap.c armv4cpuid.S
@ -35,7 +38,10 @@ INCS+= crypto.h ebcdic.h opensslv.h ossl_typ.h symhacks.h ../e_os2.h
# aes
SRCS+= aes_cfb.c aes_ctr.c aes_ecb.c aes_ige.c aes_misc.c aes_ofb.c aes_wrap.c
.if defined(ASM_amd64)
.if defined(ASM_aarch64)
SRCS+= aes_cbc.c aes_core.c aesv8-armx.S
CFLAGS.aesv8-armx.S= -march=armv8-a+crypto
.elif defined(ASM_amd64)
SRCS+= aes-x86_64.S aesni-mb-x86_64.S aesni-sha1-x86_64.S \
aesni-sha256-x86_64.S aesni-x86_64.S bsaes-x86_64.S vpaes-x86_64.S
.elif defined(ASM_arm)
@ -238,7 +244,10 @@ INCS+= mdc2.h
# modes
SRCS+= cbc128.c ccm128.c cfb128.c ctr128.c cts128.c gcm128.c ofb128.c \
wrap128.c xts128.c
.if defined(ASM_amd64)
.if defined(ASM_aarch64)
SRCS+= ghashv8-armx.S
CFLAGS.ghashv8-armx.S= -march=armv8-a+crypto
.elif defined(ASM_amd64)
SRCS+= aesni-gcm-x86_64.S ghash-x86_64.S
.elif defined(ASM_arm)
SRCS+= ghash-armv4.S ghashv8-armx.S
@ -324,7 +333,9 @@ INCS+= seed.h
# sha
SRCS+= sha1_one.c sha1dgst.c sha256.c sha512.c sha_dgst.c sha_one.c
.if defined(ASM_amd64)
.if defined(ASM_aarch64)
SRCS+= sha1-armv8.S sha256-armv8.S sha512-armv8.S
.elif defined(ASM_amd64)
SRCS+= sha1-mb-x86_64.S sha1-x86_64.S sha256-mb-x86_64.S sha256-x86_64.S \
sha512-x86_64.S
.elif defined(ASM_arm)

View File

@ -6,7 +6,44 @@
.include "Makefile.inc"
.if defined(ASM_amd64)
.if defined(ASM_aarch64)
.PATH: ${LCRYPTO_SRC}/crypto \
${LCRYPTO_SRC}/crypto/aes/asm \
${LCRYPTO_SRC}/crypto/modes/asm \
${LCRYPTO_SRC}/crypto/sha/asm
PERLPATH= -I${LCRYPTO_SRC}/crypto/perlasm
# aes
SRCS= aesv8-armx.pl
# modes
SRCS+= ghashv8-armx.pl
# sha
SRCS+= sha1-armv8.pl sha512-armv8.pl
ASM= ${SRCS:R:S/$/.S/} sha256-armv8.S
all: ${ASM}
CLEANFILES= ${ASM} ${SRCS:R:S/$/.s/} sha256-armv8.s
.SUFFIXES: .pl
sha256-armv8.S: sha512-armv8.pl
env CC=cc perl ${.ALLSRC} 64 ${.TARGET:R:S/$/.s/}
( echo '/* $$'FreeBSD'$$ */' ;\
echo '/* Do not modify. This file is auto-generated from ${.ALLSRC:T:R:S/$/.pl/}. */' ;\
cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}
.pl.S:
env CC=cc perl ${.IMPSRC} 64 ${.TARGET:R:S/$/.s/}
( echo '/* $$'FreeBSD'$$ */' ;\
echo '/* Do not modify. This file is auto-generated from ${.IMPSRC:T:R:S/$/.pl/}. */' ;\
cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}
.elif defined(ASM_amd64)
.PATH: ${LCRYPTO_SRC}/crypto \
${LCRYPTO_SRC}/crypto/aes/asm \

View File

@ -21,7 +21,9 @@ CFLAGS+=-DL_ENDIAN
CFLAGS+=-DB_ENDIAN
.endif
.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
.if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "arm"
ASM_${MACHINE_CPUARCH}=
.elif ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
_ASM_AVX!= { \
echo vzeroall | \
${CC} -x assembler -o /dev/null -c - 2> /dev/null; \
@ -29,11 +31,11 @@ _ASM_AVX!= { \
.if ${_ASM_AVX} == yes
ASM_${MACHINE_CPUARCH}=
.endif
.elif ${MACHINE_CPUARCH} == "arm"
ASM_arm=
.endif
.if defined(ASM_amd64)
.if defined(ASM_aarch64)
CFLAGS+=-DSHA1_ASM -DSHA256_ASM -DSHA512_ASM
.elif defined(ASM_amd64)
CFLAGS+=-DOPENSSL_IA32_SSE2
CFLAGS+=-DAES_ASM -DBSAES_ASM -DVPAES_ASM
CFLAGS+=-DECP_NISTZ256_ASM

View File

@ -0,0 +1,748 @@
/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from aesv8-armx.pl. */
#include "arm_arch.h"
#if __ARM_MAX_ARCH__>=7
.text
.align 5
rcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
.globl aes_v8_set_encrypt_key
.type aes_v8_set_encrypt_key,%function
.align 5
aes_v8_set_encrypt_key:
.Lenc_key:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
mov x3,#-1
cmp x0,#0
b.eq .Lenc_key_abort
cmp x2,#0
b.eq .Lenc_key_abort
mov x3,#-2
cmp w1,#128
b.lt .Lenc_key_abort
cmp w1,#256
b.gt .Lenc_key_abort
tst w1,#0x3f
b.ne .Lenc_key_abort
adr x3,rcon
cmp w1,#192
eor v0.16b,v0.16b,v0.16b
ld1 {v3.16b},[x0],#16
mov w1,#8 // reuse w1
ld1 {v1.4s,v2.4s},[x3],#32
b.lt .Loop128
b.eq .L192
b .L256
.align 4
.Loop128:
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
b.ne .Loop128
ld1 {v1.4s},[x3]
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
st1 {v3.4s},[x2]
add x2,x2,#0x50
mov w12,#10
b .Ldone
.align 4
.L192:
ld1 {v4.8b},[x0],#8
movi v6.16b,#8 // borrow v6.16b
st1 {v3.4s},[x2],#16
sub v2.16b,v2.16b,v6.16b // adjust the mask
.Loop192:
tbl v6.16b,{v4.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v4.8b},[x2],#8
aese v6.16b,v0.16b
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
dup v5.4s,v3.s[3]
eor v5.16b,v5.16b,v4.16b
eor v6.16b,v6.16b,v1.16b
ext v4.16b,v0.16b,v4.16b,#12
shl v1.16b,v1.16b,#1
eor v4.16b,v4.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
eor v4.16b,v4.16b,v6.16b
st1 {v3.4s},[x2],#16
b.ne .Loop192
mov w12,#12
add x2,x2,#0x20
b .Ldone
.align 4
.L256:
ld1 {v4.16b},[x0]
mov w1,#7
mov w12,#14
st1 {v3.4s},[x2],#16
.Loop256:
tbl v6.16b,{v4.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v4.4s},[x2],#16
aese v6.16b,v0.16b
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
st1 {v3.4s},[x2],#16
b.eq .Ldone
dup v6.4s,v3.s[3] // just splat
ext v5.16b,v0.16b,v4.16b,#12
aese v6.16b,v0.16b
eor v4.16b,v4.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v4.16b,v4.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v4.16b,v4.16b,v5.16b
eor v4.16b,v4.16b,v6.16b
b .Loop256
.Ldone:
str w12,[x2]
mov x3,#0
.Lenc_key_abort:
mov x0,x3 // return value
ldr x29,[sp],#16
ret
.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
.globl aes_v8_set_decrypt_key
.type aes_v8_set_decrypt_key,%function
.align 5
aes_v8_set_decrypt_key:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
bl .Lenc_key
cmp x0,#0
b.ne .Ldec_key_abort
sub x2,x2,#240 // restore original x2
mov x4,#-16
add x0,x2,x12,lsl#4 // end of key schedule
ld1 {v0.4s},[x2]
ld1 {v1.4s},[x0]
st1 {v0.4s},[x0],x4
st1 {v1.4s},[x2],#16
.Loop_imc:
ld1 {v0.4s},[x2]
ld1 {v1.4s},[x0]
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
st1 {v0.4s},[x0],x4
st1 {v1.4s},[x2],#16
cmp x0,x2
b.hi .Loop_imc
ld1 {v0.4s},[x2]
aesimc v0.16b,v0.16b
st1 {v0.4s},[x0]
eor x0,x0,x0 // return value
.Ldec_key_abort:
ldp x29,x30,[sp],#16
ret
.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
.globl aes_v8_encrypt
.type aes_v8_encrypt,%function
.align 5
aes_v8_encrypt:
ldr w3,[x2,#240]
ld1 {v0.4s},[x2],#16
ld1 {v2.16b},[x0]
sub w3,w3,#2
ld1 {v1.4s},[x2],#16
.Loop_enc:
aese v2.16b,v0.16b
aesmc v2.16b,v2.16b
ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aese v2.16b,v1.16b
aesmc v2.16b,v2.16b
ld1 {v1.4s},[x2],#16
b.gt .Loop_enc
aese v2.16b,v0.16b
aesmc v2.16b,v2.16b
ld1 {v0.4s},[x2]
aese v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
st1 {v2.16b},[x1]
ret
.size aes_v8_encrypt,.-aes_v8_encrypt
.globl aes_v8_decrypt
.type aes_v8_decrypt,%function
.align 5
aes_v8_decrypt:
ldr w3,[x2,#240]
ld1 {v0.4s},[x2],#16
ld1 {v2.16b},[x0]
sub w3,w3,#2
ld1 {v1.4s},[x2],#16
.Loop_dec:
aesd v2.16b,v0.16b
aesimc v2.16b,v2.16b
ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aesd v2.16b,v1.16b
aesimc v2.16b,v2.16b
ld1 {v1.4s},[x2],#16
b.gt .Loop_dec
aesd v2.16b,v0.16b
aesimc v2.16b,v2.16b
ld1 {v0.4s},[x2]
aesd v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
st1 {v2.16b},[x1]
ret
.size aes_v8_decrypt,.-aes_v8_decrypt
.globl aes_v8_cbc_encrypt
.type aes_v8_cbc_encrypt,%function
.align 5
aes_v8_cbc_encrypt:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
subs x2,x2,#16
mov x8,#16
b.lo .Lcbc_abort
csel x8,xzr,x8,eq
cmp w5,#0 // en- or decrypting?
ldr w5,[x3,#240]
and x2,x2,#-16
ld1 {v6.16b},[x4]
ld1 {v0.16b},[x0],x8
ld1 {v16.4s-v17.4s},[x3] // load key schedule...
sub w5,w5,#6
add x7,x3,x5,lsl#4 // pointer to last 7 round keys
sub w5,w5,#2
ld1 {v18.4s-v19.4s},[x7],#32
ld1 {v20.4s-v21.4s},[x7],#32
ld1 {v22.4s-v23.4s},[x7],#32
ld1 {v7.4s},[x7]
add x7,x3,#32
mov w6,w5
b.eq .Lcbc_dec
cmp w5,#2
eor v0.16b,v0.16b,v6.16b
eor v5.16b,v16.16b,v7.16b
b.eq .Lcbc_enc128
ld1 {v2.4s-v3.4s},[x7]
add x7,x3,#16
add x6,x3,#16*4
add x12,x3,#16*5
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
add x14,x3,#16*6
add x3,x3,#16*7
b .Lenter_cbc_enc
.align 4
.Loop_cbc_enc:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
st1 {v6.16b},[x1],#16
.Lenter_cbc_enc:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v0.16b,v2.16b
aesmc v0.16b,v0.16b
ld1 {v16.4s},[x6]
cmp w5,#4
aese v0.16b,v3.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x12]
b.eq .Lcbc_enc192
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
ld1 {v16.4s},[x14]
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x3]
nop
.Lcbc_enc192:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
csel x8,xzr,x8,eq
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v0.16b,v23.16b
eor v6.16b,v0.16b,v7.16b
b.hs .Loop_cbc_enc
st1 {v6.16b},[x1],#16
b .Lcbc_done
.align 5
.Lcbc_enc128:
ld1 {v2.4s-v3.4s},[x7]
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
b .Lenter_cbc_enc128
.Loop_cbc_enc128:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
st1 {v6.16b},[x1],#16
.Lenter_cbc_enc128:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
aese v0.16b,v2.16b
aesmc v0.16b,v0.16b
csel x8,xzr,x8,eq
aese v0.16b,v3.16b
aesmc v0.16b,v0.16b
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v23.16b
eor v6.16b,v0.16b,v7.16b
b.hs .Loop_cbc_enc128
st1 {v6.16b},[x1],#16
b .Lcbc_done
.align 5
.Lcbc_dec:
ld1 {v18.16b},[x0],#16
subs x2,x2,#32 // bias
add w6,w5,#2
orr v3.16b,v0.16b,v0.16b
orr v1.16b,v0.16b,v0.16b
orr v19.16b,v18.16b,v18.16b
b.lo .Lcbc_dec_tail
orr v1.16b,v18.16b,v18.16b
ld1 {v18.16b},[x0],#16
orr v2.16b,v0.16b,v0.16b
orr v3.16b,v1.16b,v1.16b
orr v19.16b,v18.16b,v18.16b
.Loop3x_cbc_dec:
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v0.16b,v17.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt .Loop3x_cbc_dec
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
eor v4.16b,v6.16b,v7.16b
subs x2,x2,#0x30
eor v5.16b,v2.16b,v7.16b
csel x6,x2,x6,lo // x6, w6, is zero at this point
aesd v0.16b,v17.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
add x0,x0,x6 // x0 is adjusted in such way that
// at exit from the loop v1.16b-v18.16b
// are loaded with last "words"
orr v6.16b,v19.16b,v19.16b
mov x7,x3
aesd v0.16b,v20.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
ld1 {v2.16b},[x0],#16
aesd v0.16b,v21.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
aesd v0.16b,v22.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
ld1 {v19.16b},[x0],#16
aesd v0.16b,v23.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
add w6,w5,#2
eor v4.16b,v4.16b,v0.16b
eor v5.16b,v5.16b,v1.16b
eor v18.16b,v18.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v4.16b},[x1],#16
orr v0.16b,v2.16b,v2.16b
st1 {v5.16b},[x1],#16
orr v1.16b,v3.16b,v3.16b
st1 {v18.16b},[x1],#16
orr v18.16b,v19.16b,v19.16b
b.hs .Loop3x_cbc_dec
cmn x2,#0x30
b.eq .Lcbc_done
nop
.Lcbc_dec_tail:
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt .Lcbc_dec_tail
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
cmn x2,#0x20
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
eor v5.16b,v6.16b,v7.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
b.eq .Lcbc_dec_one
eor v5.16b,v5.16b,v1.16b
eor v17.16b,v17.16b,v18.16b
orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16
st1 {v17.16b},[x1],#16
b .Lcbc_done
.Lcbc_dec_one:
eor v5.16b,v5.16b,v18.16b
orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16
.Lcbc_done:
st1 {v6.16b},[x4]
.Lcbc_abort:
ldr x29,[sp],#16
ret
.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
.globl aes_v8_ctr32_encrypt_blocks
.type aes_v8_ctr32_encrypt_blocks,%function
.align 5
aes_v8_ctr32_encrypt_blocks:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ldr w5,[x3,#240]
ldr w8, [x4, #12]
ld1 {v0.4s},[x4]
ld1 {v16.4s-v17.4s},[x3] // load key schedule...
sub w5,w5,#4
mov x12,#16
cmp x2,#2
add x7,x3,x5,lsl#4 // pointer to last 5 round keys
sub w5,w5,#2
ld1 {v20.4s-v21.4s},[x7],#32
ld1 {v22.4s-v23.4s},[x7],#32
ld1 {v7.4s},[x7]
add x7,x3,#32
mov w6,w5
csel x12,xzr,x12,lo
#ifndef __ARMEB__
rev w8, w8
#endif
orr v1.16b,v0.16b,v0.16b
add w10, w8, #1
orr v18.16b,v0.16b,v0.16b
add w8, w8, #2
orr v6.16b,v0.16b,v0.16b
rev w10, w10
mov v1.s[3],w10
b.ls .Lctr32_tail
rev w12, w8
sub x2,x2,#3 // bias
mov v18.s[3],w12
b .Loop3x_ctr32
.align 4
.Loop3x_ctr32:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
aese v18.16b,v17.16b
aesmc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt .Loop3x_ctr32
aese v0.16b,v16.16b
aesmc v4.16b,v0.16b
aese v1.16b,v16.16b
aesmc v5.16b,v1.16b
ld1 {v2.16b},[x0],#16
orr v0.16b,v6.16b,v6.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
orr v1.16b,v6.16b,v6.16b
aese v4.16b,v17.16b
aesmc v4.16b,v4.16b
aese v5.16b,v17.16b
aesmc v5.16b,v5.16b
ld1 {v19.16b},[x0],#16
mov x7,x3
aese v18.16b,v17.16b
aesmc v17.16b,v18.16b
orr v18.16b,v6.16b,v6.16b
add w9,w8,#1
aese v4.16b,v20.16b
aesmc v4.16b,v4.16b
aese v5.16b,v20.16b
aesmc v5.16b,v5.16b
eor v2.16b,v2.16b,v7.16b
add w10,w8,#2
aese v17.16b,v20.16b
aesmc v17.16b,v17.16b
eor v3.16b,v3.16b,v7.16b
add w8,w8,#3
aese v4.16b,v21.16b
aesmc v4.16b,v4.16b
aese v5.16b,v21.16b
aesmc v5.16b,v5.16b
eor v19.16b,v19.16b,v7.16b
rev w9,w9
aese v17.16b,v21.16b
aesmc v17.16b,v17.16b
mov v0.s[3], w9
rev w10,w10
aese v4.16b,v22.16b
aesmc v4.16b,v4.16b
aese v5.16b,v22.16b
aesmc v5.16b,v5.16b
mov v1.s[3], w10
rev w12,w8
aese v17.16b,v22.16b
aesmc v17.16b,v17.16b
mov v18.s[3], w12
subs x2,x2,#3
aese v4.16b,v23.16b
aese v5.16b,v23.16b
aese v17.16b,v23.16b
eor v2.16b,v2.16b,v4.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
st1 {v2.16b},[x1],#16
eor v3.16b,v3.16b,v5.16b
mov w6,w5
st1 {v3.16b},[x1],#16
eor v19.16b,v19.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v19.16b},[x1],#16
b.hs .Loop3x_ctr32
adds x2,x2,#3
b.eq .Lctr32_done
cmp x2,#1
mov x12,#16
csel x12,xzr,x12,eq
.Lctr32_tail:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v17.4s},[x7],#16
b.gt .Lctr32_tail
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v2.16b},[x0],x12
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v1.16b,v20.16b
aesmc v1.16b,v1.16b
ld1 {v3.16b},[x0]
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v1.16b,v21.16b
aesmc v1.16b,v1.16b
eor v2.16b,v2.16b,v7.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v1.16b,v22.16b
aesmc v1.16b,v1.16b
eor v3.16b,v3.16b,v7.16b
aese v0.16b,v23.16b
aese v1.16b,v23.16b
cmp x2,#1
eor v2.16b,v2.16b,v0.16b
eor v3.16b,v3.16b,v1.16b
st1 {v2.16b},[x1],#16
b.eq .Lctr32_done
st1 {v3.16b},[x1]
.Lctr32_done:
ldr x29,[sp],#16
ret
.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
#endif

View File

@ -0,0 +1,228 @@
/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
#include "arm_arch.h"
.text
.global gcm_init_v8
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
ld1 {v17.2d},[x1] //load input H
movi v19.16b,#0xe1
shl v19.2d,v19.2d,#57 //0xc2.0
ext v3.16b,v17.16b,v17.16b,#8
ushr v18.2d,v19.2d,#63
dup v17.4s,v17.s[1]
ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
ushr v18.2d,v3.2d,#63
sshr v17.4s,v17.4s,#31 //broadcast carry bit
and v18.16b,v18.16b,v16.16b
shl v3.2d,v3.2d,#1
ext v18.16b,v18.16b,v18.16b,#8
and v16.16b,v16.16b,v17.16b
orr v3.16b,v3.16b,v18.16b //H<<<=1
eor v20.16b,v3.16b,v16.16b //twisted H
st1 {v20.2d},[x0],#16 //store Htable[0]
//calculate H^2
ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
pmull v0.1q,v20.1d,v20.1d
eor v16.16b,v16.16b,v20.16b
pmull2 v2.1q,v20.2d,v20.2d
pmull v1.1q,v16.1d,v16.1d
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v22.16b,v0.16b,v18.16b
ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
eor v17.16b,v17.16b,v22.16b
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v21.2d-v22.2d},[x0] //store Htable[1..2]
ret
.size gcm_init_v8,.-gcm_init_v8
.global gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
ld1 {v17.2d},[x0] //load Xi
movi v19.16b,#0xe1
ld1 {v20.2d-v21.2d},[x1] //load twisted H, ...
shl v19.2d,v19.2d,#57
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
ext v3.16b,v17.16b,v17.16b,#8
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
#ifndef __ARMEB__
rev64 v0.16b,v0.16b
#endif
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi
ret
.size gcm_gmult_v8,.-gcm_gmult_v8
.global gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
ld1 {v0.2d},[x0] //load [rotated] Xi
//"[rotated]" means that
//loaded value would have
//to be rotated in order to
//make it appear as in
//alorithm specification
subs x3,x3,#32 //see if x3 is 32 or larger
mov x12,#16 //x12 is used as post-
//increment for input pointer;
//as loop is modulo-scheduled
//x12 is zeroed just in time
//to preclude oversteping
//inp[len], which means that
//last block[s] are actually
//loaded twice, but last
//copy is not processed
ld1 {v20.2d-v21.2d},[x1],#32 //load twisted H, ..., H^2
movi v19.16b,#0xe1
ld1 {v22.2d},[x1]
csel x12,xzr,x12,eq //is it time to zero x12?
ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
#ifndef __ARMEB__
rev64 v16.16b,v16.16b
rev64 v0.16b,v0.16b
#endif
ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
b.lo .Lodd_tail_v8 //x3 was less than 32
ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
ext v7.16b,v17.16b,v17.16b,#8
eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
pmull2 v6.1q,v20.2d,v7.2d
b .Loop_mod2x_v8
.align 4
.Loop_mod2x_v8:
ext v18.16b,v3.16b,v3.16b,#8
subs x3,x3,#32 //is there more data?
pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
csel x12,xzr,x12,lo //is it time to zero x12?
pmull v5.1q,v21.1d,v17.1d
eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
eor v0.16b,v0.16b,v4.16b //accumulate
pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
eor v2.16b,v2.16b,v6.16b
csel x12,xzr,x12,eq //is it time to zero x12?
eor v1.16b,v1.16b,v5.16b
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
#ifndef __ARMEB__
rev64 v16.16b,v16.16b
#endif
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v7.16b,v17.16b,v17.16b,#8
ext v3.16b,v16.16b,v16.16b,#8
eor v0.16b,v1.16b,v18.16b
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v3.16b,v3.16b,v18.16b
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
eor v3.16b,v3.16b,v0.16b
pmull2 v6.1q,v20.2d,v7.2d
b.hs .Loop_mod2x_v8 //there was at least 32 more bytes
eor v2.16b,v2.16b,v18.16b
ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
adds x3,x3,#32 //re-construct x3
eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
b.eq .Ldone_v8 //is x3 zero?
.Lodd_tail_v8:
ext v18.16b,v0.16b,v0.16b,#8
eor v3.16b,v3.16b,v0.16b //inp^=Xi
eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
.Ldone_v8:
#ifndef __ARMEB__
rev64 v0.16b,v0.16b
#endif
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi
ret
.size gcm_ghash_v8,.-gcm_ghash_v8
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align 2

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff