Regen assemply files for aarch64.

This commit is contained in:
Jung-uk Kim 2018-09-22 02:23:03 +00:00
parent 0633b14ba1
commit bde62812ae
12 changed files with 13476 additions and 874 deletions

View File

@ -10,19 +10,35 @@
.PATH: ${LCRYPTO_SRC}/crypto \
${LCRYPTO_SRC}/crypto/aes/asm \
${LCRYPTO_SRC}/crypto/bn/asm \
${LCRYPTO_SRC}/crypto/chacha/asm \
${LCRYPTO_SRC}/crypto/ec/asm \
${LCRYPTO_SRC}/crypto/modes/asm \
${LCRYPTO_SRC}/crypto/poly1305/asm \
${LCRYPTO_SRC}/crypto/sha/asm
PERLPATH= -I${LCRYPTO_SRC}/crypto/perlasm
# aes
SRCS= aesv8-armx.pl
SRCS= aesv8-armx.pl vpaes-armv8.pl
# bn
SRCS+= armv8-mont.pl
# chacha
SRCS+= chacha-armv8.pl
# ec
SRCS+= ecp_nistz256-armv8.pl
# modes
SRCS+= ghashv8-armx.pl
# poly1305
SRCS+= poly1305-armv8.pl
# sha
SRCS+= sha1-armv8.pl sha512-armv8.pl
SRCS+= keccak1600-armv8.pl sha1-armv8.pl sha512-armv8.pl
ASM= ${SRCS:R:S/$/.S/} sha256-armv8.S
@ -32,13 +48,13 @@ CLEANFILES= ${ASM} ${SRCS:R:S/$/.s/} sha256-armv8.s
.SUFFIXES: .pl
sha256-armv8.S: sha512-armv8.pl
env CC=cc perl ${.ALLSRC} 64 ${.TARGET:R:S/$/.s/}
env CC=cc perl ${.ALLSRC} linux64 ${.TARGET:R:S/$/.s/}
( echo '/* $$'FreeBSD'$$ */' ;\
echo '/* Do not modify. This file is auto-generated from ${.ALLSRC:T:R:S/$/.pl/}. */' ;\
cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}
.pl.S:
env CC=cc perl ${.IMPSRC} 64 ${.TARGET:R:S/$/.s/}
env CC=cc perl ${.IMPSRC} linux64 ${.TARGET:R:S/$/.s/}
( echo '/* $$'FreeBSD'$$ */' ;\
echo '/* Do not modify. This file is auto-generated from ${.IMPSRC:T:R:S/$/.pl/}. */' ;\
cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}
@ -160,10 +176,10 @@ CLEANFILES= ${ASM} ${SRCS:R:S/$/.s/}
aes-armv4.S: aes-armv4.pl
( echo '/* $$'FreeBSD'$$ */' ;\
echo '/* Do not modify. This file is auto-generated from ${.ALLSRC:T}. */' ;\
env CC=cc perl ${.ALLSRC} elf ) > ${.TARGET}
env CC=cc perl ${.ALLSRC} linux32 ) > ${.TARGET}
.pl.S:
env CC=cc perl ${.IMPSRC} elf ${.TARGET:R:S/$/.s/}
env CC=cc perl ${.IMPSRC} linux32 ${.TARGET:R:S/$/.s/}
( echo '/* $$'FreeBSD'$$ */' ;\
echo '/* Do not modify. This file is auto-generated from ${.IMPSRC:T:R:S/$/.pl/}. */' ;\
cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}

View File

@ -5,7 +5,7 @@
#if __ARM_MAX_ARCH__>=7
.text
.align 5
rcon:
.Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
@ -30,7 +30,7 @@ aes_v8_set_encrypt_key:
tst w1,#0x3f
b.ne .Lenc_key_abort
adr x3,rcon
adr x3,.Lrcon
cmp w1,#192
eor v0.16b,v0.16b,v0.16b
@ -54,7 +54,7 @@ aes_v8_set_encrypt_key:
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
@ -71,7 +71,7 @@ aes_v8_set_encrypt_key:
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
@ -85,7 +85,7 @@ aes_v8_set_encrypt_key:
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
st1 {v3.4s},[x2]
@ -116,7 +116,7 @@ aes_v8_set_encrypt_key:
dup v5.4s,v3.s[3]
eor v5.16b,v5.16b,v4.16b
eor v6.16b,v6.16b,v1.16b
eor v6.16b,v6.16b,v1.16b
ext v4.16b,v0.16b,v4.16b,#12
shl v1.16b,v1.16b,#1
eor v4.16b,v4.16b,v5.16b
@ -147,7 +147,7 @@ aes_v8_set_encrypt_key:
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
@ -291,13 +291,13 @@ aes_v8_cbc_encrypt:
ld1 {v6.16b},[x4]
ld1 {v0.16b},[x0],x8
ld1 {v16.4s-v17.4s},[x3] // load key schedule...
ld1 {v16.4s,v17.4s},[x3] // load key schedule...
sub w5,w5,#6
add x7,x3,x5,lsl#4 // pointer to last 7 round keys
sub w5,w5,#2
ld1 {v18.4s-v19.4s},[x7],#32
ld1 {v20.4s-v21.4s},[x7],#32
ld1 {v22.4s-v23.4s},[x7],#32
ld1 {v18.4s,v19.4s},[x7],#32
ld1 {v20.4s,v21.4s},[x7],#32
ld1 {v22.4s,v23.4s},[x7],#32
ld1 {v7.4s},[x7]
add x7,x3,#32
@ -309,7 +309,7 @@ aes_v8_cbc_encrypt:
eor v5.16b,v16.16b,v7.16b
b.eq .Lcbc_enc128
ld1 {v2.4s-v3.4s},[x7]
ld1 {v2.4s,v3.4s},[x7]
add x7,x3,#16
add x6,x3,#16*4
add x12,x3,#16*5
@ -323,7 +323,7 @@ aes_v8_cbc_encrypt:
.Loop_cbc_enc:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
st1 {v6.16b},[x1],#16
st1 {v6.16b},[x1],#16
.Lenter_cbc_enc:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
@ -347,21 +347,21 @@ aes_v8_cbc_encrypt:
.Lcbc_enc192:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
subs x2,x2,#16
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
csel x8,xzr,x8,eq
csel x8,xzr,x8,eq
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
eor v16.16b,v16.16b,v5.16b
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v0.16b,v23.16b
@ -373,35 +373,35 @@ aes_v8_cbc_encrypt:
.align 5
.Lcbc_enc128:
ld1 {v2.4s-v3.4s},[x7]
ld1 {v2.4s,v3.4s},[x7]
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
b .Lenter_cbc_enc128
.Loop_cbc_enc128:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
st1 {v6.16b},[x1],#16
st1 {v6.16b},[x1],#16
.Lenter_cbc_enc128:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
subs x2,x2,#16
aese v0.16b,v2.16b
aesmc v0.16b,v0.16b
csel x8,xzr,x8,eq
csel x8,xzr,x8,eq
aese v0.16b,v3.16b
aesmc v0.16b,v0.16b
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
eor v16.16b,v16.16b,v5.16b
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v23.16b
eor v6.16b,v0.16b,v7.16b
b.hs .Loop_cbc_enc128
@ -448,58 +448,58 @@ aes_v8_cbc_encrypt:
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
eor v4.16b,v6.16b,v7.16b
subs x2,x2,#0x30
eor v5.16b,v2.16b,v7.16b
csel x6,x2,x6,lo // x6, w6, is zero at this point
eor v4.16b,v6.16b,v7.16b
subs x2,x2,#0x30
eor v5.16b,v2.16b,v7.16b
csel x6,x2,x6,lo // x6, w6, is zero at this point
aesd v0.16b,v17.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
add x0,x0,x6 // x0 is adjusted in such way that
eor v17.16b,v3.16b,v7.16b
add x0,x0,x6 // x0 is adjusted in such way that
// at exit from the loop v1.16b-v18.16b
// are loaded with last "words"
orr v6.16b,v19.16b,v19.16b
mov x7,x3
orr v6.16b,v19.16b,v19.16b
mov x7,x3
aesd v0.16b,v20.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
ld1 {v2.16b},[x0],#16
ld1 {v2.16b},[x0],#16
aesd v0.16b,v21.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
ld1 {v3.16b},[x0],#16
aesd v0.16b,v22.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
ld1 {v19.16b},[x0],#16
ld1 {v19.16b},[x0],#16
aesd v0.16b,v23.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
add w6,w5,#2
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
add w6,w5,#2
eor v4.16b,v4.16b,v0.16b
eor v5.16b,v5.16b,v1.16b
eor v18.16b,v18.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v4.16b},[x1],#16
orr v0.16b,v2.16b,v2.16b
orr v0.16b,v2.16b,v2.16b
st1 {v5.16b},[x1],#16
orr v1.16b,v3.16b,v3.16b
orr v1.16b,v3.16b,v3.16b
st1 {v18.16b},[x1],#16
orr v18.16b,v19.16b,v19.16b
orr v18.16b,v19.16b,v19.16b
b.hs .Loop3x_cbc_dec
cmn x2,#0x30
@ -532,30 +532,30 @@ aes_v8_cbc_encrypt:
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
cmn x2,#0x20
cmn x2,#0x20
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
eor v5.16b,v6.16b,v7.16b
eor v5.16b,v6.16b,v7.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
eor v17.16b,v3.16b,v7.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
b.eq .Lcbc_dec_one
eor v5.16b,v5.16b,v1.16b
eor v17.16b,v17.16b,v18.16b
orr v6.16b,v19.16b,v19.16b
orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16
st1 {v17.16b},[x1],#16
b .Lcbc_done
.Lcbc_dec_one:
eor v5.16b,v5.16b,v18.16b
orr v6.16b,v19.16b,v19.16b
orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16
.Lcbc_done:
@ -568,181 +568,181 @@ aes_v8_cbc_encrypt:
.type aes_v8_ctr32_encrypt_blocks,%function
.align 5
aes_v8_ctr32_encrypt_blocks:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ldr w5,[x3,#240]
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ldr w5,[x3,#240]
ldr w8, [x4, #12]
ld1 {v0.4s},[x4]
ldr w8, [x4, #12]
ld1 {v0.4s},[x4]
ld1 {v16.4s-v17.4s},[x3] // load key schedule...
sub w5,w5,#4
mov x12,#16
cmp x2,#2
add x7,x3,x5,lsl#4 // pointer to last 5 round keys
sub w5,w5,#2
ld1 {v20.4s-v21.4s},[x7],#32
ld1 {v22.4s-v23.4s},[x7],#32
ld1 {v7.4s},[x7]
add x7,x3,#32
mov w6,w5
ld1 {v16.4s,v17.4s},[x3] // load key schedule...
sub w5,w5,#4
mov x12,#16
cmp x2,#2
add x7,x3,x5,lsl#4 // pointer to last 5 round keys
sub w5,w5,#2
ld1 {v20.4s,v21.4s},[x7],#32
ld1 {v22.4s,v23.4s},[x7],#32
ld1 {v7.4s},[x7]
add x7,x3,#32
mov w6,w5
csel x12,xzr,x12,lo
#ifndef __ARMEB__
rev w8, w8
rev w8, w8
#endif
orr v1.16b,v0.16b,v0.16b
add w10, w8, #1
orr v18.16b,v0.16b,v0.16b
add w8, w8, #2
orr v6.16b,v0.16b,v0.16b
rev w10, w10
mov v1.s[3],w10
b.ls .Lctr32_tail
rev w12, w8
sub x2,x2,#3 // bias
mov v18.s[3],w12
b .Loop3x_ctr32
orr v1.16b,v0.16b,v0.16b
add w10, w8, #1
orr v18.16b,v0.16b,v0.16b
add w8, w8, #2
orr v6.16b,v0.16b,v0.16b
rev w10, w10
mov v1.s[3],w10
b.ls .Lctr32_tail
rev w12, w8
sub x2,x2,#3 // bias
mov v18.s[3],w12
b .Loop3x_ctr32
.align 4
.Loop3x_ctr32:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
aese v18.16b,v17.16b
aesmc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt .Loop3x_ctr32
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
aese v18.16b,v17.16b
aesmc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt .Loop3x_ctr32
aese v0.16b,v16.16b
aesmc v4.16b,v0.16b
aese v1.16b,v16.16b
aesmc v5.16b,v1.16b
ld1 {v2.16b},[x0],#16
orr v0.16b,v6.16b,v6.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
orr v1.16b,v6.16b,v6.16b
aese v4.16b,v17.16b
aesmc v4.16b,v4.16b
aese v5.16b,v17.16b
aesmc v5.16b,v5.16b
ld1 {v19.16b},[x0],#16
mov x7,x3
aese v18.16b,v17.16b
aesmc v17.16b,v18.16b
orr v18.16b,v6.16b,v6.16b
add w9,w8,#1
aese v4.16b,v20.16b
aesmc v4.16b,v4.16b
aese v5.16b,v20.16b
aesmc v5.16b,v5.16b
eor v2.16b,v2.16b,v7.16b
add w10,w8,#2
aese v17.16b,v20.16b
aesmc v17.16b,v17.16b
eor v3.16b,v3.16b,v7.16b
add w8,w8,#3
aese v4.16b,v21.16b
aesmc v4.16b,v4.16b
aese v5.16b,v21.16b
aesmc v5.16b,v5.16b
eor v19.16b,v19.16b,v7.16b
rev w9,w9
aese v17.16b,v21.16b
aesmc v17.16b,v17.16b
mov v0.s[3], w9
rev w10,w10
aese v4.16b,v22.16b
aesmc v4.16b,v4.16b
aese v5.16b,v22.16b
aesmc v5.16b,v5.16b
mov v1.s[3], w10
rev w12,w8
aese v17.16b,v22.16b
aesmc v17.16b,v17.16b
mov v18.s[3], w12
subs x2,x2,#3
aese v4.16b,v23.16b
aese v5.16b,v23.16b
aese v17.16b,v23.16b
aese v0.16b,v16.16b
aesmc v4.16b,v0.16b
aese v1.16b,v16.16b
aesmc v5.16b,v1.16b
ld1 {v2.16b},[x0],#16
orr v0.16b,v6.16b,v6.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
orr v1.16b,v6.16b,v6.16b
aese v4.16b,v17.16b
aesmc v4.16b,v4.16b
aese v5.16b,v17.16b
aesmc v5.16b,v5.16b
ld1 {v19.16b},[x0],#16
mov x7,x3
aese v18.16b,v17.16b
aesmc v17.16b,v18.16b
orr v18.16b,v6.16b,v6.16b
add w9,w8,#1
aese v4.16b,v20.16b
aesmc v4.16b,v4.16b
aese v5.16b,v20.16b
aesmc v5.16b,v5.16b
eor v2.16b,v2.16b,v7.16b
add w10,w8,#2
aese v17.16b,v20.16b
aesmc v17.16b,v17.16b
eor v3.16b,v3.16b,v7.16b
add w8,w8,#3
aese v4.16b,v21.16b
aesmc v4.16b,v4.16b
aese v5.16b,v21.16b
aesmc v5.16b,v5.16b
eor v19.16b,v19.16b,v7.16b
rev w9,w9
aese v17.16b,v21.16b
aesmc v17.16b,v17.16b
mov v0.s[3], w9
rev w10,w10
aese v4.16b,v22.16b
aesmc v4.16b,v4.16b
aese v5.16b,v22.16b
aesmc v5.16b,v5.16b
mov v1.s[3], w10
rev w12,w8
aese v17.16b,v22.16b
aesmc v17.16b,v17.16b
mov v18.s[3], w12
subs x2,x2,#3
aese v4.16b,v23.16b
aese v5.16b,v23.16b
aese v17.16b,v23.16b
eor v2.16b,v2.16b,v4.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
st1 {v2.16b},[x1],#16
eor v3.16b,v3.16b,v5.16b
mov w6,w5
st1 {v3.16b},[x1],#16
eor v19.16b,v19.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v19.16b},[x1],#16
b.hs .Loop3x_ctr32
eor v2.16b,v2.16b,v4.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
st1 {v2.16b},[x1],#16
eor v3.16b,v3.16b,v5.16b
mov w6,w5
st1 {v3.16b},[x1],#16
eor v19.16b,v19.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v19.16b},[x1],#16
b.hs .Loop3x_ctr32
adds x2,x2,#3
b.eq .Lctr32_done
cmp x2,#1
mov x12,#16
adds x2,x2,#3
b.eq .Lctr32_done
cmp x2,#1
mov x12,#16
csel x12,xzr,x12,eq
.Lctr32_tail:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v17.4s},[x7],#16
b.gt .Lctr32_tail
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v17.4s},[x7],#16
b.gt .Lctr32_tail
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v2.16b},[x0],x12
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v1.16b,v20.16b
aesmc v1.16b,v1.16b
ld1 {v3.16b},[x0]
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v1.16b,v21.16b
aesmc v1.16b,v1.16b
eor v2.16b,v2.16b,v7.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v1.16b,v22.16b
aesmc v1.16b,v1.16b
eor v3.16b,v3.16b,v7.16b
aese v0.16b,v23.16b
aese v1.16b,v23.16b
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v2.16b},[x0],x12
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v1.16b,v20.16b
aesmc v1.16b,v1.16b
ld1 {v3.16b},[x0]
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v1.16b,v21.16b
aesmc v1.16b,v1.16b
eor v2.16b,v2.16b,v7.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v1.16b,v22.16b
aesmc v1.16b,v1.16b
eor v3.16b,v3.16b,v7.16b
aese v0.16b,v23.16b
aese v1.16b,v23.16b
cmp x2,#1
eor v2.16b,v2.16b,v0.16b
eor v3.16b,v3.16b,v1.16b
st1 {v2.16b},[x1],#16
b.eq .Lctr32_done
st1 {v3.16b},[x1]
cmp x2,#1
eor v2.16b,v2.16b,v0.16b
eor v3.16b,v3.16b,v1.16b
st1 {v2.16b},[x1],#16
b.eq .Lctr32_done
st1 {v3.16b},[x1]
.Lctr32_done:
ldr x29,[sp],#16
ldr x29,[sp],#16
ret
.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -2,227 +2,552 @@
/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
#include "arm_arch.h"
#if __ARM_MAX_ARCH__>=7
.text
.global gcm_init_v8
.globl gcm_init_v8
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
ld1 {v17.2d},[x1] //load input H
movi v19.16b,#0xe1
ld1 {v17.2d},[x1] //load input H
movi v19.16b,#0xe1
shl v19.2d,v19.2d,#57 //0xc2.0
ext v3.16b,v17.16b,v17.16b,#8
ext v3.16b,v17.16b,v17.16b,#8
ushr v18.2d,v19.2d,#63
dup v17.4s,v17.s[1]
ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
dup v17.4s,v17.s[1]
ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
ushr v18.2d,v3.2d,#63
sshr v17.4s,v17.4s,#31 //broadcast carry bit
and v18.16b,v18.16b,v16.16b
and v18.16b,v18.16b,v16.16b
shl v3.2d,v3.2d,#1
ext v18.16b,v18.16b,v18.16b,#8
and v16.16b,v16.16b,v17.16b
orr v3.16b,v3.16b,v18.16b //H<<<=1
eor v20.16b,v3.16b,v16.16b //twisted H
st1 {v20.2d},[x0],#16 //store Htable[0]
ext v18.16b,v18.16b,v18.16b,#8
and v16.16b,v16.16b,v17.16b
orr v3.16b,v3.16b,v18.16b //H<<<=1
eor v20.16b,v3.16b,v16.16b //twisted H
st1 {v20.2d},[x0],#16 //store Htable[0]
//calculate H^2
ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
pmull v0.1q,v20.1d,v20.1d
eor v16.16b,v16.16b,v20.16b
eor v16.16b,v16.16b,v20.16b
pmull2 v2.1q,v20.2d,v20.2d
pmull v1.1q,v16.1d,v16.1d
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v22.16b,v0.16b,v18.16b
eor v18.16b,v18.16b,v2.16b
eor v22.16b,v0.16b,v18.16b
ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
eor v17.16b,v17.16b,v22.16b
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v21.2d-v22.2d},[x0] //store Htable[1..2]
ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
eor v17.16b,v17.16b,v22.16b
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2]
//calculate H^3 and H^4
pmull v0.1q,v20.1d, v22.1d
pmull v5.1q,v22.1d,v22.1d
pmull2 v2.1q,v20.2d, v22.2d
pmull2 v7.1q,v22.2d,v22.2d
pmull v1.1q,v16.1d,v17.1d
pmull v6.1q,v17.1d,v17.1d
ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
ext v17.16b,v5.16b,v7.16b,#8
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v16.16b
eor v4.16b,v5.16b,v7.16b
eor v6.16b,v6.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
eor v6.16b,v6.16b,v4.16b
pmull v4.1q,v5.1d,v19.1d
ins v2.d[0],v1.d[1]
ins v7.d[0],v6.d[1]
ins v1.d[1],v0.d[0]
ins v6.d[1],v5.d[0]
eor v0.16b,v1.16b,v18.16b
eor v5.16b,v6.16b,v4.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
ext v4.16b,v5.16b,v5.16b,#8
pmull v0.1q,v0.1d,v19.1d
pmull v5.1q,v5.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v4.16b,v4.16b,v7.16b
eor v20.16b, v0.16b,v18.16b //H^3
eor v22.16b,v5.16b,v4.16b //H^4
ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing
ext v17.16b,v22.16b,v22.16b,#8
eor v16.16b,v16.16b,v20.16b
eor v17.16b,v17.16b,v22.16b
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5]
ret
.size gcm_init_v8,.-gcm_init_v8
.global gcm_gmult_v8
.globl gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
ld1 {v17.2d},[x0] //load Xi
movi v19.16b,#0xe1
ld1 {v20.2d-v21.2d},[x1] //load twisted H, ...
ld1 {v17.2d},[x0] //load Xi
movi v19.16b,#0xe1
ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
shl v19.2d,v19.2d,#57
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
ext v3.16b,v17.16b,v17.16b,#8
ext v3.16b,v17.16b,v17.16b,#8
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
#ifndef __ARMEB__
rev64 v0.16b,v0.16b
#endif
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi
ret
.size gcm_gmult_v8,.-gcm_gmult_v8
.global gcm_ghash_v8
.globl gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
ld1 {v0.2d},[x0] //load [rotated] Xi
cmp x3,#64
b.hs .Lgcm_ghash_v8_4x
ld1 {v0.2d},[x0] //load [rotated] Xi
//"[rotated]" means that
//loaded value would have
//to be rotated in order to
//make it appear as in
//alorithm specification
subs x3,x3,#32 //see if x3 is 32 or larger
mov x12,#16 //x12 is used as post-
//algorithm specification
subs x3,x3,#32 //see if x3 is 32 or larger
mov x12,#16 //x12 is used as post-
//increment for input pointer;
//as loop is modulo-scheduled
//x12 is zeroed just in time
//to preclude oversteping
//to preclude overstepping
//inp[len], which means that
//last block[s] are actually
//loaded twice, but last
//copy is not processed
ld1 {v20.2d-v21.2d},[x1],#32 //load twisted H, ..., H^2
movi v19.16b,#0xe1
ld1 {v22.2d},[x1]
ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
movi v19.16b,#0xe1
ld1 {v22.2d},[x1]
csel x12,xzr,x12,eq //is it time to zero x12?
ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
#ifndef __ARMEB__
rev64 v16.16b,v16.16b
rev64 v0.16b,v0.16b
#endif
ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
b.lo .Lodd_tail_v8 //x3 was less than 32
ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
b.lo .Lodd_tail_v8 //x3 was less than 32
ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
ext v7.16b,v17.16b,v17.16b,#8
eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
ext v7.16b,v17.16b,v17.16b,#8
eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
pmull2 v6.1q,v20.2d,v7.2d
b .Loop_mod2x_v8
b .Loop_mod2x_v8
.align 4
.Loop_mod2x_v8:
ext v18.16b,v3.16b,v3.16b,#8
subs x3,x3,#32 //is there more data?
ext v18.16b,v3.16b,v3.16b,#8
subs x3,x3,#32 //is there more data?
pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
csel x12,xzr,x12,lo //is it time to zero x12?
pmull v5.1q,v21.1d,v17.1d
eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
pmull v5.1q,v21.1d,v17.1d
eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
eor v0.16b,v0.16b,v4.16b //accumulate
eor v0.16b,v0.16b,v4.16b //accumulate
pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
eor v2.16b,v2.16b,v6.16b
csel x12,xzr,x12,eq //is it time to zero x12?
eor v1.16b,v1.16b,v5.16b
eor v2.16b,v2.16b,v6.16b
csel x12,xzr,x12,eq //is it time to zero x12?
eor v1.16b,v1.16b,v5.16b
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
#ifndef __ARMEB__
rev64 v16.16b,v16.16b
rev64 v16.16b,v16.16b
#endif
eor v1.16b,v1.16b,v18.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
rev64 v17.16b,v17.16b
#endif
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v7.16b,v17.16b,v17.16b,#8
ext v3.16b,v16.16b,v16.16b,#8
eor v0.16b,v1.16b,v18.16b
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
ext v7.16b,v17.16b,v17.16b,#8
ext v3.16b,v16.16b,v16.16b,#8
eor v0.16b,v1.16b,v18.16b
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v3.16b,v3.16b,v18.16b
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
eor v3.16b,v3.16b,v0.16b
pmull2 v6.1q,v20.2d,v7.2d
b.hs .Loop_mod2x_v8 //there was at least 32 more bytes
eor v3.16b,v3.16b,v18.16b
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
eor v3.16b,v3.16b,v0.16b
pmull2 v6.1q,v20.2d,v7.2d
b.hs .Loop_mod2x_v8 //there was at least 32 more bytes
eor v2.16b,v2.16b,v18.16b
ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
adds x3,x3,#32 //re-construct x3
eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
b.eq .Ldone_v8 //is x3 zero?
eor v2.16b,v2.16b,v18.16b
ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
adds x3,x3,#32 //re-construct x3
eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
b.eq .Ldone_v8 //is x3 zero?
.Lodd_tail_v8:
ext v18.16b,v0.16b,v0.16b,#8
eor v3.16b,v3.16b,v0.16b //inp^=Xi
eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
ext v18.16b,v0.16b,v0.16b,#8
eor v3.16b,v3.16b,v0.16b //inp^=Xi
eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
.Ldone_v8:
#ifndef __ARMEB__
rev64 v0.16b,v0.16b
#endif
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi
ret
.size gcm_ghash_v8,.-gcm_ghash_v8
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align 2
.type gcm_ghash_v8_4x,%function
.align 4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
ld1 {v0.2d},[x0] //load [rotated] Xi
ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
movi v19.16b,#0xe1
ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __ARMEB__
rev64 v0.16b,v0.16b
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v7.16b,v7.16b
rev64 v4.16b,v4.16b
#endif
ext v25.16b,v7.16b,v7.16b,#8
ext v24.16b,v6.16b,v6.16b,#8
ext v23.16b,v5.16b,v5.16b,#8
pmull v29.1q,v20.1d,v25.1d //H·Ii+3
eor v7.16b,v7.16b,v25.16b
pmull2 v31.1q,v20.2d,v25.2d
pmull v30.1q,v21.1d,v7.1d
pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
eor v6.16b,v6.16b,v24.16b
pmull2 v24.1q,v22.2d,v24.2d
pmull2 v6.1q,v21.2d,v6.2d
eor v29.16b,v29.16b,v16.16b
eor v31.16b,v31.16b,v24.16b
eor v30.16b,v30.16b,v6.16b
pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
eor v5.16b,v5.16b,v23.16b
pmull2 v23.1q,v26.2d,v23.2d
pmull v5.1q,v27.1d,v5.1d
eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
eor v30.16b,v30.16b,v5.16b
subs x3,x3,#128
b.lo .Ltail4x
b .Loop4x
.align 4
.Loop4x:
eor v16.16b,v4.16b,v0.16b
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
ext v3.16b,v16.16b,v16.16b,#8
#ifndef __ARMEB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v7.16b,v7.16b
rev64 v4.16b,v4.16b
#endif
pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v28.2d,v3.2d
ext v25.16b,v7.16b,v7.16b,#8
pmull2 v1.1q,v27.2d,v16.2d
eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
ext v24.16b,v6.16b,v6.16b,#8
eor v1.16b,v1.16b,v30.16b
ext v23.16b,v5.16b,v5.16b,#8
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
pmull v29.1q,v20.1d,v25.1d //H·Ii+3
eor v7.16b,v7.16b,v25.16b
eor v1.16b,v1.16b,v17.16b
pmull2 v31.1q,v20.2d,v25.2d
eor v1.16b,v1.16b,v18.16b
pmull v30.1q,v21.1d,v7.1d
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
eor v6.16b,v6.16b,v24.16b
pmull2 v24.1q,v22.2d,v24.2d
eor v0.16b,v1.16b,v18.16b
pmull2 v6.1q,v21.2d,v6.2d
eor v29.16b,v29.16b,v16.16b
eor v31.16b,v31.16b,v24.16b
eor v30.16b,v30.16b,v6.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
eor v5.16b,v5.16b,v23.16b
eor v18.16b,v18.16b,v2.16b
pmull2 v23.1q,v26.2d,v23.2d
pmull v5.1q,v27.1d,v5.1d
eor v0.16b,v0.16b,v18.16b
eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
ext v0.16b,v0.16b,v0.16b,#8
eor v30.16b,v30.16b,v5.16b
subs x3,x3,#64
b.hs .Loop4x
.Ltail4x:
eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8
pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v28.2d,v3.2d
pmull2 v1.1q,v27.2d,v16.2d
eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
adds x3,x3,#64
b.eq .Ldone4x
cmp x3,#32
b.lo .Lone
b.eq .Ltwo
.Lthree:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d,v6.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v4.16b,v4.16b
#endif
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v24.16b,v6.16b,v6.16b,#8
ext v23.16b,v5.16b,v5.16b,#8
eor v0.16b,v1.16b,v18.16b
pmull v29.1q,v20.1d,v24.1d //H·Ii+2
eor v6.16b,v6.16b,v24.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
pmull2 v31.1q,v20.2d,v24.2d
pmull v30.1q,v21.1d,v6.1d
eor v0.16b,v0.16b,v18.16b
pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
eor v5.16b,v5.16b,v23.16b
ext v0.16b,v0.16b,v0.16b,#8
pmull2 v23.1q,v22.2d,v23.2d
eor v16.16b,v4.16b,v0.16b
pmull2 v5.1q,v21.2d,v5.2d
ext v3.16b,v16.16b,v16.16b,#8
eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
eor v30.16b,v30.16b,v5.16b
pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v26.2d,v3.2d
pmull v1.1q,v27.1d,v16.1d
eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
b .Ldone4x
.align 4
.Ltwo:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
rev64 v5.16b,v5.16b
rev64 v4.16b,v4.16b
#endif
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v23.16b,v5.16b,v5.16b,#8
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8
pmull v29.1q,v20.1d,v23.1d //H·Ii+1
eor v5.16b,v5.16b,v23.16b
eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8
pmull2 v31.1q,v20.2d,v23.2d
pmull v30.1q,v21.1d,v5.1d
pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v22.2d,v3.2d
pmull2 v1.1q,v21.2d,v16.2d
eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
b .Ldone4x
.align 4
.Lone:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
rev64 v4.16b,v4.16b
#endif
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8
eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8
pmull v0.1q,v20.1d,v3.1d
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v20.2d,v3.2d
pmull v1.1q,v21.1d,v16.1d
.Ldone4x:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8
#ifndef __ARMEB__
rev64 v0.16b,v0.16b
#endif
st1 {v0.2d},[x0] //write out Xi
ret
.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,866 @@
/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
#include "arm_arch.h"
.text
// forward "declarations" are required for Apple
.globl poly1305_blocks
.globl poly1305_emit
.globl poly1305_init
.type poly1305_init,%function
.align 5
poly1305_init:
cmp x1,xzr
stp xzr,xzr,[x0] // zero hash value
stp xzr,xzr,[x0,#16] // [along with is_base2_26]
csel x0,xzr,x0,eq
b.eq .Lno_key
#ifdef __ILP32__
ldrsw x11,.LOPENSSL_armcap_P
#else
ldr x11,.LOPENSSL_armcap_P
#endif
adr x10,.LOPENSSL_armcap_P
ldp x7,x8,[x1] // load key
mov x9,#0xfffffffc0fffffff
movk x9,#0x0fff,lsl#48
ldr w17,[x10,x11]
#ifdef __ARMEB__
rev x7,x7 // flip bytes
rev x8,x8
#endif
and x7,x7,x9 // &=0ffffffc0fffffff
and x9,x9,#-4
and x8,x8,x9 // &=0ffffffc0ffffffc
stp x7,x8,[x0,#32] // save key value
tst w17,#ARMV7_NEON
adr x12,poly1305_blocks
adr x7,poly1305_blocks_neon
adr x13,poly1305_emit
adr x8,poly1305_emit_neon
csel x12,x12,x7,eq
csel x13,x13,x8,eq
#ifdef __ILP32__
stp w12,w13,[x2]
#else
stp x12,x13,[x2]
#endif
mov x0,#1
.Lno_key:
ret
.size poly1305_init,.-poly1305_init
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
ands x2,x2,#-16
b.eq .Lno_data
ldp x4,x5,[x0] // load hash value
ldp x7,x8,[x0,#32] // load key value
ldr x6,[x0,#16]
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
b .Loop
.align 5
.Loop:
ldp x10,x11,[x1],#16 // load input
sub x2,x2,#16
#ifdef __ARMEB__
rev x10,x10
rev x11,x11
#endif
adds x4,x4,x10 // accumulate input
adcs x5,x5,x11
mul x12,x4,x7 // h0*r0
adc x6,x6,x3
umulh x13,x4,x7
mul x10,x5,x9 // h1*5*r1
umulh x11,x5,x9
adds x12,x12,x10
mul x10,x4,x8 // h0*r1
adc x13,x13,x11
umulh x14,x4,x8
adds x13,x13,x10
mul x10,x5,x7 // h1*r0
adc x14,x14,xzr
umulh x11,x5,x7
adds x13,x13,x10
mul x10,x6,x9 // h2*5*r1
adc x14,x14,x11
mul x11,x6,x7 // h2*r0
adds x13,x13,x10
adc x14,x14,x11
and x10,x14,#-4 // final reduction
and x6,x14,#3
add x10,x10,x14,lsr#2
adds x4,x12,x10
adcs x5,x13,xzr
adc x6,x6,xzr
cbnz x2,.Loop
stp x4,x5,[x0] // store hash value
str x6,[x0,#16]
.Lno_data:
ret
.size poly1305_blocks,.-poly1305_blocks
.type poly1305_emit,%function
.align 5
poly1305_emit:
ldp x4,x5,[x0] // load hash base 2^64
ldr x6,[x0,#16]
ldp x10,x11,[x2] // load nonce
adds x12,x4,#5 // compare to modulus
adcs x13,x5,xzr
adc x14,x6,xzr
tst x14,#-4 // see if it's carried/borrowed
csel x4,x4,x12,eq
csel x5,x5,x13,eq
#ifdef __ARMEB__
ror x10,x10,#32 // flip nonce words
ror x11,x11,#32
#endif
adds x4,x4,x10 // accumulate nonce
adc x5,x5,x11
#ifdef __ARMEB__
rev x4,x4 // flip output bytes
rev x5,x5
#endif
stp x4,x5,[x1] // write result
ret
.size poly1305_emit,.-poly1305_emit
.type poly1305_mult,%function
.align 5
poly1305_mult:
mul x12,x4,x7 // h0*r0
umulh x13,x4,x7
mul x10,x5,x9 // h1*5*r1
umulh x11,x5,x9
adds x12,x12,x10
mul x10,x4,x8 // h0*r1
adc x13,x13,x11
umulh x14,x4,x8
adds x13,x13,x10
mul x10,x5,x7 // h1*r0
adc x14,x14,xzr
umulh x11,x5,x7
adds x13,x13,x10
mul x10,x6,x9 // h2*5*r1
adc x14,x14,x11
mul x11,x6,x7 // h2*r0
adds x13,x13,x10
adc x14,x14,x11
and x10,x14,#-4 // final reduction
and x6,x14,#3
add x10,x10,x14,lsr#2
adds x4,x12,x10
adcs x5,x13,xzr
adc x6,x6,xzr
ret
.size poly1305_mult,.-poly1305_mult
.type poly1305_splat,%function
.align 5
poly1305_splat:
and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x13,x4,#26,#26
extr x14,x5,x4,#52
and x14,x14,#0x03ffffff
ubfx x15,x5,#14,#26
extr x16,x6,x5,#40
str w12,[x0,#16*0] // r0
add w12,w13,w13,lsl#2 // r1*5
str w13,[x0,#16*1] // r1
add w13,w14,w14,lsl#2 // r2*5
str w12,[x0,#16*2] // s1
str w14,[x0,#16*3] // r2
add w14,w15,w15,lsl#2 // r3*5
str w13,[x0,#16*4] // s2
str w15,[x0,#16*5] // r3
add w15,w16,w16,lsl#2 // r4*5
str w14,[x0,#16*6] // s3
str w16,[x0,#16*7] // r4
str w15,[x0,#16*8] // s4
ret
.size poly1305_splat,.-poly1305_splat
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
ldr x17,[x0,#24]
cmp x2,#128
b.hs .Lblocks_neon
cbz x17,poly1305_blocks
.Lblocks_neon:
stp x29,x30,[sp,#-80]!
add x29,sp,#0
ands x2,x2,#-16
b.eq .Lno_data_neon
cbz x17,.Lbase2_64_neon
ldp w10,w11,[x0] // load hash value base 2^26
ldp w12,w13,[x0,#8]
ldr w14,[x0,#16]
tst x2,#31
b.eq .Leven_neon
ldp x7,x8,[x0,#32] // load key value
add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
lsr x5,x12,#12
adds x4,x4,x12,lsl#52
add x5,x5,x13,lsl#14
adc x5,x5,xzr
lsr x6,x14,#24
adds x5,x5,x14,lsl#40
adc x14,x6,xzr // can be partially reduced...
ldp x12,x13,[x1],#16 // load input
sub x2,x2,#16
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
and x10,x14,#-4 // ... so reduce
and x6,x14,#3
add x10,x10,x14,lsr#2
adds x4,x4,x10
adcs x5,x5,xzr
adc x6,x6,xzr
#ifdef __ARMEB__
rev x12,x12
rev x13,x13
#endif
adds x4,x4,x12 // accumulate input
adcs x5,x5,x13
adc x6,x6,x3
bl poly1305_mult
ldr x30,[sp,#8]
cbz x3,.Lstore_base2_64_neon
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,x4,#26,#26
extr x12,x5,x4,#52
and x12,x12,#0x03ffffff
ubfx x13,x5,#14,#26
extr x14,x6,x5,#40
cbnz x2,.Leven_neon
stp w10,w11,[x0] // store hash value base 2^26
stp w12,w13,[x0,#8]
str w14,[x0,#16]
b .Lno_data_neon
.align 4
.Lstore_base2_64_neon:
stp x4,x5,[x0] // store hash value base 2^64
stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
b .Lno_data_neon
.align 4
.Lbase2_64_neon:
ldp x7,x8,[x0,#32] // load key value
ldp x4,x5,[x0] // load hash value base 2^64
ldr x6,[x0,#16]
tst x2,#31
b.eq .Linit_neon
ldp x12,x13,[x1],#16 // load input
sub x2,x2,#16
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
#ifdef __ARMEB__
rev x12,x12
rev x13,x13
#endif
adds x4,x4,x12 // accumulate input
adcs x5,x5,x13
adc x6,x6,x3
bl poly1305_mult
.Linit_neon:
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,x4,#26,#26
extr x12,x5,x4,#52
and x12,x12,#0x03ffffff
ubfx x13,x5,#14,#26
extr x14,x6,x5,#40
stp d8,d9,[sp,#16] // meet ABI requirements
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]
fmov d24,x10
fmov d25,x11
fmov d26,x12
fmov d27,x13
fmov d28,x14
////////////////////////////////// initialize r^n table
mov x4,x7 // r^1
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
mov x5,x8
mov x6,xzr
add x0,x0,#48+12
bl poly1305_splat
bl poly1305_mult // r^2
sub x0,x0,#4
bl poly1305_splat
bl poly1305_mult // r^3
sub x0,x0,#4
bl poly1305_splat
bl poly1305_mult // r^4
sub x0,x0,#4
bl poly1305_splat
ldr x30,[sp,#8]
add x16,x1,#32
adr x17,.Lzeros
subs x2,x2,#64
csel x16,x17,x16,lo
mov x4,#1
str x4,[x0,#-24] // set is_base2_26
sub x0,x0,#48 // restore original x0
b .Ldo_neon
.align 4
.Leven_neon:
add x16,x1,#32
adr x17,.Lzeros
subs x2,x2,#64
csel x16,x17,x16,lo
stp d8,d9,[sp,#16] // meet ABI requirements
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]
fmov d24,x10
fmov d25,x11
fmov d26,x12
fmov d27,x13
fmov d28,x14
.Ldo_neon:
ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
ldp x9,x13,[x16],#48
lsl x3,x3,#24
add x15,x0,#48
#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov d14,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,x3,x12,lsr#40
add x13,x3,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov d15,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
fmov d16,x8
fmov d17,x10
fmov d18,x12
ldp x8,x12,[x1],#16 // inp[0:1]
ldp x9,x13,[x1],#48
ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
ld1 {v8.4s},[x15]
#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov d9,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,x3,x12,lsr#40
add x13,x3,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov d10,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
movi v31.2d,#-1
fmov d11,x8
fmov d12,x10
fmov d13,x12
ushr v31.2d,v31.2d,#38
b.ls .Lskip_loop
.align 4
.Loop_neon:
////////////////////////////////////////////////////////////////
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
// ___________________/
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
// ___________________/ ____________________/
//
// Note that we start with inp[2:3]*r^2. This is because it
// doesn't depend on reduction in previous iteration.
////////////////////////////////////////////////////////////////
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
subs x2,x2,#64
umull v23.2d,v14.2s,v7.s[2]
csel x16,x17,x16,lo
umull v22.2d,v14.2s,v5.s[2]
umull v21.2d,v14.2s,v3.s[2]
ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
umull v20.2d,v14.2s,v1.s[2]
ldp x9,x13,[x16],#48
umull v19.2d,v14.2s,v0.s[2]
#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
umlal v23.2d,v15.2s,v5.s[2]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal v22.2d,v15.2s,v3.s[2]
and x5,x9,#0x03ffffff
umlal v21.2d,v15.2s,v1.s[2]
ubfx x6,x8,#26,#26
umlal v20.2d,v15.2s,v0.s[2]
ubfx x7,x9,#26,#26
umlal v19.2d,v15.2s,v8.s[2]
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
umlal v23.2d,v16.2s,v3.s[2]
extr x8,x12,x8,#52
umlal v22.2d,v16.2s,v1.s[2]
extr x9,x13,x9,#52
umlal v21.2d,v16.2s,v0.s[2]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal v20.2d,v16.2s,v8.s[2]
fmov d14,x4
umlal v19.2d,v16.2s,v6.s[2]
and x8,x8,#0x03ffffff
umlal v23.2d,v17.2s,v1.s[2]
and x9,x9,#0x03ffffff
umlal v22.2d,v17.2s,v0.s[2]
ubfx x10,x12,#14,#26
umlal v21.2d,v17.2s,v8.s[2]
ubfx x11,x13,#14,#26
umlal v20.2d,v17.2s,v6.s[2]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal v19.2d,v17.2s,v4.s[2]
fmov d15,x6
add v11.2s,v11.2s,v26.2s
add x12,x3,x12,lsr#40
umlal v23.2d,v18.2s,v0.s[2]
add x13,x3,x13,lsr#40
umlal v22.2d,v18.2s,v8.s[2]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal v21.2d,v18.2s,v6.s[2]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal v20.2d,v18.2s,v4.s[2]
fmov d16,x8
umlal v19.2d,v18.2s,v2.s[2]
fmov d17,x10
////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4 and accumulate
add v9.2s,v9.2s,v24.2s
fmov d18,x12
umlal v22.2d,v11.2s,v1.s[0]
ldp x8,x12,[x1],#16 // inp[0:1]
umlal v19.2d,v11.2s,v6.s[0]
ldp x9,x13,[x1],#48
umlal v23.2d,v11.2s,v3.s[0]
umlal v20.2d,v11.2s,v8.s[0]
umlal v21.2d,v11.2s,v0.s[0]
#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
add v10.2s,v10.2s,v25.2s
umlal v22.2d,v9.2s,v5.s[0]
umlal v23.2d,v9.2s,v7.s[0]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal v21.2d,v9.2s,v3.s[0]
and x5,x9,#0x03ffffff
umlal v19.2d,v9.2s,v0.s[0]
ubfx x6,x8,#26,#26
umlal v20.2d,v9.2s,v1.s[0]
ubfx x7,x9,#26,#26
add v12.2s,v12.2s,v27.2s
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
umlal v22.2d,v10.2s,v3.s[0]
extr x8,x12,x8,#52
umlal v23.2d,v10.2s,v5.s[0]
extr x9,x13,x9,#52
umlal v19.2d,v10.2s,v8.s[0]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal v21.2d,v10.2s,v1.s[0]
fmov d9,x4
umlal v20.2d,v10.2s,v0.s[0]
and x8,x8,#0x03ffffff
add v13.2s,v13.2s,v28.2s
and x9,x9,#0x03ffffff
umlal v22.2d,v12.2s,v0.s[0]
ubfx x10,x12,#14,#26
umlal v19.2d,v12.2s,v4.s[0]
ubfx x11,x13,#14,#26
umlal v23.2d,v12.2s,v1.s[0]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal v20.2d,v12.2s,v6.s[0]
fmov d10,x6
umlal v21.2d,v12.2s,v8.s[0]
add x12,x3,x12,lsr#40
umlal v22.2d,v13.2s,v8.s[0]
add x13,x3,x13,lsr#40
umlal v19.2d,v13.2s,v2.s[0]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal v23.2d,v13.2s,v0.s[0]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal v20.2d,v13.2s,v4.s[0]
fmov d11,x8
umlal v21.2d,v13.2s,v6.s[0]
fmov d12,x10
fmov d13,x12
/////////////////////////////////////////////////////////////////
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
// and P. Schwabe
//
// [see discussion in poly1305-armv4 module]
ushr v29.2d,v22.2d,#26
xtn v27.2s,v22.2d
ushr v30.2d,v19.2d,#26
and v19.16b,v19.16b,v31.16b
add v23.2d,v23.2d,v29.2d // h3 -> h4
bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
add v20.2d,v20.2d,v30.2d // h0 -> h1
ushr v29.2d,v23.2d,#26
xtn v28.2s,v23.2d
ushr v30.2d,v20.2d,#26
xtn v25.2s,v20.2d
bic v28.2s,#0xfc,lsl#24
add v21.2d,v21.2d,v30.2d // h1 -> h2
add v19.2d,v19.2d,v29.2d
shl v29.2d,v29.2d,#2
shrn v30.2s,v21.2d,#26
xtn v26.2s,v21.2d
add v19.2d,v19.2d,v29.2d // h4 -> h0
bic v25.2s,#0xfc,lsl#24
add v27.2s,v27.2s,v30.2s // h2 -> h3
bic v26.2s,#0xfc,lsl#24
shrn v29.2s,v19.2d,#26
xtn v24.2s,v19.2d
ushr v30.2s,v27.2s,#26
bic v27.2s,#0xfc,lsl#24
bic v24.2s,#0xfc,lsl#24
add v25.2s,v25.2s,v29.2s // h0 -> h1
add v28.2s,v28.2s,v30.2s // h3 -> h4
b.hi .Loop_neon
.Lskip_loop:
dup v16.2d,v16.d[0]
add v11.2s,v11.2s,v26.2s
////////////////////////////////////////////////////////////////
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
adds x2,x2,#32
b.ne .Long_tail
dup v16.2d,v11.d[0]
add v14.2s,v9.2s,v24.2s
add v17.2s,v12.2s,v27.2s
add v15.2s,v10.2s,v25.2s
add v18.2s,v13.2s,v28.2s
.Long_tail:
dup v14.2d,v14.d[0]
umull2 v19.2d,v16.4s,v6.4s
umull2 v22.2d,v16.4s,v1.4s
umull2 v23.2d,v16.4s,v3.4s
umull2 v21.2d,v16.4s,v0.4s
umull2 v20.2d,v16.4s,v8.4s
dup v15.2d,v15.d[0]
umlal2 v19.2d,v14.4s,v0.4s
umlal2 v21.2d,v14.4s,v3.4s
umlal2 v22.2d,v14.4s,v5.4s
umlal2 v23.2d,v14.4s,v7.4s
umlal2 v20.2d,v14.4s,v1.4s
dup v17.2d,v17.d[0]
umlal2 v19.2d,v15.4s,v8.4s
umlal2 v22.2d,v15.4s,v3.4s
umlal2 v21.2d,v15.4s,v1.4s
umlal2 v23.2d,v15.4s,v5.4s
umlal2 v20.2d,v15.4s,v0.4s
dup v18.2d,v18.d[0]
umlal2 v22.2d,v17.4s,v0.4s
umlal2 v23.2d,v17.4s,v1.4s
umlal2 v19.2d,v17.4s,v4.4s
umlal2 v20.2d,v17.4s,v6.4s
umlal2 v21.2d,v17.4s,v8.4s
umlal2 v22.2d,v18.4s,v8.4s
umlal2 v19.2d,v18.4s,v2.4s
umlal2 v23.2d,v18.4s,v0.4s
umlal2 v20.2d,v18.4s,v4.4s
umlal2 v21.2d,v18.4s,v6.4s
b.eq .Lshort_tail
////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4:r^3 and accumulate
add v9.2s,v9.2s,v24.2s
umlal v22.2d,v11.2s,v1.2s
umlal v19.2d,v11.2s,v6.2s
umlal v23.2d,v11.2s,v3.2s
umlal v20.2d,v11.2s,v8.2s
umlal v21.2d,v11.2s,v0.2s
add v10.2s,v10.2s,v25.2s
umlal v22.2d,v9.2s,v5.2s
umlal v19.2d,v9.2s,v0.2s
umlal v23.2d,v9.2s,v7.2s
umlal v20.2d,v9.2s,v1.2s
umlal v21.2d,v9.2s,v3.2s
add v12.2s,v12.2s,v27.2s
umlal v22.2d,v10.2s,v3.2s
umlal v19.2d,v10.2s,v8.2s
umlal v23.2d,v10.2s,v5.2s
umlal v20.2d,v10.2s,v0.2s
umlal v21.2d,v10.2s,v1.2s
add v13.2s,v13.2s,v28.2s
umlal v22.2d,v12.2s,v0.2s
umlal v19.2d,v12.2s,v4.2s
umlal v23.2d,v12.2s,v1.2s
umlal v20.2d,v12.2s,v6.2s
umlal v21.2d,v12.2s,v8.2s
umlal v22.2d,v13.2s,v8.2s
umlal v19.2d,v13.2s,v2.2s
umlal v23.2d,v13.2s,v0.2s
umlal v20.2d,v13.2s,v4.2s
umlal v21.2d,v13.2s,v6.2s
.Lshort_tail:
////////////////////////////////////////////////////////////////
// horizontal add
addp v22.2d,v22.2d,v22.2d
ldp d8,d9,[sp,#16] // meet ABI requirements
addp v19.2d,v19.2d,v19.2d
ldp d10,d11,[sp,#32]
addp v23.2d,v23.2d,v23.2d
ldp d12,d13,[sp,#48]
addp v20.2d,v20.2d,v20.2d
ldp d14,d15,[sp,#64]
addp v21.2d,v21.2d,v21.2d
////////////////////////////////////////////////////////////////
// lazy reduction, but without narrowing
ushr v29.2d,v22.2d,#26
and v22.16b,v22.16b,v31.16b
ushr v30.2d,v19.2d,#26
and v19.16b,v19.16b,v31.16b
add v23.2d,v23.2d,v29.2d // h3 -> h4
add v20.2d,v20.2d,v30.2d // h0 -> h1
ushr v29.2d,v23.2d,#26
and v23.16b,v23.16b,v31.16b
ushr v30.2d,v20.2d,#26
and v20.16b,v20.16b,v31.16b
add v21.2d,v21.2d,v30.2d // h1 -> h2
add v19.2d,v19.2d,v29.2d
shl v29.2d,v29.2d,#2
ushr v30.2d,v21.2d,#26
and v21.16b,v21.16b,v31.16b
add v19.2d,v19.2d,v29.2d // h4 -> h0
add v22.2d,v22.2d,v30.2d // h2 -> h3
ushr v29.2d,v19.2d,#26
and v19.16b,v19.16b,v31.16b
ushr v30.2d,v22.2d,#26
and v22.16b,v22.16b,v31.16b
add v20.2d,v20.2d,v29.2d // h0 -> h1
add v23.2d,v23.2d,v30.2d // h3 -> h4
////////////////////////////////////////////////////////////////
// write the result, can be partially reduced
st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
st1 {v23.s}[0],[x0]
.Lno_data_neon:
ldr x29,[sp],#80
ret
.size poly1305_blocks_neon,.-poly1305_blocks_neon
.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
ldr x17,[x0,#24]
cbz x17,poly1305_emit
ldp w10,w11,[x0] // load hash value base 2^26
ldp w12,w13,[x0,#8]
ldr w14,[x0,#16]
add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
lsr x5,x12,#12
adds x4,x4,x12,lsl#52
add x5,x5,x13,lsl#14
adc x5,x5,xzr
lsr x6,x14,#24
adds x5,x5,x14,lsl#40
adc x6,x6,xzr // can be partially reduced...
ldp x10,x11,[x2] // load nonce
and x12,x6,#-4 // ... so reduce
add x12,x12,x6,lsr#2
and x6,x6,#3
adds x4,x4,x12
adcs x5,x5,xzr
adc x6,x6,xzr
adds x12,x4,#5 // compare to modulus
adcs x13,x5,xzr
adc x14,x6,xzr
tst x14,#-4 // see if it's carried/borrowed
csel x4,x4,x12,eq
csel x5,x5,x13,eq
#ifdef __ARMEB__
ror x10,x10,#32 // flip nonce words
ror x11,x11,#32
#endif
adds x4,x4,x10 // accumulate nonce
adc x5,x5,x11
#ifdef __ARMEB__
rev x4,x4 // flip output bytes
rev x5,x5
#endif
stp x4,x5,[x1] // write result
ret
.size poly1305_emit_neon,.-poly1305_emit_neon
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long OPENSSL_armcap_P-.
#else
.quad OPENSSL_armcap_P-.
#endif
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,13 +1,82 @@
/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from sha512-armv8.pl. */
#include "arm_arch.h"
// Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
// ====================================================================
// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// project. The module is, however, dual licensed under OpenSSL and
// CRYPTOGAMS licenses depending on where you obtain it. For further
// details see http://www.openssl.org/~appro/cryptogams/.
//
// Permission to use under GPLv2 terms is granted.
// ====================================================================
//
// SHA256/512 for ARMv8.
//
// Performance in cycles per processed byte and improvement coefficient
// over code generated with "default" compiler:
//
// SHA256-hw SHA256(*) SHA512
// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
// Denver 2.01 10.5 (+26%) 6.70 (+8%)
// X-Gene 20.0 (+100%) 12.8 (+300%(***))
// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
//
// (*) Software SHA256 results are of lesser relevance, presented
// mostly for informational purposes.
// (**) The result is a trade-off: it's possible to improve it by
// 10% (or by 1 cycle per round), but at the cost of 20% loss
// on Cortex-A53 (or by 4 cycles per round).
// (***) Super-impressive coefficients over gcc-generated code are
// indication of some compiler "pathology", most notably code
// generated with -mgeneral-regs-only is significantly faster
// and the gap is only 40-90%.
//
// October 2016.
//
// Originally it was reckoned that it makes no sense to implement NEON
// version of SHA256 for 64-bit processors. This is because performance
// improvement on most wide-spread Cortex-A5x processors was observed
// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
// observed that 32-bit NEON SHA256 performs significantly better than
// 64-bit scalar version on *some* of the more recent processors. As
// result 64-bit NEON version of SHA256 was added to provide best
// all-round performance. For example it executes ~30% faster on X-Gene
// and Mongoose. [For reference, NEON version of SHA512 is bound to
// deliver much less improvement, likely *negative* on Cortex-A5x.
// Which is why NEON support is limited to SHA256.]
#ifndef __KERNEL__
# include "arm_arch.h"
#endif
.text
.globl sha512_block_data_order
.type sha512_block_data_order,%function
.align 6
sha512_block_data_order:
#ifndef __KERNEL__
# ifdef __ILP32__
ldrsw x16,.LOPENSSL_armcap_P
# else
ldr x16,.LOPENSSL_armcap_P
# endif
adr x17,.LOPENSSL_armcap_P
add x16,x16,x17
ldr w16,[x16]
tst w16,#ARMV8_SHA512
b.ne .Lv8_entry
#endif
stp x29,x30,[sp,#-128]!
add x29,sp,#0
@ -23,7 +92,7 @@ sha512_block_data_order:
ldp x24,x25,[x0,#4*8]
add x2,x1,x2,lsl#7 // end of input
ldp x26,x27,[x0,#6*8]
adr x30,K512
adr x30,.LK512
stp x0,x2,[x29,#96]
.Loop:
@ -31,7 +100,7 @@ sha512_block_data_order:
ldr x19,[x30],#8 // *K++
eor x28,x21,x22 // magic seed
str x1,[x29,#112]
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x3,x3 // 0
#endif
ror x16,x24,#14
@ -54,7 +123,7 @@ sha512_block_data_order:
add x27,x27,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x27,x27,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x4,x4 // 1
#endif
ldp x5,x6,[x1],#2*8
@ -79,7 +148,7 @@ sha512_block_data_order:
add x26,x26,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x26,x26,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x5,x5 // 2
#endif
add x26,x26,x17 // h+=Sigma0(a)
@ -103,7 +172,7 @@ sha512_block_data_order:
add x25,x25,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x25,x25,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x6,x6 // 3
#endif
ldp x7,x8,[x1],#2*8
@ -128,7 +197,7 @@ sha512_block_data_order:
add x24,x24,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x24,x24,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x7,x7 // 4
#endif
add x24,x24,x17 // h+=Sigma0(a)
@ -152,7 +221,7 @@ sha512_block_data_order:
add x23,x23,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x23,x23,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x8,x8 // 5
#endif
ldp x9,x10,[x1],#2*8
@ -177,7 +246,7 @@ sha512_block_data_order:
add x22,x22,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x22,x22,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x9,x9 // 6
#endif
add x22,x22,x17 // h+=Sigma0(a)
@ -201,7 +270,7 @@ sha512_block_data_order:
add x21,x21,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x21,x21,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x10,x10 // 7
#endif
ldp x11,x12,[x1],#2*8
@ -226,7 +295,7 @@ sha512_block_data_order:
add x20,x20,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x20,x20,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x11,x11 // 8
#endif
add x20,x20,x17 // h+=Sigma0(a)
@ -250,7 +319,7 @@ sha512_block_data_order:
add x27,x27,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x27,x27,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x12,x12 // 9
#endif
ldp x13,x14,[x1],#2*8
@ -275,7 +344,7 @@ sha512_block_data_order:
add x26,x26,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x26,x26,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x13,x13 // 10
#endif
add x26,x26,x17 // h+=Sigma0(a)
@ -299,7 +368,7 @@ sha512_block_data_order:
add x25,x25,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x25,x25,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x14,x14 // 11
#endif
ldp x15,x0,[x1],#2*8
@ -325,7 +394,7 @@ sha512_block_data_order:
add x24,x24,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x24,x24,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x15,x15 // 12
#endif
add x24,x24,x17 // h+=Sigma0(a)
@ -350,7 +419,7 @@ sha512_block_data_order:
add x23,x23,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x23,x23,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x0,x0 // 13
#endif
ldp x1,x2,[x1]
@ -376,7 +445,7 @@ sha512_block_data_order:
add x22,x22,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x22,x22,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x1,x1 // 14
#endif
ldr x6,[sp,#24]
@ -402,7 +471,7 @@ sha512_block_data_order:
add x21,x21,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x21,x21,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x2,x2 // 15
#endif
ldr x7,[sp,#0]
@ -971,53 +1040,581 @@ sha512_block_data_order:
.size sha512_block_data_order,.-sha512_block_data_order
.align 6
.type K512,%object
K512:
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0 // terminator
.size K512,.-K512
.type .LK512,%object
.LK512:
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0 // terminator
.size .LK512,.-.LK512
#ifndef __KERNEL__
.align 3
.LOPENSSL_armcap_P:
.quad OPENSSL_armcap_P-.
.asciz "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
# ifdef __ILP32__
.long OPENSSL_armcap_P-.
# else
.quad OPENSSL_armcap_P-.
# endif
#endif
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#ifndef __KERNEL__
.type sha512_block_armv8,%function
.align 6
sha512_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context
adr x3,.LK512
rev64 v16.16b,v16.16b
rev64 v17.16b,v17.16b
rev64 v18.16b,v18.16b
rev64 v19.16b,v19.16b
rev64 v20.16b,v20.16b
rev64 v21.16b,v21.16b
rev64 v22.16b,v22.16b
rev64 v23.16b,v23.16b
b .Loop_hw
.align 4
.Loop_hw:
ld1 {v24.2d},[x3],#16
subs x2,x2,#1
sub x4,x1,#128
orr v26.16b,v0.16b,v0.16b // offload
orr v27.16b,v1.16b,v1.16b
orr v28.16b,v2.16b,v2.16b
orr v29.16b,v3.16b,v3.16b
csel x1,x1,x4,ne // conditional rewind
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v16.2d
ld1 {v16.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
rev64 v16.16b,v16.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
ld1 {v24.2d},[x3],#16
add v25.2d,v25.2d,v17.2d
ld1 {v17.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
rev64 v17.16b,v17.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v18.2d
ld1 {v18.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
rev64 v18.16b,v18.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
ld1 {v24.2d},[x3],#16
add v25.2d,v25.2d,v19.2d
ld1 {v19.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
rev64 v19.16b,v19.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v20.2d
ld1 {v20.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
rev64 v20.16b,v20.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
ld1 {v24.2d},[x3],#16
add v25.2d,v25.2d,v21.2d
ld1 {v21.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
rev64 v21.16b,v21.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v22.2d
ld1 {v22.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
rev64 v22.16b,v22.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
sub x3,x3,#80*8 // rewind
add v25.2d,v25.2d,v23.2d
ld1 {v23.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
rev64 v23.16b,v23.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v0.2d,v0.2d,v26.2d // accumulate
add v1.2d,v1.2d,v27.2d
add v2.2d,v2.2d,v28.2d
add v3.2d,v3.2d,v29.2d
cbnz x2,.Loop_hw
st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context
ldr x29,[sp],#16
ret
.size sha512_block_armv8,.-sha512_block_armv8
#endif
#ifndef __KERNEL__
.comm OPENSSL_armcap_P,4,4
#endif

File diff suppressed because it is too large Load Diff