Regen assembly files for aarch64.
commit bde62812ae
parent 0633b14ba1
@@ -10,19 +10,35 @@

.PATH: ${LCRYPTO_SRC}/crypto \
${LCRYPTO_SRC}/crypto/aes/asm \
${LCRYPTO_SRC}/crypto/bn/asm \
${LCRYPTO_SRC}/crypto/chacha/asm \
${LCRYPTO_SRC}/crypto/ec/asm \
${LCRYPTO_SRC}/crypto/modes/asm \
${LCRYPTO_SRC}/crypto/poly1305/asm \
${LCRYPTO_SRC}/crypto/sha/asm

PERLPATH= -I${LCRYPTO_SRC}/crypto/perlasm

# aes
SRCS= aesv8-armx.pl
SRCS= aesv8-armx.pl vpaes-armv8.pl

# bn
SRCS+= armv8-mont.pl

# chacha
SRCS+= chacha-armv8.pl

# ec
SRCS+= ecp_nistz256-armv8.pl

# modes
SRCS+= ghashv8-armx.pl

# poly1305
SRCS+= poly1305-armv8.pl

# sha
SRCS+= sha1-armv8.pl sha512-armv8.pl
SRCS+= keccak1600-armv8.pl sha1-armv8.pl sha512-armv8.pl

ASM= ${SRCS:R:S/$/.S/} sha256-armv8.S

@@ -32,13 +48,13 @@ CLEANFILES= ${ASM} ${SRCS:R:S/$/.s/} sha256-armv8.s
.SUFFIXES: .pl

sha256-armv8.S: sha512-armv8.pl
env CC=cc perl ${.ALLSRC} 64 ${.TARGET:R:S/$/.s/}
env CC=cc perl ${.ALLSRC} linux64 ${.TARGET:R:S/$/.s/}
( echo '/* $$'FreeBSD'$$ */' ;\
echo '/* Do not modify. This file is auto-generated from ${.ALLSRC:T:R:S/$/.pl/}. */' ;\
cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}

.pl.S:
env CC=cc perl ${.IMPSRC} 64 ${.TARGET:R:S/$/.s/}
env CC=cc perl ${.IMPSRC} linux64 ${.TARGET:R:S/$/.s/}
( echo '/* $$'FreeBSD'$$ */' ;\
echo '/* Do not modify. This file is auto-generated from ${.IMPSRC:T:R:S/$/.pl/}. */' ;\
cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}
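For reference, the updated rules switch the perlasm flavour from the bare 64 to
linux64 and prepend the FreeBSD tag plus a do-not-modify banner to each
regenerated file. With a hypothetical chacha-armv8.S target, the .pl.S suffix
rule expands to roughly the following shell commands (file names illustrative;
the $$ in the Makefile escapes to a literal $):

env CC=cc perl ${LCRYPTO_SRC}/crypto/chacha/asm/chacha-armv8.pl linux64 chacha-armv8.s
( echo '/* $FreeBSD$ */' ;\
echo '/* Do not modify. This file is auto-generated from chacha-armv8.pl. */' ;\
cat chacha-armv8.s) > chacha-armv8.S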
@@ -160,10 +176,10 @@ CLEANFILES= ${ASM} ${SRCS:R:S/$/.s/}
aes-armv4.S: aes-armv4.pl
( echo '/* $$'FreeBSD'$$ */' ;\
echo '/* Do not modify. This file is auto-generated from ${.ALLSRC:T}. */' ;\
env CC=cc perl ${.ALLSRC} elf ) > ${.TARGET}
env CC=cc perl ${.ALLSRC} linux32 ) > ${.TARGET}

.pl.S:
env CC=cc perl ${.IMPSRC} elf ${.TARGET:R:S/$/.s/}
env CC=cc perl ${.IMPSRC} linux32 ${.TARGET:R:S/$/.s/}
( echo '/* $$'FreeBSD'$$ */' ;\
echo '/* Do not modify. This file is auto-generated from ${.IMPSRC:T:R:S/$/.pl/}. */' ;\
cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}
@@ -5,7 +5,7 @@
#if __ARM_MAX_ARCH__>=7
.text
.align 5
rcon:
.Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
@@ -30,7 +30,7 @@ aes_v8_set_encrypt_key:
tst w1,#0x3f
b.ne .Lenc_key_abort

adr x3,rcon
adr x3,.Lrcon
cmp w1,#192

eor v0.16b,v0.16b,v0.16b
@@ -54,7 +54,7 @@ aes_v8_set_encrypt_key:
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
@@ -71,7 +71,7 @@ aes_v8_set_encrypt_key:
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
@@ -85,7 +85,7 @@ aes_v8_set_encrypt_key:
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
st1 {v3.4s},[x2]
@@ -116,7 +116,7 @@ aes_v8_set_encrypt_key:

dup v5.4s,v3.s[3]
eor v5.16b,v5.16b,v4.16b
eor v6.16b,v6.16b,v1.16b
eor v6.16b,v6.16b,v1.16b
ext v4.16b,v0.16b,v4.16b,#12
shl v1.16b,v1.16b,#1
eor v4.16b,v4.16b,v5.16b
@@ -147,7 +147,7 @@ aes_v8_set_encrypt_key:
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
@@ -291,13 +291,13 @@ aes_v8_cbc_encrypt:
ld1 {v6.16b},[x4]
ld1 {v0.16b},[x0],x8

ld1 {v16.4s-v17.4s},[x3] // load key schedule...
ld1 {v16.4s,v17.4s},[x3] // load key schedule...
sub w5,w5,#6
add x7,x3,x5,lsl#4 // pointer to last 7 round keys
sub w5,w5,#2
ld1 {v18.4s-v19.4s},[x7],#32
ld1 {v20.4s-v21.4s},[x7],#32
ld1 {v22.4s-v23.4s},[x7],#32
ld1 {v18.4s,v19.4s},[x7],#32
ld1 {v20.4s,v21.4s},[x7],#32
ld1 {v22.4s,v23.4s},[x7],#32
ld1 {v7.4s},[x7]

add x7,x3,#32
@@ -309,7 +309,7 @@ aes_v8_cbc_encrypt:
eor v5.16b,v16.16b,v7.16b
b.eq .Lcbc_enc128

ld1 {v2.4s-v3.4s},[x7]
ld1 {v2.4s,v3.4s},[x7]
add x7,x3,#16
add x6,x3,#16*4
add x12,x3,#16*5
@@ -323,7 +323,7 @@ aes_v8_cbc_encrypt:
.Loop_cbc_enc:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
st1 {v6.16b},[x1],#16
st1 {v6.16b},[x1],#16
.Lenter_cbc_enc:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
@@ -347,21 +347,21 @@ aes_v8_cbc_encrypt:
.Lcbc_enc192:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
subs x2,x2,#16
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
csel x8,xzr,x8,eq
csel x8,xzr,x8,eq
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
eor v16.16b,v16.16b,v5.16b
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v0.16b,v23.16b
@@ -373,35 +373,35 @@ aes_v8_cbc_encrypt:

.align 5
.Lcbc_enc128:
ld1 {v2.4s-v3.4s},[x7]
ld1 {v2.4s,v3.4s},[x7]
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
b .Lenter_cbc_enc128
.Loop_cbc_enc128:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
st1 {v6.16b},[x1],#16
st1 {v6.16b},[x1],#16
.Lenter_cbc_enc128:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
subs x2,x2,#16
aese v0.16b,v2.16b
aesmc v0.16b,v0.16b
csel x8,xzr,x8,eq
csel x8,xzr,x8,eq
aese v0.16b,v3.16b
aesmc v0.16b,v0.16b
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
eor v16.16b,v16.16b,v5.16b
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v23.16b
eor v6.16b,v0.16b,v7.16b
b.hs .Loop_cbc_enc128
@@ -448,58 +448,58 @@ aes_v8_cbc_encrypt:
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
eor v4.16b,v6.16b,v7.16b
subs x2,x2,#0x30
eor v5.16b,v2.16b,v7.16b
csel x6,x2,x6,lo // x6, w6, is zero at this point
eor v4.16b,v6.16b,v7.16b
subs x2,x2,#0x30
eor v5.16b,v2.16b,v7.16b
csel x6,x2,x6,lo // x6, w6, is zero at this point
aesd v0.16b,v17.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
add x0,x0,x6 // x0 is adjusted in such way that
eor v17.16b,v3.16b,v7.16b
add x0,x0,x6 // x0 is adjusted in such way that
// at exit from the loop v1.16b-v18.16b
// are loaded with last "words"
orr v6.16b,v19.16b,v19.16b
mov x7,x3
orr v6.16b,v19.16b,v19.16b
mov x7,x3
aesd v0.16b,v20.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
ld1 {v2.16b},[x0],#16
ld1 {v2.16b},[x0],#16
aesd v0.16b,v21.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
ld1 {v3.16b},[x0],#16
aesd v0.16b,v22.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
ld1 {v19.16b},[x0],#16
ld1 {v19.16b},[x0],#16
aesd v0.16b,v23.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
add w6,w5,#2
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
add w6,w5,#2
eor v4.16b,v4.16b,v0.16b
eor v5.16b,v5.16b,v1.16b
eor v18.16b,v18.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v4.16b},[x1],#16
orr v0.16b,v2.16b,v2.16b
orr v0.16b,v2.16b,v2.16b
st1 {v5.16b},[x1],#16
orr v1.16b,v3.16b,v3.16b
orr v1.16b,v3.16b,v3.16b
st1 {v18.16b},[x1],#16
orr v18.16b,v19.16b,v19.16b
orr v18.16b,v19.16b,v19.16b
b.hs .Loop3x_cbc_dec

cmn x2,#0x30
@@ -532,30 +532,30 @@ aes_v8_cbc_encrypt:
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
cmn x2,#0x20
cmn x2,#0x20
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
eor v5.16b,v6.16b,v7.16b
eor v5.16b,v6.16b,v7.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
eor v17.16b,v3.16b,v7.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
b.eq .Lcbc_dec_one
eor v5.16b,v5.16b,v1.16b
eor v17.16b,v17.16b,v18.16b
orr v6.16b,v19.16b,v19.16b
orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16
st1 {v17.16b},[x1],#16
b .Lcbc_done

.Lcbc_dec_one:
eor v5.16b,v5.16b,v18.16b
orr v6.16b,v19.16b,v19.16b
orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16

.Lcbc_done:
@@ -568,181 +568,181 @@ aes_v8_cbc_encrypt:
.type aes_v8_ctr32_encrypt_blocks,%function
.align 5
aes_v8_ctr32_encrypt_blocks:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ldr w5,[x3,#240]
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ldr w5,[x3,#240]

ldr w8, [x4, #12]
ld1 {v0.4s},[x4]
ldr w8, [x4, #12]
ld1 {v0.4s},[x4]

ld1 {v16.4s-v17.4s},[x3] // load key schedule...
sub w5,w5,#4
mov x12,#16
cmp x2,#2
add x7,x3,x5,lsl#4 // pointer to last 5 round keys
sub w5,w5,#2
ld1 {v20.4s-v21.4s},[x7],#32
ld1 {v22.4s-v23.4s},[x7],#32
ld1 {v7.4s},[x7]
add x7,x3,#32
mov w6,w5
ld1 {v16.4s,v17.4s},[x3] // load key schedule...
sub w5,w5,#4
mov x12,#16
cmp x2,#2
add x7,x3,x5,lsl#4 // pointer to last 5 round keys
sub w5,w5,#2
ld1 {v20.4s,v21.4s},[x7],#32
ld1 {v22.4s,v23.4s},[x7],#32
ld1 {v7.4s},[x7]
add x7,x3,#32
mov w6,w5
csel x12,xzr,x12,lo
#ifndef __ARMEB__
rev w8, w8
rev w8, w8
#endif
orr v1.16b,v0.16b,v0.16b
add w10, w8, #1
orr v18.16b,v0.16b,v0.16b
add w8, w8, #2
orr v6.16b,v0.16b,v0.16b
rev w10, w10
mov v1.s[3],w10
b.ls .Lctr32_tail
rev w12, w8
sub x2,x2,#3 // bias
mov v18.s[3],w12
b .Loop3x_ctr32
orr v1.16b,v0.16b,v0.16b
add w10, w8, #1
orr v18.16b,v0.16b,v0.16b
add w8, w8, #2
orr v6.16b,v0.16b,v0.16b
rev w10, w10
mov v1.s[3],w10
b.ls .Lctr32_tail
rev w12, w8
sub x2,x2,#3 // bias
mov v18.s[3],w12
b .Loop3x_ctr32

.align 4
.Loop3x_ctr32:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
aese v18.16b,v17.16b
aesmc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt .Loop3x_ctr32
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
aese v18.16b,v17.16b
aesmc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt .Loop3x_ctr32

aese v0.16b,v16.16b
aesmc v4.16b,v0.16b
aese v1.16b,v16.16b
aesmc v5.16b,v1.16b
ld1 {v2.16b},[x0],#16
orr v0.16b,v6.16b,v6.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
orr v1.16b,v6.16b,v6.16b
aese v4.16b,v17.16b
aesmc v4.16b,v4.16b
aese v5.16b,v17.16b
aesmc v5.16b,v5.16b
ld1 {v19.16b},[x0],#16
mov x7,x3
aese v18.16b,v17.16b
aesmc v17.16b,v18.16b
orr v18.16b,v6.16b,v6.16b
add w9,w8,#1
aese v4.16b,v20.16b
aesmc v4.16b,v4.16b
aese v5.16b,v20.16b
aesmc v5.16b,v5.16b
eor v2.16b,v2.16b,v7.16b
add w10,w8,#2
aese v17.16b,v20.16b
aesmc v17.16b,v17.16b
eor v3.16b,v3.16b,v7.16b
add w8,w8,#3
aese v4.16b,v21.16b
aesmc v4.16b,v4.16b
aese v5.16b,v21.16b
aesmc v5.16b,v5.16b
eor v19.16b,v19.16b,v7.16b
rev w9,w9
aese v17.16b,v21.16b
aesmc v17.16b,v17.16b
mov v0.s[3], w9
rev w10,w10
aese v4.16b,v22.16b
aesmc v4.16b,v4.16b
aese v5.16b,v22.16b
aesmc v5.16b,v5.16b
mov v1.s[3], w10
rev w12,w8
aese v17.16b,v22.16b
aesmc v17.16b,v17.16b
mov v18.s[3], w12
subs x2,x2,#3
aese v4.16b,v23.16b
aese v5.16b,v23.16b
aese v17.16b,v23.16b
aese v0.16b,v16.16b
aesmc v4.16b,v0.16b
aese v1.16b,v16.16b
aesmc v5.16b,v1.16b
ld1 {v2.16b},[x0],#16
orr v0.16b,v6.16b,v6.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
orr v1.16b,v6.16b,v6.16b
aese v4.16b,v17.16b
aesmc v4.16b,v4.16b
aese v5.16b,v17.16b
aesmc v5.16b,v5.16b
ld1 {v19.16b},[x0],#16
mov x7,x3
aese v18.16b,v17.16b
aesmc v17.16b,v18.16b
orr v18.16b,v6.16b,v6.16b
add w9,w8,#1
aese v4.16b,v20.16b
aesmc v4.16b,v4.16b
aese v5.16b,v20.16b
aesmc v5.16b,v5.16b
eor v2.16b,v2.16b,v7.16b
add w10,w8,#2
aese v17.16b,v20.16b
aesmc v17.16b,v17.16b
eor v3.16b,v3.16b,v7.16b
add w8,w8,#3
aese v4.16b,v21.16b
aesmc v4.16b,v4.16b
aese v5.16b,v21.16b
aesmc v5.16b,v5.16b
eor v19.16b,v19.16b,v7.16b
rev w9,w9
aese v17.16b,v21.16b
aesmc v17.16b,v17.16b
mov v0.s[3], w9
rev w10,w10
aese v4.16b,v22.16b
aesmc v4.16b,v4.16b
aese v5.16b,v22.16b
aesmc v5.16b,v5.16b
mov v1.s[3], w10
rev w12,w8
aese v17.16b,v22.16b
aesmc v17.16b,v17.16b
mov v18.s[3], w12
subs x2,x2,#3
aese v4.16b,v23.16b
aese v5.16b,v23.16b
aese v17.16b,v23.16b

eor v2.16b,v2.16b,v4.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
st1 {v2.16b},[x1],#16
eor v3.16b,v3.16b,v5.16b
mov w6,w5
st1 {v3.16b},[x1],#16
eor v19.16b,v19.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v19.16b},[x1],#16
b.hs .Loop3x_ctr32
eor v2.16b,v2.16b,v4.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
st1 {v2.16b},[x1],#16
eor v3.16b,v3.16b,v5.16b
mov w6,w5
st1 {v3.16b},[x1],#16
eor v19.16b,v19.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v19.16b},[x1],#16
b.hs .Loop3x_ctr32

adds x2,x2,#3
b.eq .Lctr32_done
cmp x2,#1
mov x12,#16
adds x2,x2,#3
b.eq .Lctr32_done
cmp x2,#1
mov x12,#16
csel x12,xzr,x12,eq

.Lctr32_tail:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v17.4s},[x7],#16
b.gt .Lctr32_tail
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v17.4s},[x7],#16
b.gt .Lctr32_tail

aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v2.16b},[x0],x12
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v1.16b,v20.16b
aesmc v1.16b,v1.16b
ld1 {v3.16b},[x0]
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v1.16b,v21.16b
aesmc v1.16b,v1.16b
eor v2.16b,v2.16b,v7.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v1.16b,v22.16b
aesmc v1.16b,v1.16b
eor v3.16b,v3.16b,v7.16b
aese v0.16b,v23.16b
aese v1.16b,v23.16b
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v2.16b},[x0],x12
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v1.16b,v20.16b
aesmc v1.16b,v1.16b
ld1 {v3.16b},[x0]
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v1.16b,v21.16b
aesmc v1.16b,v1.16b
eor v2.16b,v2.16b,v7.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v1.16b,v22.16b
aesmc v1.16b,v1.16b
eor v3.16b,v3.16b,v7.16b
aese v0.16b,v23.16b
aese v1.16b,v23.16b

cmp x2,#1
eor v2.16b,v2.16b,v0.16b
eor v3.16b,v3.16b,v1.16b
st1 {v2.16b},[x1],#16
b.eq .Lctr32_done
st1 {v3.16b},[x1]
cmp x2,#1
eor v2.16b,v2.16b,v0.16b
eor v3.16b,v3.16b,v1.16b
st1 {v2.16b},[x1],#16
b.eq .Lctr32_done
st1 {v3.16b},[x1]

.Lctr32_done:
ldr x29,[sp],#16
ldr x29,[sp],#16
ret
.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
#endif
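For reference, the ctr32 routine above implements standard CTR mode; in
notation of my own (not text from the generated file):

    $C_i = P_i \oplus E_K(\mathrm{IV}_{96} \,\|\, \mathrm{BE32}(c + i)), \qquad i = 0, 1, \ldots$

The rev/add/mov v.s[3] sequences maintain the big-endian 32-bit counter in the
last word of each block, and .Loop3x_ctr32 keeps three aese/aesmc dependency
chains in flight per iteration to hide the instruction latency.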
secure/lib/libcrypto/aarch64/armv8-mont.S (new file, 1406 lines; diff suppressed because it is too large)
secure/lib/libcrypto/aarch64/chacha-armv8.S (new file, 1970 lines; diff suppressed because it is too large)
secure/lib/libcrypto/aarch64/ecp_nistz256-armv8.S (new file, 4227 lines; diff suppressed because it is too large)
@@ -2,227 +2,552 @@
/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
.global gcm_init_v8
.globl gcm_init_v8
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
ld1 {v17.2d},[x1] //load input H
movi v19.16b,#0xe1
ld1 {v17.2d},[x1] //load input H
movi v19.16b,#0xe1
shl v19.2d,v19.2d,#57 //0xc2.0
ext v3.16b,v17.16b,v17.16b,#8
ext v3.16b,v17.16b,v17.16b,#8
ushr v18.2d,v19.2d,#63
dup v17.4s,v17.s[1]
ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
dup v17.4s,v17.s[1]
ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
ushr v18.2d,v3.2d,#63
sshr v17.4s,v17.4s,#31 //broadcast carry bit
and v18.16b,v18.16b,v16.16b
and v18.16b,v18.16b,v16.16b
shl v3.2d,v3.2d,#1
ext v18.16b,v18.16b,v18.16b,#8
and v16.16b,v16.16b,v17.16b
orr v3.16b,v3.16b,v18.16b //H<<<=1
eor v20.16b,v3.16b,v16.16b //twisted H
st1 {v20.2d},[x0],#16 //store Htable[0]
ext v18.16b,v18.16b,v18.16b,#8
and v16.16b,v16.16b,v17.16b
orr v3.16b,v3.16b,v18.16b //H<<<=1
eor v20.16b,v3.16b,v16.16b //twisted H
st1 {v20.2d},[x0],#16 //store Htable[0]

//calculate H^2
ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
pmull v0.1q,v20.1d,v20.1d
eor v16.16b,v16.16b,v20.16b
eor v16.16b,v16.16b,v20.16b
pmull2 v2.1q,v20.2d,v20.2d
pmull v1.1q,v16.1d,v16.1d

ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase

ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
eor v0.16b,v1.16b,v18.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v22.16b,v0.16b,v18.16b
eor v18.16b,v18.16b,v2.16b
eor v22.16b,v0.16b,v18.16b

ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
eor v17.16b,v17.16b,v22.16b
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v21.2d-v22.2d},[x0] //store Htable[1..2]
ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
eor v17.16b,v17.16b,v22.16b
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2]
//calculate H^3 and H^4
pmull v0.1q,v20.1d, v22.1d
pmull v5.1q,v22.1d,v22.1d
pmull2 v2.1q,v20.2d, v22.2d
pmull2 v7.1q,v22.2d,v22.2d
pmull v1.1q,v16.1d,v17.1d
pmull v6.1q,v17.1d,v17.1d

ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
ext v17.16b,v5.16b,v7.16b,#8
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v16.16b
eor v4.16b,v5.16b,v7.16b
eor v6.16b,v6.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
eor v6.16b,v6.16b,v4.16b
pmull v4.1q,v5.1d,v19.1d

ins v2.d[0],v1.d[1]
ins v7.d[0],v6.d[1]
ins v1.d[1],v0.d[0]
ins v6.d[1],v5.d[0]
eor v0.16b,v1.16b,v18.16b
eor v5.16b,v6.16b,v4.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
ext v4.16b,v5.16b,v5.16b,#8
pmull v0.1q,v0.1d,v19.1d
pmull v5.1q,v5.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v4.16b,v4.16b,v7.16b
eor v20.16b, v0.16b,v18.16b //H^3
eor v22.16b,v5.16b,v4.16b //H^4

ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing
ext v17.16b,v22.16b,v22.16b,#8
eor v16.16b,v16.16b,v20.16b
eor v17.16b,v17.16b,v22.16b
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5]
ret
.size gcm_init_v8,.-gcm_init_v8
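For context, gcm_init_v8 precomputes the hash-key powers used by the bulk
routines below. In standard GHASH notation (a sketch, not text from the file),
multiplication is in $GF(2^{128})$ modulo

    $g(x) = x^{128} + x^7 + x^2 + x + 1$

and each step folds a block as $X_i = (X_{i-1} \oplus I_i) \cdot H$. The 0xe1
splat shifted left by 57 composes the 0xc2... reduction constant for the
bit-reflected representation, and the table now holds $H, H^2$ plus the newly
added $H^3, H^4$ (Htable[3..5]) consumed by the 4-way routine further down.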
.global gcm_gmult_v8
.globl gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
ld1 {v17.2d},[x0] //load Xi
movi v19.16b,#0xe1
ld1 {v20.2d-v21.2d},[x1] //load twisted H, ...
ld1 {v17.2d},[x0] //load Xi
movi v19.16b,#0xe1
ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
shl v19.2d,v19.2d,#57
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
ext v3.16b,v17.16b,v17.16b,#8
ext v3.16b,v17.16b,v17.16b,#8

pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)

ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction

ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
eor v0.16b,v1.16b,v18.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
rev64 v0.16b,v0.16b
#endif
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi

ret
.size gcm_gmult_v8,.-gcm_gmult_v8
.global gcm_ghash_v8
.globl gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
ld1 {v0.2d},[x0] //load [rotated] Xi
cmp x3,#64
b.hs .Lgcm_ghash_v8_4x
ld1 {v0.2d},[x0] //load [rotated] Xi
//"[rotated]" means that
//loaded value would have
//to be rotated in order to
//make it appear as in
//alorithm specification
subs x3,x3,#32 //see if x3 is 32 or larger
mov x12,#16 //x12 is used as post-
//algorithm specification
subs x3,x3,#32 //see if x3 is 32 or larger
mov x12,#16 //x12 is used as post-
//increment for input pointer;
//as loop is modulo-scheduled
//x12 is zeroed just in time
//to preclude oversteping
//to preclude overstepping
//inp[len], which means that
//last block[s] are actually
//loaded twice, but last
//copy is not processed
ld1 {v20.2d-v21.2d},[x1],#32 //load twisted H, ..., H^2
movi v19.16b,#0xe1
ld1 {v22.2d},[x1]
ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
movi v19.16b,#0xe1
ld1 {v22.2d},[x1]
csel x12,xzr,x12,eq //is it time to zero x12?
ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
#ifndef __ARMEB__
rev64 v16.16b,v16.16b
rev64 v0.16b,v0.16b
#endif
ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
b.lo .Lodd_tail_v8 //x3 was less than 32
ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
b.lo .Lodd_tail_v8 //x3 was less than 32
ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
ext v7.16b,v17.16b,v17.16b,#8
eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
ext v7.16b,v17.16b,v17.16b,#8
eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
pmull2 v6.1q,v20.2d,v7.2d
b .Loop_mod2x_v8
b .Loop_mod2x_v8

.align 4
.Loop_mod2x_v8:
ext v18.16b,v3.16b,v3.16b,#8
subs x3,x3,#32 //is there more data?
ext v18.16b,v3.16b,v3.16b,#8
subs x3,x3,#32 //is there more data?
pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
csel x12,xzr,x12,lo //is it time to zero x12?

pmull v5.1q,v21.1d,v17.1d
eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
pmull v5.1q,v21.1d,v17.1d
eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
eor v0.16b,v0.16b,v4.16b //accumulate
eor v0.16b,v0.16b,v4.16b //accumulate
pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]

eor v2.16b,v2.16b,v6.16b
csel x12,xzr,x12,eq //is it time to zero x12?
eor v1.16b,v1.16b,v5.16b
eor v2.16b,v2.16b,v6.16b
csel x12,xzr,x12,eq //is it time to zero x12?
eor v1.16b,v1.16b,v5.16b

ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
#ifndef __ARMEB__
rev64 v16.16b,v16.16b
rev64 v16.16b,v16.16b
#endif
eor v1.16b,v1.16b,v18.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction

#ifndef __ARMEB__
rev64 v17.16b,v17.16b
rev64 v17.16b,v17.16b
#endif
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v7.16b,v17.16b,v17.16b,#8
ext v3.16b,v16.16b,v16.16b,#8
eor v0.16b,v1.16b,v18.16b
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
ext v7.16b,v17.16b,v17.16b,#8
ext v3.16b,v16.16b,v16.16b,#8
eor v0.16b,v1.16b,v18.16b
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v3.16b,v3.16b,v18.16b
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
eor v3.16b,v3.16b,v0.16b
pmull2 v6.1q,v20.2d,v7.2d
b.hs .Loop_mod2x_v8 //there was at least 32 more bytes
eor v3.16b,v3.16b,v18.16b
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
eor v3.16b,v3.16b,v0.16b
pmull2 v6.1q,v20.2d,v7.2d
b.hs .Loop_mod2x_v8 //there was at least 32 more bytes

eor v2.16b,v2.16b,v18.16b
ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
adds x3,x3,#32 //re-construct x3
eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
b.eq .Ldone_v8 //is x3 zero?
eor v2.16b,v2.16b,v18.16b
ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
adds x3,x3,#32 //re-construct x3
eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
b.eq .Ldone_v8 //is x3 zero?
.Lodd_tail_v8:
ext v18.16b,v0.16b,v0.16b,#8
eor v3.16b,v3.16b,v0.16b //inp^=Xi
eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
ext v18.16b,v0.16b,v0.16b,#8
eor v3.16b,v3.16b,v0.16b //inp^=Xi
eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi

pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)

ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction

ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
eor v0.16b,v1.16b,v18.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __ARMEB__
rev64 v0.16b,v0.16b
#endif
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi

ret
.size gcm_ghash_v8,.-gcm_ghash_v8
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align 2
.type gcm_ghash_v8_4x,%function
.align 4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
ld1 {v0.2d},[x0] //load [rotated] Xi
ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
movi v19.16b,#0xe1
ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant

ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __ARMEB__
rev64 v0.16b,v0.16b
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v7.16b,v7.16b
rev64 v4.16b,v4.16b
#endif
ext v25.16b,v7.16b,v7.16b,#8
ext v24.16b,v6.16b,v6.16b,#8
ext v23.16b,v5.16b,v5.16b,#8

pmull v29.1q,v20.1d,v25.1d //H·Ii+3
eor v7.16b,v7.16b,v25.16b
pmull2 v31.1q,v20.2d,v25.2d
pmull v30.1q,v21.1d,v7.1d

pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
eor v6.16b,v6.16b,v24.16b
pmull2 v24.1q,v22.2d,v24.2d
pmull2 v6.1q,v21.2d,v6.2d

eor v29.16b,v29.16b,v16.16b
eor v31.16b,v31.16b,v24.16b
eor v30.16b,v30.16b,v6.16b

pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
eor v5.16b,v5.16b,v23.16b
pmull2 v23.1q,v26.2d,v23.2d
pmull v5.1q,v27.1d,v5.1d

eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
eor v30.16b,v30.16b,v5.16b

subs x3,x3,#128
b.lo .Ltail4x

b .Loop4x

.align 4
.Loop4x:
eor v16.16b,v4.16b,v0.16b
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
ext v3.16b,v16.16b,v16.16b,#8
#ifndef __ARMEB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v7.16b,v7.16b
rev64 v4.16b,v4.16b
#endif

pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v28.2d,v3.2d
ext v25.16b,v7.16b,v7.16b,#8
pmull2 v1.1q,v27.2d,v16.2d

eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
ext v24.16b,v6.16b,v6.16b,#8
eor v1.16b,v1.16b,v30.16b
ext v23.16b,v5.16b,v5.16b,#8

ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
pmull v29.1q,v20.1d,v25.1d //H·Ii+3
eor v7.16b,v7.16b,v25.16b
eor v1.16b,v1.16b,v17.16b
pmull2 v31.1q,v20.2d,v25.2d
eor v1.16b,v1.16b,v18.16b
pmull v30.1q,v21.1d,v7.1d

pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
eor v6.16b,v6.16b,v24.16b
pmull2 v24.1q,v22.2d,v24.2d
eor v0.16b,v1.16b,v18.16b
pmull2 v6.1q,v21.2d,v6.2d

eor v29.16b,v29.16b,v16.16b
eor v31.16b,v31.16b,v24.16b
eor v30.16b,v30.16b,v6.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
eor v5.16b,v5.16b,v23.16b
eor v18.16b,v18.16b,v2.16b
pmull2 v23.1q,v26.2d,v23.2d
pmull v5.1q,v27.1d,v5.1d

eor v0.16b,v0.16b,v18.16b
eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
ext v0.16b,v0.16b,v0.16b,#8
eor v30.16b,v30.16b,v5.16b

subs x3,x3,#64
b.hs .Loop4x

.Ltail4x:
eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8

pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v28.2d,v3.2d
pmull2 v1.1q,v27.2d,v16.2d

eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b

adds x3,x3,#64
b.eq .Ldone4x

cmp x3,#32
b.lo .Lone
b.eq .Ltwo
.Lthree:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d,v6.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v4.16b,v4.16b
#endif

pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v24.16b,v6.16b,v6.16b,#8
ext v23.16b,v5.16b,v5.16b,#8
eor v0.16b,v1.16b,v18.16b

pmull v29.1q,v20.1d,v24.1d //H·Ii+2
eor v6.16b,v6.16b,v24.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
pmull2 v31.1q,v20.2d,v24.2d
pmull v30.1q,v21.1d,v6.1d
eor v0.16b,v0.16b,v18.16b
pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
eor v5.16b,v5.16b,v23.16b
ext v0.16b,v0.16b,v0.16b,#8

pmull2 v23.1q,v22.2d,v23.2d
eor v16.16b,v4.16b,v0.16b
pmull2 v5.1q,v21.2d,v5.2d
ext v3.16b,v16.16b,v16.16b,#8

eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
eor v30.16b,v30.16b,v5.16b

pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v26.2d,v3.2d
pmull v1.1q,v27.1d,v16.1d

eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
b .Ldone4x

.align 4
.Ltwo:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
rev64 v5.16b,v5.16b
rev64 v4.16b,v4.16b
#endif

pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v23.16b,v5.16b,v5.16b,#8
eor v0.16b,v1.16b,v18.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8

pmull v29.1q,v20.1d,v23.1d //H·Ii+1
eor v5.16b,v5.16b,v23.16b

eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8

pmull2 v31.1q,v20.2d,v23.2d
pmull v30.1q,v21.1d,v5.1d

pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v22.2d,v3.2d
pmull2 v1.1q,v21.2d,v16.2d

eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
b .Ldone4x

.align 4
.Lone:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
rev64 v4.16b,v4.16b
#endif

pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8

eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8

pmull v0.1q,v20.1d,v3.1d
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v20.2d,v3.2d
pmull v1.1q,v21.1d,v16.1d

.Ldone4x:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b

pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8

#ifndef __ARMEB__
rev64 v0.16b,v0.16b
#endif
st1 {v0.2d},[x0] //write out Xi

ret
.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif
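Per its H·Ii+3 ... H^4·(Xi+Ii) comments, each 64-byte iteration of the new
gcm_ghash_v8_4x entry aggregates four blocks before a single reduction
(notation mine):

    $X_{i+4} = (X_i \oplus I_i) H^4 \oplus I_{i+1} H^3 \oplus I_{i+2} H^2 \oplus I_{i+3} H$

so only one modular reduction is paid per four input blocks.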
secure/lib/libcrypto/aarch64/keccak1600-armv8.S (new file, 1083 lines; diff suppressed because it is too large)
secure/lib/libcrypto/aarch64/poly1305-armv8.S (new file, 866 lines)
@@ -0,0 +1,866 @@
/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.globl poly1305_blocks
.globl poly1305_emit

.globl poly1305_init
.type poly1305_init,%function
.align 5
poly1305_init:
cmp x1,xzr
stp xzr,xzr,[x0] // zero hash value
stp xzr,xzr,[x0,#16] // [along with is_base2_26]

csel x0,xzr,x0,eq
b.eq .Lno_key

#ifdef __ILP32__
ldrsw x11,.LOPENSSL_armcap_P
#else
ldr x11,.LOPENSSL_armcap_P
#endif
adr x10,.LOPENSSL_armcap_P

ldp x7,x8,[x1] // load key
mov x9,#0xfffffffc0fffffff
movk x9,#0x0fff,lsl#48
ldr w17,[x10,x11]
#ifdef __ARMEB__
rev x7,x7 // flip bytes
rev x8,x8
#endif
and x7,x7,x9 // &=0ffffffc0fffffff
and x9,x9,#-4
and x8,x8,x9 // &=0ffffffc0ffffffc
stp x7,x8,[x0,#32] // save key value

tst w17,#ARMV7_NEON

adr x12,poly1305_blocks
adr x7,poly1305_blocks_neon
adr x13,poly1305_emit
adr x8,poly1305_emit_neon

csel x12,x12,x7,eq
csel x13,x13,x8,eq

#ifdef __ILP32__
stp w12,w13,[x2]
#else
stp x12,x13,[x2]
#endif

mov x0,#1
.Lno_key:
ret
.size poly1305_init,.-poly1305_init
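The mov/movk/and sequence above is the standard Poly1305 clamp; written out as
one 128-bit mask (constant reassembled from the code):

    $r \,\&=\, \mathtt{0x0ffffffc0ffffffc0ffffffc0fffffff}$

It clears the top four bits of every 32-bit word of r and the low two bits of
the upper three words, so the high limb r1 is divisible by 4; that is what
makes the s1 = r1 + (r1 >> 2) shortcut used below exact.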

.type poly1305_blocks,%function
.align 5
poly1305_blocks:
ands x2,x2,#-16
b.eq .Lno_data

ldp x4,x5,[x0] // load hash value
ldp x7,x8,[x0,#32] // load key value
ldr x6,[x0,#16]
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
b .Loop

.align 5
.Loop:
ldp x10,x11,[x1],#16 // load input
sub x2,x2,#16
#ifdef __ARMEB__
rev x10,x10
rev x11,x11
#endif
adds x4,x4,x10 // accumulate input
adcs x5,x5,x11

mul x12,x4,x7 // h0*r0
adc x6,x6,x3
umulh x13,x4,x7

mul x10,x5,x9 // h1*5*r1
umulh x11,x5,x9

adds x12,x12,x10
mul x10,x4,x8 // h0*r1
adc x13,x13,x11
umulh x14,x4,x8

adds x13,x13,x10
mul x10,x5,x7 // h1*r0
adc x14,x14,xzr
umulh x11,x5,x7

adds x13,x13,x10
mul x10,x6,x9 // h2*5*r1
adc x14,x14,x11
mul x11,x6,x7 // h2*r0

adds x13,x13,x10
adc x14,x14,x11

and x10,x14,#-4 // final reduction
and x6,x14,#3
add x10,x10,x14,lsr#2
adds x4,x12,x10
adcs x5,x13,xzr
adc x6,x6,xzr

cbnz x2,.Loop

stp x4,x5,[x0] // store hash value
str x6,[x0,#16]

.Lno_data:
ret
.size poly1305_blocks,.-poly1305_blocks
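The .Loop body (and poly1305_mult below, which repeats it as a subroutine)
tracks h = (h + m) * r mod p with p = 2^130 - 5. A sketch of why the h1*5*r1
term and the final reduction are exact, in my notation:

    $2^{130} \equiv 5 \pmod{p}$, and since the clamp gives $4 \mid r_1$:
    $h_1 r_1 2^{128} = h_1 (r_1/4) 2^{130} \equiv 5 h_1 (r_1/4) = h_1 (r_1 + (r_1 \gg 2)) = h_1 s_1 \pmod{p}$

Likewise the and #-4 / lsr #2 / add pair replaces the bits of h2 above the low
two by $4c + c = 5c$ with $c = h_2 \gg 2$, folding them back into h0.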

.type poly1305_emit,%function
.align 5
poly1305_emit:
ldp x4,x5,[x0] // load hash base 2^64
ldr x6,[x0,#16]
ldp x10,x11,[x2] // load nonce

adds x12,x4,#5 // compare to modulus
adcs x13,x5,xzr
adc x14,x6,xzr

tst x14,#-4 // see if it's carried/borrowed

csel x4,x4,x12,eq
csel x5,x5,x13,eq

#ifdef __ARMEB__
ror x10,x10,#32 // flip nonce words
ror x11,x11,#32
#endif
adds x4,x4,x10 // accumulate nonce
adc x5,x5,x11
#ifdef __ARMEB__
rev x4,x4 // flip output bytes
rev x5,x5
#endif
stp x4,x5,[x1] // write result

ret
.size poly1305_emit,.-poly1305_emit
.type poly1305_mult,%function
.align 5
poly1305_mult:
mul x12,x4,x7 // h0*r0
umulh x13,x4,x7

mul x10,x5,x9 // h1*5*r1
umulh x11,x5,x9

adds x12,x12,x10
mul x10,x4,x8 // h0*r1
adc x13,x13,x11
umulh x14,x4,x8

adds x13,x13,x10
mul x10,x5,x7 // h1*r0
adc x14,x14,xzr
umulh x11,x5,x7

adds x13,x13,x10
mul x10,x6,x9 // h2*5*r1
adc x14,x14,x11
mul x11,x6,x7 // h2*r0

adds x13,x13,x10
adc x14,x14,x11

and x10,x14,#-4 // final reduction
and x6,x14,#3
add x10,x10,x14,lsr#2
adds x4,x12,x10
adcs x5,x13,xzr
adc x6,x6,xzr

ret
.size poly1305_mult,.-poly1305_mult

.type poly1305_splat,%function
.align 5
poly1305_splat:
and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x13,x4,#26,#26
extr x14,x5,x4,#52
and x14,x14,#0x03ffffff
ubfx x15,x5,#14,#26
extr x16,x6,x5,#40

str w12,[x0,#16*0] // r0
add w12,w13,w13,lsl#2 // r1*5
str w13,[x0,#16*1] // r1
add w13,w14,w14,lsl#2 // r2*5
str w12,[x0,#16*2] // s1
str w14,[x0,#16*3] // r2
add w14,w15,w15,lsl#2 // r3*5
str w13,[x0,#16*4] // s2
str w15,[x0,#16*5] // r3
add w15,w16,w16,lsl#2 // r4*5
str w14,[x0,#16*6] // s3
str w16,[x0,#16*7] // r4
str w15,[x0,#16*8] // s4

ret
.size poly1305_splat,.-poly1305_splat
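poly1305_splat repacks the 64-bit state into the five 26-bit limbs used by the
NEON path, storing 5*r_j next to each r_j for the $2^{130} \equiv 5$ folds.
The extracted bit fields correspond to (a sketch of the split):

    $h = h_0 + 2^{26} h_1 + 2^{52} h_2 + 2^{78} h_3 + 2^{104} h_4, \qquad 0 \le h_j < 2^{26}$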

.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
ldr x17,[x0,#24]
cmp x2,#128
b.hs .Lblocks_neon
cbz x17,poly1305_blocks

.Lblocks_neon:
stp x29,x30,[sp,#-80]!
add x29,sp,#0

ands x2,x2,#-16
b.eq .Lno_data_neon

cbz x17,.Lbase2_64_neon

ldp w10,w11,[x0] // load hash value base 2^26
ldp w12,w13,[x0,#8]
ldr w14,[x0,#16]

tst x2,#31
b.eq .Leven_neon

ldp x7,x8,[x0,#32] // load key value

add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
lsr x5,x12,#12
adds x4,x4,x12,lsl#52
add x5,x5,x13,lsl#14
adc x5,x5,xzr
lsr x6,x14,#24
adds x5,x5,x14,lsl#40
adc x14,x6,xzr // can be partially reduced...

ldp x12,x13,[x1],#16 // load input
sub x2,x2,#16
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)

and x10,x14,#-4 // ... so reduce
and x6,x14,#3
add x10,x10,x14,lsr#2
adds x4,x4,x10
adcs x5,x5,xzr
adc x6,x6,xzr

#ifdef __ARMEB__
rev x12,x12
rev x13,x13
#endif
adds x4,x4,x12 // accumulate input
adcs x5,x5,x13
adc x6,x6,x3

bl poly1305_mult
ldr x30,[sp,#8]

cbz x3,.Lstore_base2_64_neon

and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,x4,#26,#26
extr x12,x5,x4,#52
and x12,x12,#0x03ffffff
ubfx x13,x5,#14,#26
extr x14,x6,x5,#40

cbnz x2,.Leven_neon

stp w10,w11,[x0] // store hash value base 2^26
stp w12,w13,[x0,#8]
str w14,[x0,#16]
b .Lno_data_neon

.align 4
.Lstore_base2_64_neon:
stp x4,x5,[x0] // store hash value base 2^64
stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
b .Lno_data_neon

.align 4
.Lbase2_64_neon:
ldp x7,x8,[x0,#32] // load key value

ldp x4,x5,[x0] // load hash value base 2^64
ldr x6,[x0,#16]

tst x2,#31
b.eq .Linit_neon

ldp x12,x13,[x1],#16 // load input
sub x2,x2,#16
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
#ifdef __ARMEB__
rev x12,x12
rev x13,x13
#endif
adds x4,x4,x12 // accumulate input
adcs x5,x5,x13
adc x6,x6,x3

bl poly1305_mult

.Linit_neon:
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,x4,#26,#26
extr x12,x5,x4,#52
and x12,x12,#0x03ffffff
ubfx x13,x5,#14,#26
extr x14,x6,x5,#40

stp d8,d9,[sp,#16] // meet ABI requirements
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]

fmov d24,x10
fmov d25,x11
fmov d26,x12
fmov d27,x13
fmov d28,x14

////////////////////////////////// initialize r^n table
mov x4,x7 // r^1
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
mov x5,x8
mov x6,xzr
add x0,x0,#48+12
bl poly1305_splat

bl poly1305_mult // r^2
sub x0,x0,#4
bl poly1305_splat

bl poly1305_mult // r^3
sub x0,x0,#4
bl poly1305_splat

bl poly1305_mult // r^4
sub x0,x0,#4
bl poly1305_splat
ldr x30,[sp,#8]

add x16,x1,#32
adr x17,.Lzeros
subs x2,x2,#64
csel x16,x17,x16,lo

mov x4,#1
str x4,[x0,#-24] // set is_base2_26
sub x0,x0,#48 // restore original x0
b .Ldo_neon

.align 4
.Leven_neon:
add x16,x1,#32
adr x17,.Lzeros
subs x2,x2,#64
csel x16,x17,x16,lo

stp d8,d9,[sp,#16] // meet ABI requirements
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]

fmov d24,x10
fmov d25,x11
fmov d26,x12
fmov d27,x13
fmov d28,x14

.Ldo_neon:
ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
ldp x9,x13,[x16],#48

lsl x3,x3,#24
add x15,x0,#48

#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov d14,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,x3,x12,lsr#40
add x13,x3,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov d15,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
fmov d16,x8
fmov d17,x10
fmov d18,x12

ldp x8,x12,[x1],#16 // inp[0:1]
ldp x9,x13,[x1],#48

ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
ld1 {v8.4s},[x15]

#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov d9,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,x3,x12,lsr#40
add x13,x3,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov d10,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
movi v31.2d,#-1
fmov d11,x8
fmov d12,x10
fmov d13,x12
ushr v31.2d,v31.2d,#38

b.ls .Lskip_loop

.align 4
.Loop_neon:
////////////////////////////////////////////////////////////////
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
// ___________________/
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
// ___________________/ ____________________/
//
// Note that we start with inp[2:3]*r^2. This is because it
// doesn't depend on reduction in previous iteration.
////////////////////////////////////////////////////////////////
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

subs x2,x2,#64
umull v23.2d,v14.2s,v7.s[2]
csel x16,x17,x16,lo
umull v22.2d,v14.2s,v5.s[2]
umull v21.2d,v14.2s,v3.s[2]
ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
umull v20.2d,v14.2s,v1.s[2]
ldp x9,x13,[x16],#48
umull v19.2d,v14.2s,v0.s[2]
#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif

umlal v23.2d,v15.2s,v5.s[2]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal v22.2d,v15.2s,v3.s[2]
and x5,x9,#0x03ffffff
umlal v21.2d,v15.2s,v1.s[2]
ubfx x6,x8,#26,#26
umlal v20.2d,v15.2s,v0.s[2]
ubfx x7,x9,#26,#26
umlal v19.2d,v15.2s,v8.s[2]
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32

umlal v23.2d,v16.2s,v3.s[2]
extr x8,x12,x8,#52
umlal v22.2d,v16.2s,v1.s[2]
extr x9,x13,x9,#52
umlal v21.2d,v16.2s,v0.s[2]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal v20.2d,v16.2s,v8.s[2]
fmov d14,x4
umlal v19.2d,v16.2s,v6.s[2]
and x8,x8,#0x03ffffff

umlal v23.2d,v17.2s,v1.s[2]
and x9,x9,#0x03ffffff
umlal v22.2d,v17.2s,v0.s[2]
ubfx x10,x12,#14,#26
umlal v21.2d,v17.2s,v8.s[2]
ubfx x11,x13,#14,#26
umlal v20.2d,v17.2s,v6.s[2]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal v19.2d,v17.2s,v4.s[2]
fmov d15,x6

add v11.2s,v11.2s,v26.2s
add x12,x3,x12,lsr#40
umlal v23.2d,v18.2s,v0.s[2]
add x13,x3,x13,lsr#40
umlal v22.2d,v18.2s,v8.s[2]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal v21.2d,v18.2s,v6.s[2]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal v20.2d,v18.2s,v4.s[2]
fmov d16,x8
umlal v19.2d,v18.2s,v2.s[2]
fmov d17,x10

////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4 and accumulate

add v9.2s,v9.2s,v24.2s
fmov d18,x12
umlal v22.2d,v11.2s,v1.s[0]
ldp x8,x12,[x1],#16 // inp[0:1]
umlal v19.2d,v11.2s,v6.s[0]
ldp x9,x13,[x1],#48
umlal v23.2d,v11.2s,v3.s[0]
umlal v20.2d,v11.2s,v8.s[0]
umlal v21.2d,v11.2s,v0.s[0]
#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif

add v10.2s,v10.2s,v25.2s
umlal v22.2d,v9.2s,v5.s[0]
umlal v23.2d,v9.2s,v7.s[0]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal v21.2d,v9.2s,v3.s[0]
and x5,x9,#0x03ffffff
umlal v19.2d,v9.2s,v0.s[0]
ubfx x6,x8,#26,#26
umlal v20.2d,v9.2s,v1.s[0]
ubfx x7,x9,#26,#26

add v12.2s,v12.2s,v27.2s
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
umlal v22.2d,v10.2s,v3.s[0]
extr x8,x12,x8,#52
umlal v23.2d,v10.2s,v5.s[0]
extr x9,x13,x9,#52
umlal v19.2d,v10.2s,v8.s[0]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal v21.2d,v10.2s,v1.s[0]
fmov d9,x4
umlal v20.2d,v10.2s,v0.s[0]
and x8,x8,#0x03ffffff

add v13.2s,v13.2s,v28.2s
and x9,x9,#0x03ffffff
umlal v22.2d,v12.2s,v0.s[0]
ubfx x10,x12,#14,#26
umlal v19.2d,v12.2s,v4.s[0]
ubfx x11,x13,#14,#26
umlal v23.2d,v12.2s,v1.s[0]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal v20.2d,v12.2s,v6.s[0]
fmov d10,x6
umlal v21.2d,v12.2s,v8.s[0]
add x12,x3,x12,lsr#40

umlal v22.2d,v13.2s,v8.s[0]
add x13,x3,x13,lsr#40
umlal v19.2d,v13.2s,v2.s[0]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal v23.2d,v13.2s,v0.s[0]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal v20.2d,v13.2s,v4.s[0]
fmov d11,x8
umlal v21.2d,v13.2s,v6.s[0]
fmov d12,x10
fmov d13,x12

/////////////////////////////////////////////////////////////////
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
// and P. Schwabe
//
// [see discussion in poly1305-armv4 module]
|
||||
|
||||
ushr v29.2d,v22.2d,#26
|
||||
xtn v27.2s,v22.2d
|
||||
ushr v30.2d,v19.2d,#26
|
||||
and v19.16b,v19.16b,v31.16b
|
||||
add v23.2d,v23.2d,v29.2d // h3 -> h4
|
||||
bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
|
||||
add v20.2d,v20.2d,v30.2d // h0 -> h1
|
||||
|
||||
ushr v29.2d,v23.2d,#26
|
||||
xtn v28.2s,v23.2d
|
||||
ushr v30.2d,v20.2d,#26
|
||||
xtn v25.2s,v20.2d
|
||||
bic v28.2s,#0xfc,lsl#24
|
||||
add v21.2d,v21.2d,v30.2d // h1 -> h2
|
||||
|
||||
add v19.2d,v19.2d,v29.2d
|
||||
shl v29.2d,v29.2d,#2
|
||||
shrn v30.2s,v21.2d,#26
|
||||
xtn v26.2s,v21.2d
|
||||
add v19.2d,v19.2d,v29.2d // h4 -> h0
|
||||
bic v25.2s,#0xfc,lsl#24
|
||||
add v27.2s,v27.2s,v30.2s // h2 -> h3
|
||||
bic v26.2s,#0xfc,lsl#24
|
||||
|
||||
shrn v29.2s,v19.2d,#26
|
||||
xtn v24.2s,v19.2d
|
||||
ushr v30.2s,v27.2s,#26
|
||||
bic v27.2s,#0xfc,lsl#24
|
||||
bic v24.2s,#0xfc,lsl#24
|
||||
add v25.2s,v25.2s,v29.2s // h0 -> h1
|
||||
add v28.2s,v28.2s,v30.2s // h3 -> h4
|
||||
|
||||
b.hi .Loop_neon
|
||||
|
||||
.Lskip_loop:
|
||||
dup v16.2d,v16.d[0]
|
||||
add v11.2s,v11.2s,v26.2s
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
|
||||
|
||||
adds x2,x2,#32
|
||||
b.ne .Long_tail
|
||||
|
||||
dup v16.2d,v11.d[0]
|
||||
add v14.2s,v9.2s,v24.2s
|
||||
add v17.2s,v12.2s,v27.2s
|
||||
add v15.2s,v10.2s,v25.2s
|
||||
add v18.2s,v13.2s,v28.2s
|
||||
|
||||
.Long_tail:
|
||||
dup v14.2d,v14.d[0]
|
||||
umull2 v19.2d,v16.4s,v6.4s
|
||||
umull2 v22.2d,v16.4s,v1.4s
|
||||
umull2 v23.2d,v16.4s,v3.4s
|
||||
umull2 v21.2d,v16.4s,v0.4s
|
||||
umull2 v20.2d,v16.4s,v8.4s
|
||||
|
||||
dup v15.2d,v15.d[0]
|
||||
umlal2 v19.2d,v14.4s,v0.4s
|
||||
umlal2 v21.2d,v14.4s,v3.4s
|
||||
umlal2 v22.2d,v14.4s,v5.4s
|
||||
umlal2 v23.2d,v14.4s,v7.4s
|
||||
umlal2 v20.2d,v14.4s,v1.4s
|
||||
|
||||
dup v17.2d,v17.d[0]
|
||||
umlal2 v19.2d,v15.4s,v8.4s
|
||||
umlal2 v22.2d,v15.4s,v3.4s
|
||||
umlal2 v21.2d,v15.4s,v1.4s
|
||||
umlal2 v23.2d,v15.4s,v5.4s
|
||||
umlal2 v20.2d,v15.4s,v0.4s
|
||||
|
||||
dup v18.2d,v18.d[0]
|
||||
umlal2 v22.2d,v17.4s,v0.4s
|
||||
umlal2 v23.2d,v17.4s,v1.4s
|
||||
umlal2 v19.2d,v17.4s,v4.4s
|
||||
umlal2 v20.2d,v17.4s,v6.4s
|
||||
umlal2 v21.2d,v17.4s,v8.4s
|
||||
|
||||
umlal2 v22.2d,v18.4s,v8.4s
|
||||
umlal2 v19.2d,v18.4s,v2.4s
|
||||
umlal2 v23.2d,v18.4s,v0.4s
|
||||
umlal2 v20.2d,v18.4s,v4.4s
|
||||
umlal2 v21.2d,v18.4s,v6.4s
|
||||
|
||||
b.eq .Lshort_tail
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// (hash+inp[0:1])*r^4:r^3 and accumulate
|
||||
|
||||
add v9.2s,v9.2s,v24.2s
|
||||
umlal v22.2d,v11.2s,v1.2s
|
||||
umlal v19.2d,v11.2s,v6.2s
|
||||
umlal v23.2d,v11.2s,v3.2s
|
||||
umlal v20.2d,v11.2s,v8.2s
|
||||
umlal v21.2d,v11.2s,v0.2s
|
||||
|
||||
add v10.2s,v10.2s,v25.2s
|
||||
umlal v22.2d,v9.2s,v5.2s
|
||||
umlal v19.2d,v9.2s,v0.2s
|
||||
umlal v23.2d,v9.2s,v7.2s
|
||||
umlal v20.2d,v9.2s,v1.2s
|
||||
umlal v21.2d,v9.2s,v3.2s
|
||||
|
||||
add v12.2s,v12.2s,v27.2s
|
||||
umlal v22.2d,v10.2s,v3.2s
|
||||
umlal v19.2d,v10.2s,v8.2s
|
||||
umlal v23.2d,v10.2s,v5.2s
|
||||
umlal v20.2d,v10.2s,v0.2s
|
||||
umlal v21.2d,v10.2s,v1.2s
|
||||
|
||||
add v13.2s,v13.2s,v28.2s
|
||||
umlal v22.2d,v12.2s,v0.2s
|
||||
umlal v19.2d,v12.2s,v4.2s
|
||||
umlal v23.2d,v12.2s,v1.2s
|
||||
umlal v20.2d,v12.2s,v6.2s
|
||||
umlal v21.2d,v12.2s,v8.2s
|
||||
|
||||
umlal v22.2d,v13.2s,v8.2s
|
||||
umlal v19.2d,v13.2s,v2.2s
|
||||
umlal v23.2d,v13.2s,v0.2s
|
||||
umlal v20.2d,v13.2s,v4.2s
|
||||
umlal v21.2d,v13.2s,v6.2s
|
||||
|
||||
.Lshort_tail:
|
||||
////////////////////////////////////////////////////////////////
|
||||
// horizontal add
|
||||
|
||||
addp v22.2d,v22.2d,v22.2d
|
||||
ldp d8,d9,[sp,#16] // meet ABI requirements
|
||||
addp v19.2d,v19.2d,v19.2d
|
||||
ldp d10,d11,[sp,#32]
|
||||
addp v23.2d,v23.2d,v23.2d
|
||||
ldp d12,d13,[sp,#48]
|
||||
addp v20.2d,v20.2d,v20.2d
|
||||
ldp d14,d15,[sp,#64]
|
||||
addp v21.2d,v21.2d,v21.2d
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// lazy reduction, but without narrowing
|
||||
|
||||
ushr v29.2d,v22.2d,#26
|
||||
and v22.16b,v22.16b,v31.16b
|
||||
ushr v30.2d,v19.2d,#26
|
||||
and v19.16b,v19.16b,v31.16b
|
||||
|
||||
add v23.2d,v23.2d,v29.2d // h3 -> h4
|
||||
add v20.2d,v20.2d,v30.2d // h0 -> h1
|
||||
|
||||
ushr v29.2d,v23.2d,#26
|
||||
and v23.16b,v23.16b,v31.16b
|
||||
ushr v30.2d,v20.2d,#26
|
||||
and v20.16b,v20.16b,v31.16b
|
||||
add v21.2d,v21.2d,v30.2d // h1 -> h2
|
||||
|
||||
add v19.2d,v19.2d,v29.2d
|
||||
shl v29.2d,v29.2d,#2
|
||||
ushr v30.2d,v21.2d,#26
|
||||
and v21.16b,v21.16b,v31.16b
|
||||
add v19.2d,v19.2d,v29.2d // h4 -> h0
|
||||
add v22.2d,v22.2d,v30.2d // h2 -> h3
|
||||
|
||||
ushr v29.2d,v19.2d,#26
|
||||
and v19.16b,v19.16b,v31.16b
|
||||
ushr v30.2d,v22.2d,#26
|
||||
and v22.16b,v22.16b,v31.16b
|
||||
add v20.2d,v20.2d,v29.2d // h0 -> h1
|
||||
add v23.2d,v23.2d,v30.2d // h3 -> h4
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// write the result, can be partially reduced
|
||||
|
||||
st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
|
||||
st1 {v23.s}[0],[x0]
|
||||
|
||||
.Lno_data_neon:
|
||||
ldr x29,[sp],#80
|
||||
ret
|
||||
.size poly1305_blocks_neon,.-poly1305_blocks_neon
|
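For readers following the reduction comments ("h3 -> h4", "h4 -> h0" and so on), here is a scalar C sketch of the same lazy carry schedule, under the usual poly1305 identity 2^130 = 5 (mod 2^130 - 5); the five roughly 52-bit products d0..d4 are only partially carried, which is exactly why the stored hash "can be partially reduced":

#include <stdint.h>

/* One pass of the lazy reduction above: each carry is propagated once,
 * and the carry out of h4 re-enters h0 multiplied by 5 (c + 4c below,
 * matching the shl #2 plus add in the vector code). */
static void poly1305_lazy_reduce(uint64_t d[5])
{
    uint64_t c;
    c = d[3] >> 26; d[3] &= 0x03ffffff; d[4] += c;   /* h3 -> h4 */
    c = d[0] >> 26; d[0] &= 0x03ffffff; d[1] += c;   /* h0 -> h1 */
    c = d[4] >> 26; d[4] &= 0x03ffffff;
    d[0] += c + (c << 2);                            /* h4 -> h0, times 5 */
    c = d[1] >> 26; d[1] &= 0x03ffffff; d[2] += c;   /* h1 -> h2 */
    c = d[2] >> 26; d[2] &= 0x03ffffff; d[3] += c;   /* h2 -> h3 */
    c = d[0] >> 26; d[0] &= 0x03ffffff; d[1] += c;   /* h0 -> h1 */
    c = d[3] >> 26; d[3] &= 0x03ffffff; d[4] += c;   /* h3 -> h4 */
}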

.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
ldr x17,[x0,#24]
cbz x17,poly1305_emit

ldp w10,w11,[x0] // load hash value base 2^26
ldp w12,w13,[x0,#8]
ldr w14,[x0,#16]

add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
lsr x5,x12,#12
adds x4,x4,x12,lsl#52
add x5,x5,x13,lsl#14
adc x5,x5,xzr
lsr x6,x14,#24
adds x5,x5,x14,lsl#40
adc x6,x6,xzr // can be partially reduced...

ldp x10,x11,[x2] // load nonce

and x12,x6,#-4 // ... so reduce
add x12,x12,x6,lsr#2
and x6,x6,#3
adds x4,x4,x12
adcs x5,x5,xzr
adc x6,x6,xzr

adds x12,x4,#5 // compare to modulus
adcs x13,x5,xzr
adc x14,x6,xzr

tst x14,#-4 // see if it's carried/borrowed

csel x4,x4,x12,eq
csel x5,x5,x13,eq

#ifdef __ARMEB__
ror x10,x10,#32 // flip nonce words
ror x11,x11,#32
#endif
adds x4,x4,x10 // accumulate nonce
adc x5,x5,x11
#ifdef __ARMEB__
rev x4,x4 // flip output bytes
rev x5,x5
#endif
stp x4,x5,[x1] // write result

ret
.size poly1305_emit_neon,.-poly1305_emit_neon
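The tail of poly1305_emit_neon above is the standard freeze-and-finalize step: fold the part of h above bit 130 back in as a multiple of 5, trial-add 5 to compare h against p = 2^130 - 5, select the reduced value, then add the nonce. A C sketch of that arithmetic (illustrative only; a little-endian host is assumed for the final store, which is what the #ifdef __ARMEB__ byte swaps above compensate for):

#include <stdint.h>
#include <string.h>

static void poly1305_emit_tail(uint64_t h0, uint64_t h1, uint64_t h2,
                               const uint64_t nonce[2], uint8_t mac[16])
{
    uint64_t c, g0, g1, g2, b;

    /* fold 5 * (h >> 130) back in: (h2 & ~3) + (h2 >> 2) */
    c = (h2 & ~(uint64_t)3) + (h2 >> 2);
    h2 &= 3;
    h0 += c;  b = (h0 < c);
    h1 += b;  h2 += (h1 < b);

    /* compare to modulus: compute h + 5 and test bits above bit 129 */
    g0 = h0 + 5;  b = (g0 < 5);
    g1 = h1 + b;  g2 = h2 + (g1 < b);
    if (g2 & ~(uint64_t)3) {   /* tst x14,#-4: h >= p, keep h - p */
        h0 = g0;
        h1 = g1;
    }

    /* accumulate nonce and emit the low 128 bits */
    h0 += nonce[0];
    h1 += nonce[1] + (h0 < nonce[0]);
    memcpy(mac, &h0, 8);       /* little-endian host assumed */
    memcpy(mac + 8, &h1, 8);
}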

.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long OPENSSL_armcap_P-.
#else
.quad OPENSSL_armcap_P-.
#endif
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -1,13 +1,82 @@

/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from sha512-armv8.pl. */
#include "arm_arch.h"
// Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html

// ====================================================================
// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// project. The module is, however, dual licensed under OpenSSL and
// CRYPTOGAMS licenses depending on where you obtain it. For further
// details see http://www.openssl.org/~appro/cryptogams/.
//
// Permission to use under GPLv2 terms is granted.
// ====================================================================
//
// SHA256/512 for ARMv8.
//
// Performance in cycles per processed byte and improvement coefficient
// over code generated with "default" compiler:
//
// SHA256-hw SHA256(*) SHA512
// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
// Denver 2.01 10.5 (+26%) 6.70 (+8%)
// X-Gene 20.0 (+100%) 12.8 (+300%(***))
// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
//
// (*) Software SHA256 results are of lesser relevance, presented
// mostly for informational purposes.
// (**) The result is a trade-off: it's possible to improve it by
// 10% (or by 1 cycle per round), but at the cost of 20% loss
// on Cortex-A53 (or by 4 cycles per round).
// (***) Super-impressive coefficients over gcc-generated code are
// indication of some compiler "pathology", most notably code
// generated with -mgeneral-regs-only is significantly faster
// and the gap is only 40-90%.
//
// October 2016.
//
// Originally it was reckoned that it makes no sense to implement NEON
// version of SHA256 for 64-bit processors. This is because performance
// improvement on most wide-spread Cortex-A5x processors was observed
// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
// observed that 32-bit NEON SHA256 performs significantly better than
// 64-bit scalar version on *some* of the more recent processors. As
// result 64-bit NEON version of SHA256 was added to provide best
// all-round performance. For example it executes ~30% faster on X-Gene
// and Mongoose. [For reference, NEON version of SHA512 is bound to
// deliver much less improvement, likely *negative* on Cortex-A5x.
// Which is why NEON support is limited to SHA256.]

#ifndef __KERNEL__
# include "arm_arch.h"
#endif

.text


.globl sha512_block_data_order
.type sha512_block_data_order,%function
.align 6
sha512_block_data_order:
#ifndef __KERNEL__
# ifdef __ILP32__
ldrsw x16,.LOPENSSL_armcap_P
# else
ldr x16,.LOPENSSL_armcap_P
# endif
adr x17,.LOPENSSL_armcap_P
add x16,x16,x17
ldr w16,[x16]
tst w16,#ARMV8_SHA512
b.ne .Lv8_entry
#endif
stp x29,x30,[sp,#-128]!
add x29,sp,#0

@ -23,7 +92,7 @@ sha512_block_data_order:
ldp x24,x25,[x0,#4*8]
add x2,x1,x2,lsl#7 // end of input
ldp x26,x27,[x0,#6*8]
adr x30,K512
adr x30,.LK512
stp x0,x2,[x29,#96]

.Loop:
@ -31,7 +100,7 @@ sha512_block_data_order:
ldr x19,[x30],#8 // *K++
eor x28,x21,x22 // magic seed
str x1,[x29,#112]
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x3,x3 // 0
#endif
ror x16,x24,#14
@ -54,7 +123,7 @@ sha512_block_data_order:
add x27,x27,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x27,x27,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x4,x4 // 1
#endif
ldp x5,x6,[x1],#2*8
@ -79,7 +148,7 @@ sha512_block_data_order:
add x26,x26,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x26,x26,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x5,x5 // 2
#endif
add x26,x26,x17 // h+=Sigma0(a)
@ -103,7 +172,7 @@ sha512_block_data_order:
add x25,x25,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x25,x25,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x6,x6 // 3
#endif
ldp x7,x8,[x1],#2*8
@ -128,7 +197,7 @@ sha512_block_data_order:
add x24,x24,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x24,x24,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x7,x7 // 4
#endif
add x24,x24,x17 // h+=Sigma0(a)
@ -152,7 +221,7 @@ sha512_block_data_order:
add x23,x23,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x23,x23,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x8,x8 // 5
#endif
ldp x9,x10,[x1],#2*8
@ -177,7 +246,7 @@ sha512_block_data_order:
add x22,x22,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x22,x22,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x9,x9 // 6
#endif
add x22,x22,x17 // h+=Sigma0(a)
@ -201,7 +270,7 @@ sha512_block_data_order:
add x21,x21,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x21,x21,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x10,x10 // 7
#endif
ldp x11,x12,[x1],#2*8
@ -226,7 +295,7 @@ sha512_block_data_order:
add x20,x20,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x20,x20,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x11,x11 // 8
#endif
add x20,x20,x17 // h+=Sigma0(a)
@ -250,7 +319,7 @@ sha512_block_data_order:
add x27,x27,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x27,x27,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x12,x12 // 9
#endif
ldp x13,x14,[x1],#2*8
@ -275,7 +344,7 @@ sha512_block_data_order:
add x26,x26,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x26,x26,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x13,x13 // 10
#endif
add x26,x26,x17 // h+=Sigma0(a)
@ -299,7 +368,7 @@ sha512_block_data_order:
add x25,x25,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x25,x25,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x14,x14 // 11
#endif
ldp x15,x0,[x1],#2*8
@ -325,7 +394,7 @@ sha512_block_data_order:
add x24,x24,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x24,x24,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x15,x15 // 12
#endif
add x24,x24,x17 // h+=Sigma0(a)
@ -350,7 +419,7 @@ sha512_block_data_order:
add x23,x23,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x23,x23,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x0,x0 // 13
#endif
ldp x1,x2,[x1]
@ -376,7 +445,7 @@ sha512_block_data_order:
add x22,x22,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x22,x22,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x1,x1 // 14
#endif
ldr x6,[sp,#24]
@ -402,7 +471,7 @@ sha512_block_data_order:
add x21,x21,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x21,x21,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x2,x2 // 15
#endif
ldr x7,[sp,#0]
@ -971,53 +1040,581 @@ sha512_block_data_order:
.size sha512_block_data_order,.-sha512_block_data_order
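The preamble of sha512_block_data_order above (ldrsw/ldr of .LOPENSSL_armcap_P plus adr/add) is a position-independent load of the OPENSSL_armcap_P capability word, used to branch to the SHA512-instruction path at .Lv8_entry. In C the dispatch amounts to the sketch below; the ARMV8_SHA512 value is the bit OpenSSL's arm_arch.h assigns, but treat it and the function names here as illustrative placeholders:

#include <stddef.h>

extern unsigned int OPENSSL_armcap_P;   /* filled in at startup */
#define ARMV8_SHA512 (1 << 6)           /* assumed to match arm_arch.h */

void sha512_block_scalar(void *ctx, const void *inp, size_t num); /* placeholder name */
void sha512_block_armv8(void *ctx, const void *inp, size_t num);

/* Equivalent of "tst w16,#ARMV8_SHA512; b.ne .Lv8_entry" above. */
static void sha512_block_dispatch(void *ctx, const void *inp, size_t num)
{
    if (OPENSSL_armcap_P & ARMV8_SHA512)
        sha512_block_armv8(ctx, inp, num);
    else
        sha512_block_scalar(ctx, inp, num);
}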

.align 6
.type K512,%object
K512:
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0 // terminator
.size K512,.-K512
.type .LK512,%object
.LK512:
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0 // terminator
.size .LK512,.-.LK512
#ifndef __KERNEL__
.align 3
.LOPENSSL_armcap_P:
.quad OPENSSL_armcap_P-.
.asciz "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
# ifdef __ILP32__
.long OPENSSL_armcap_P-.
# else
.quad OPENSSL_armcap_P-.
# endif
#endif
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#ifndef __KERNEL__
.type sha512_block_armv8,%function
.align 6
sha512_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0

ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context
adr x3,.LK512

rev64 v16.16b,v16.16b
rev64 v17.16b,v17.16b
rev64 v18.16b,v18.16b
rev64 v19.16b,v19.16b
rev64 v20.16b,v20.16b
rev64 v21.16b,v21.16b
rev64 v22.16b,v22.16b
rev64 v23.16b,v23.16b
b .Loop_hw

.align 4
.Loop_hw:
ld1 {v24.2d},[x3],#16
subs x2,x2,#1
sub x4,x1,#128
orr v26.16b,v0.16b,v0.16b // offload
orr v27.16b,v1.16b,v1.16b
orr v28.16b,v2.16b,v2.16b
orr v29.16b,v3.16b,v3.16b
csel x1,x1,x4,ne // conditional rewind
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v16.2d
ld1 {v16.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
rev64 v16.16b,v16.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
ld1 {v24.2d},[x3],#16
add v25.2d,v25.2d,v17.2d
ld1 {v17.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
rev64 v17.16b,v17.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v18.2d
ld1 {v18.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
rev64 v18.16b,v18.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
ld1 {v24.2d},[x3],#16
add v25.2d,v25.2d,v19.2d
ld1 {v19.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
rev64 v19.16b,v19.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v20.2d
ld1 {v20.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
rev64 v20.16b,v20.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
ld1 {v24.2d},[x3],#16
add v25.2d,v25.2d,v21.2d
ld1 {v21.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
rev64 v21.16b,v21.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v22.2d
ld1 {v22.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
rev64 v22.16b,v22.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
sub x3,x3,#80*8 // rewind
add v25.2d,v25.2d,v23.2d
ld1 {v23.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
rev64 v23.16b,v23.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v0.2d,v0.2d,v26.2d // accumulate
add v1.2d,v1.2d,v27.2d
add v2.2d,v2.2d,v28.2d
add v3.2d,v3.2d,v29.2d

cbnz x2,.Loop_hw

st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context

ldr x29,[sp],#16
ret
.size sha512_block_armv8,.-sha512_block_armv8
#endif
#ifndef __KERNEL__
.comm OPENSSL_armcap_P,4,4
#endif

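As a reference point for the "T1 + H + K512[i]" and "D + T1" comments in the hardware loop above, one scalar SHA-512 round looks like this in C (the standard FIPS 180-4 round, shown only to relate the fused sha512h/sha512h2 flow back to the usual T1 and T2 terms; each sha512h/sha512h2 pair covers two such rounds):

#include <stdint.h>

#define ROR64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))

/* One SHA-512 round: s[] holds the working variables a..h, Kt and Wt
 * the round constant and message-schedule word. */
static void sha512_round(uint64_t s[8], uint64_t Kt, uint64_t Wt)
{
    uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint64_t e = s[4], f = s[5], g = s[6], h = s[7];
    uint64_t S1  = ROR64(e,14) ^ ROR64(e,18) ^ ROR64(e,41);
    uint64_t Ch  = (e & f) ^ (~e & g);
    uint64_t T1  = h + S1 + Ch + Kt + Wt;   /* "T1 + H + K512[i]" */
    uint64_t S0  = ROR64(a,28) ^ ROR64(a,34) ^ ROR64(a,39);
    uint64_t Maj = (a & b) ^ (a & c) ^ (b & c);
    uint64_t T2  = S0 + Maj;
    s[7] = g; s[6] = f; s[5] = e;
    s[4] = d + T1;                          /* "D + T1" */
    s[3] = c; s[2] = b; s[1] = a;
    s[0] = T1 + T2;
}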
secure/lib/libcrypto/aarch64/vpaes-armv8.S (new file, 1180 lines)
File diff suppressed because it is too large