Regen assembly files for arm.

Jung-uk Kim 2018-09-22 02:42:51 +00:00
parent 4b7c498f1f
commit 2c17169a65
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/projects/openssl111/; revision=338877
15 changed files with 13420 additions and 3007 deletions


@@ -149,22 +149,34 @@ ${s}.S: ${s}.s
 .PATH:	${LCRYPTO_SRC}/crypto \
 	${LCRYPTO_SRC}/crypto/aes/asm \
 	${LCRYPTO_SRC}/crypto/bn/asm \
+	${LCRYPTO_SRC}/crypto/chacha/asm \
+	${LCRYPTO_SRC}/crypto/ec/asm \
 	${LCRYPTO_SRC}/crypto/modes/asm \
+	${LCRYPTO_SRC}/crypto/poly1305/asm \
 	${LCRYPTO_SRC}/crypto/sha/asm
 PERLPATH=	-I${LCRYPTO_SRC}/crypto/perlasm
 # aes
-SRCS=	aesv8-armx.pl bsaes-armv7.pl
+SRCS=	aes-armv4.pl aesv8-armx.pl bsaes-armv7.pl
 # bn
 SRCS+=	armv4-mont.pl armv4-gf2m.pl
+# chacha
+SRCS+=	chacha-armv4.pl
+# ec
+SRCS+=	ecp_nistz256-armv4.pl
 # modes
 SRCS+=	ghash-armv4.pl ghashv8-armx.pl
+# poly1305
+SRCS+=	poly1305-armv4.pl
 # sha
-SRCS+=	sha1-armv4-large.pl sha256-armv4.pl sha512-armv4.pl
+SRCS+=	keccak1600-armv4.pl sha1-armv4-large.pl sha256-armv4.pl sha512-armv4.pl
 ASM=	aes-armv4.S ${SRCS:R:S/$/.S/}
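The ASM list in the hunk above is derived from SRCS by the ${SRCS:R:S/$/.S/} suffix substitution, so each perlasm script added to SRCS here corresponds to one of the regenerated .S files in this commit. The BSD make fragment below is only a sketch of the per-script regeneration step implied by the ${s}.S target named in the hunk header; the .for loop shape, the use of .ALLSRC and .TARGET, and the "linux32" flavour argument are assumptions for illustration, not text from this diff.

	# Sketch only: approximate shape of the regeneration rule; the real
	# recipe in the Makefile this hunk patches may differ in detail.
	.for s in ${SRCS:R}
	${s}.S: ${s}.pl
		perl ${PERLPATH} ${.ALLSRC} linux32 ${.TARGET}
	.endfor

Running such a rule for, say, chacha-armv4.pl would yield chacha-armv4.S, one of the newly generated files counted in the additions above.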


@ -1,5 +1,12 @@
/* $FreeBSD$ */ /* $FreeBSD$ */
/* Do not modify. This file is auto-generated from aes-armv4.pl. */ /* Do not modify. This file is auto-generated from aes-armv4.pl. */
@ Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
@
@ Licensed under the OpenSSL license (the "License"). You may not use
@ this file except in compliance with the License. You can obtain a copy
@ in the file LICENSE in the source distribution or at
@ https://www.openssl.org/source/license.html
@ ==================================================================== @ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ -40,15 +47,12 @@
#endif #endif
.text .text
#if __ARM_ARCH__<7 #if defined(__thumb2__) && !defined(__APPLE__)
.code 32
#else
.syntax unified .syntax unified
# ifdef __thumb2__
.thumb .thumb
# else #else
.code 32 .code 32
# endif #undef __thumb2__
#endif #endif
.type AES_Te,%object .type AES_Te,%object
@ -159,19 +163,23 @@ AES_Te:
@ void AES_encrypt(const unsigned char *in, unsigned char *out, @ void AES_encrypt(const unsigned char *in, unsigned char *out,
@ const AES_KEY *key) { @ const AES_KEY *key) {
.global AES_encrypt .globl AES_encrypt
.type AES_encrypt,%function .type AES_encrypt,%function
.align 5 .align 5
AES_encrypt: AES_encrypt:
#if __ARM_ARCH__<7 #ifndef __thumb2__
sub r3,pc,#8 @ AES_encrypt sub r3,pc,#8 @ AES_encrypt
#else #else
adr r3,. adr r3,.
#endif #endif
stmdb sp!,{r1,r4-r12,lr} stmdb sp!,{r1,r4-r12,lr}
#if defined(__thumb2__) || defined(__APPLE__)
adr r10,AES_Te
#else
sub r10,r3,#AES_encrypt-AES_Te @ Te
#endif
mov r12,r0 @ inp mov r12,r0 @ inp
mov r11,r2 mov r11,r2
sub r10,r3,#AES_encrypt-AES_Te @ Te
#if __ARM_ARCH__<7 #if __ARM_ARCH__<7
ldrb r0,[r12,#3] @ load input data in endian-neutral ldrb r0,[r12,#3] @ load input data in endian-neutral
ldrb r4,[r12,#2] @ manner... ldrb r4,[r12,#2] @ manner...
@ -258,20 +266,20 @@ AES_encrypt:
strb r3,[r12,#15] strb r3,[r12,#15]
#endif #endif
#if __ARM_ARCH__>=5 #if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else #else
ldmia sp!,{r4-r12,lr} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-) .word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif #endif
.size AES_encrypt,.-AES_encrypt .size AES_encrypt,.-AES_encrypt
.type _armv4_AES_encrypt,%function .type _armv4_AES_encrypt,%function
.align 2 .align 2
_armv4_AES_encrypt: _armv4_AES_encrypt:
str lr,[sp,#-4]! @ push lr str lr,[sp,#-4]! @ push lr
ldmia r11!,{r4-r7} ldmia r11!,{r4,r5,r6,r7}
eor r0,r0,r4 eor r0,r0,r4
ldr r12,[r11,#240-16] ldr r12,[r11,#240-16]
eor r1,r1,r5 eor r1,r1,r5
@ -404,24 +412,24 @@ _armv4_AES_encrypt:
ldr pc,[sp],#4 @ pop and return ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_encrypt,.-_armv4_AES_encrypt .size _armv4_AES_encrypt,.-_armv4_AES_encrypt
.global private_AES_set_encrypt_key .globl AES_set_encrypt_key
.type private_AES_set_encrypt_key,%function .type AES_set_encrypt_key,%function
.align 5 .align 5
private_AES_set_encrypt_key: AES_set_encrypt_key:
_armv4_AES_set_encrypt_key: _armv4_AES_set_encrypt_key:
#if __ARM_ARCH__<7 #ifndef __thumb2__
sub r3,pc,#8 @ AES_set_encrypt_key sub r3,pc,#8 @ AES_set_encrypt_key
#else #else
adr r3,. adr r3,.
#endif #endif
teq r0,#0 teq r0,#0
#if __ARM_ARCH__>=7 #ifdef __thumb2__
itt eq @ Thumb2 thing, sanity check in ARM itt eq @ Thumb2 thing, sanity check in ARM
#endif #endif
moveq r0,#-1 moveq r0,#-1
beq .Labrt beq .Labrt
teq r2,#0 teq r2,#0
#if __ARM_ARCH__>=7 #ifdef __thumb2__
itt eq @ Thumb2 thing, sanity check in ARM itt eq @ Thumb2 thing, sanity check in ARM
#endif #endif
moveq r0,#-1 moveq r0,#-1
@ -432,19 +440,23 @@ _armv4_AES_set_encrypt_key:
teq r1,#192 teq r1,#192
beq .Lok beq .Lok
teq r1,#256 teq r1,#256
#if __ARM_ARCH__>=7 #ifdef __thumb2__
itt ne @ Thumb2 thing, sanity check in ARM itt ne @ Thumb2 thing, sanity check in ARM
#endif #endif
movne r0,#-1 movne r0,#-1
bne .Labrt bne .Labrt
.Lok: stmdb sp!,{r4-r12,lr} .Lok: stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
sub r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
mov r12,r0 @ inp mov r12,r0 @ inp
mov lr,r1 @ bits mov lr,r1 @ bits
mov r11,r2 @ key mov r11,r2 @ key
#if defined(__thumb2__) || defined(__APPLE__)
adr r10,AES_Te+1024 @ Te4
#else
sub r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
#endif
#if __ARM_ARCH__<7 #if __ARM_ARCH__<7
ldrb r0,[r12,#3] @ load input data in endian-neutral ldrb r0,[r12,#3] @ load input data in endian-neutral
ldrb r4,[r12,#2] @ manner... ldrb r4,[r12,#2] @ manner...
@ -589,7 +601,7 @@ _armv4_AES_set_encrypt_key:
str r2,[r11,#-16] str r2,[r11,#-16]
subs r12,r12,#1 subs r12,r12,#1
str r3,[r11,#-12] str r3,[r11,#-12]
#if __ARM_ARCH__>=7 #ifdef __thumb2__
itt eq @ Thumb2 thing, sanity check in ARM itt eq @ Thumb2 thing, sanity check in ARM
#endif #endif
subeq r2,r11,#216 subeq r2,r11,#216
@ -661,7 +673,7 @@ _armv4_AES_set_encrypt_key:
str r2,[r11,#-24] str r2,[r11,#-24]
subs r12,r12,#1 subs r12,r12,#1
str r3,[r11,#-20] str r3,[r11,#-20]
#if __ARM_ARCH__>=7 #ifdef __thumb2__
itt eq @ Thumb2 thing, sanity check in ARM itt eq @ Thumb2 thing, sanity check in ARM
#endif #endif
subeq r2,r11,#256 subeq r2,r11,#256
@ -695,21 +707,21 @@ _armv4_AES_set_encrypt_key:
.align 2 .align 2
.Ldone: mov r0,#0 .Ldone: mov r0,#0
ldmia sp!,{r4-r12,lr} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
.Labrt: .Labrt:
#if __ARM_ARCH__>=5 #if __ARM_ARCH__>=5
bx lr @ .word 0xe12fff1e bx lr @ .word 0xe12fff1e
#else #else
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-) .word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif #endif
.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key .size AES_set_encrypt_key,.-AES_set_encrypt_key
.global private_AES_set_decrypt_key .globl AES_set_decrypt_key
.type private_AES_set_decrypt_key,%function .type AES_set_decrypt_key,%function
.align 5 .align 5
private_AES_set_decrypt_key: AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr str lr,[sp,#-4]! @ push lr
bl _armv4_AES_set_encrypt_key bl _armv4_AES_set_encrypt_key
teq r0,#0 teq r0,#0
@ -719,20 +731,20 @@ private_AES_set_decrypt_key:
mov r0,r2 @ AES_set_encrypt_key preserves r2, mov r0,r2 @ AES_set_encrypt_key preserves r2,
mov r1,r2 @ which is AES_KEY *key mov r1,r2 @ which is AES_KEY *key
b _armv4_AES_set_enc2dec_key b _armv4_AES_set_enc2dec_key
.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key .size AES_set_decrypt_key,.-AES_set_decrypt_key
@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) @ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out)
.global AES_set_enc2dec_key .globl AES_set_enc2dec_key
.type AES_set_enc2dec_key,%function .type AES_set_enc2dec_key,%function
.align 5 .align 5
AES_set_enc2dec_key: AES_set_enc2dec_key:
_armv4_AES_set_enc2dec_key: _armv4_AES_set_enc2dec_key:
stmdb sp!,{r4-r12,lr} stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
ldr r12,[r0,#240] ldr r12,[r0,#240]
mov r7,r0 @ input mov r7,r0 @ input
add r8,r0,r12,lsl#4 add r8,r0,r12,lsl#4
mov r11,r1 @ ouput mov r11,r1 @ output
add r10,r1,r12,lsl#4 add r10,r1,r12,lsl#4
str r12,[r1,#240] str r12,[r1,#240]
@ -809,12 +821,12 @@ _armv4_AES_set_enc2dec_key:
mov r0,#0 mov r0,#0
#if __ARM_ARCH__>=5 #if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else #else
ldmia sp!,{r4-r12,lr} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-) .word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif #endif
.size AES_set_enc2dec_key,.-AES_set_enc2dec_key .size AES_set_enc2dec_key,.-AES_set_enc2dec_key
@ -922,19 +934,23 @@ AES_Td:
@ void AES_decrypt(const unsigned char *in, unsigned char *out, @ void AES_decrypt(const unsigned char *in, unsigned char *out,
@ const AES_KEY *key) { @ const AES_KEY *key) {
.global AES_decrypt .globl AES_decrypt
.type AES_decrypt,%function .type AES_decrypt,%function
.align 5 .align 5
AES_decrypt: AES_decrypt:
#if __ARM_ARCH__<7 #ifndef __thumb2__
sub r3,pc,#8 @ AES_decrypt sub r3,pc,#8 @ AES_decrypt
#else #else
adr r3,. adr r3,.
#endif #endif
stmdb sp!,{r1,r4-r12,lr} stmdb sp!,{r1,r4-r12,lr}
#if defined(__thumb2__) || defined(__APPLE__)
adr r10,AES_Td
#else
sub r10,r3,#AES_decrypt-AES_Td @ Td
#endif
mov r12,r0 @ inp mov r12,r0 @ inp
mov r11,r2 mov r11,r2
sub r10,r3,#AES_decrypt-AES_Td @ Td
#if __ARM_ARCH__<7 #if __ARM_ARCH__<7
ldrb r0,[r12,#3] @ load input data in endian-neutral ldrb r0,[r12,#3] @ load input data in endian-neutral
ldrb r4,[r12,#2] @ manner... ldrb r4,[r12,#2] @ manner...
@ -1021,20 +1037,20 @@ AES_decrypt:
strb r3,[r12,#15] strb r3,[r12,#15]
#endif #endif
#if __ARM_ARCH__>=5 #if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else #else
ldmia sp!,{r4-r12,lr} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-) .word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif #endif
.size AES_decrypt,.-AES_decrypt .size AES_decrypt,.-AES_decrypt
.type _armv4_AES_decrypt,%function .type _armv4_AES_decrypt,%function
.align 2 .align 2
_armv4_AES_decrypt: _armv4_AES_decrypt:
str lr,[sp,#-4]! @ push lr str lr,[sp,#-4]! @ push lr
ldmia r11!,{r4-r7} ldmia r11!,{r4,r5,r6,r7}
eor r0,r0,r4 eor r0,r0,r4
ldr r12,[r11,#240-16] ldr r12,[r11,#240-16]
eor r1,r1,r5 eor r1,r1,r5
@ -1175,5 +1191,6 @@ _armv4_AES_decrypt:
sub r10,r10,#1024 sub r10,r10,#1024
ldr pc,[sp],#4 @ pop and return ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_decrypt,.-_armv4_AES_decrypt .size _armv4_AES_decrypt,.-_armv4_AES_decrypt
.asciz "AES for ARMv4, CRYPTOGAMS by <appro@openssl.org>" .byte 65,69,83,32,102,111,114,32,65,82,77,118,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2 .align 2


@ -4,11 +4,12 @@
#if __ARM_MAX_ARCH__>=7 #if __ARM_MAX_ARCH__>=7
.text .text
.arch armv7-a .arch armv7-a @ don't confuse not-so-latest binutils with argv8 :-)
.fpu neon .fpu neon
.code 32 .code 32
#undef __thumb2__
.align 5 .align 5
rcon: .Lrcon:
.long 0x01,0x01,0x01,0x01 .long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b .long 0x1b,0x1b,0x1b,0x1b
@ -31,7 +32,7 @@ aes_v8_set_encrypt_key:
tst r1,#0x3f tst r1,#0x3f
bne .Lenc_key_abort bne .Lenc_key_abort
adr r3,rcon adr r3,.Lrcon
cmp r1,#192 cmp r1,#192
veor q0,q0,q0 veor q0,q0,q0
@ -49,14 +50,14 @@ aes_v8_set_encrypt_key:
vtbl.8 d21,{q3},d5 vtbl.8 d21,{q3},d5
vext.8 q9,q0,q3,#12 vext.8 q9,q0,q3,#12
vst1.32 {q3},[r2]! vst1.32 {q3},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
subs r1,r1,#1 subs r1,r1,#1
veor q3,q3,q9 veor q3,q3,q9
vext.8 q9,q0,q9,#12 vext.8 q9,q0,q9,#12
veor q3,q3,q9 veor q3,q3,q9
vext.8 q9,q0,q9,#12 vext.8 q9,q0,q9,#12
veor q10,q10,q1 veor q10,q10,q1
veor q3,q3,q9 veor q3,q3,q9
vshl.u8 q1,q1,#1 vshl.u8 q1,q1,#1
veor q3,q3,q10 veor q3,q3,q10
@ -68,13 +69,13 @@ aes_v8_set_encrypt_key:
vtbl.8 d21,{q3},d5 vtbl.8 d21,{q3},d5
vext.8 q9,q0,q3,#12 vext.8 q9,q0,q3,#12
vst1.32 {q3},[r2]! vst1.32 {q3},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
veor q3,q3,q9 veor q3,q3,q9
vext.8 q9,q0,q9,#12 vext.8 q9,q0,q9,#12
veor q3,q3,q9 veor q3,q3,q9
vext.8 q9,q0,q9,#12 vext.8 q9,q0,q9,#12
veor q10,q10,q1 veor q10,q10,q1
veor q3,q3,q9 veor q3,q3,q9
vshl.u8 q1,q1,#1 vshl.u8 q1,q1,#1
veor q3,q3,q10 veor q3,q3,q10
@ -83,13 +84,13 @@ aes_v8_set_encrypt_key:
vtbl.8 d21,{q3},d5 vtbl.8 d21,{q3},d5
vext.8 q9,q0,q3,#12 vext.8 q9,q0,q3,#12
vst1.32 {q3},[r2]! vst1.32 {q3},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
veor q3,q3,q9 veor q3,q3,q9
vext.8 q9,q0,q9,#12 vext.8 q9,q0,q9,#12
veor q3,q3,q9 veor q3,q3,q9
vext.8 q9,q0,q9,#12 vext.8 q9,q0,q9,#12
veor q10,q10,q1 veor q10,q10,q1
veor q3,q3,q9 veor q3,q3,q9
veor q3,q3,q10 veor q3,q3,q10
vst1.32 {q3},[r2] vst1.32 {q3},[r2]
@ -110,7 +111,7 @@ aes_v8_set_encrypt_key:
vtbl.8 d21,{q8},d5 vtbl.8 d21,{q8},d5
vext.8 q9,q0,q3,#12 vext.8 q9,q0,q3,#12
vst1.32 {d16},[r2]! vst1.32 {d16},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
subs r1,r1,#1 subs r1,r1,#1
veor q3,q3,q9 veor q3,q3,q9
@ -121,7 +122,7 @@ aes_v8_set_encrypt_key:
vdup.32 q9,d7[1] vdup.32 q9,d7[1]
veor q9,q9,q8 veor q9,q9,q8
veor q10,q10,q1 veor q10,q10,q1
vext.8 q8,q0,q8,#12 vext.8 q8,q0,q8,#12
vshl.u8 q1,q1,#1 vshl.u8 q1,q1,#1
veor q8,q8,q9 veor q8,q8,q9
@ -146,14 +147,14 @@ aes_v8_set_encrypt_key:
vtbl.8 d21,{q8},d5 vtbl.8 d21,{q8},d5
vext.8 q9,q0,q3,#12 vext.8 q9,q0,q3,#12
vst1.32 {q8},[r2]! vst1.32 {q8},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
subs r1,r1,#1 subs r1,r1,#1
veor q3,q3,q9 veor q3,q3,q9
vext.8 q9,q0,q9,#12 vext.8 q9,q0,q9,#12
veor q3,q3,q9 veor q3,q3,q9
vext.8 q9,q0,q9,#12 vext.8 q9,q0,q9,#12
veor q10,q10,q1 veor q10,q10,q1
veor q3,q3,q9 veor q3,q3,q9
vshl.u8 q1,q1,#1 vshl.u8 q1,q1,#1
veor q3,q3,q10 veor q3,q3,q10
@ -162,7 +163,7 @@ aes_v8_set_encrypt_key:
vdup.32 q10,d7[1] vdup.32 q10,d7[1]
vext.8 q9,q0,q8,#12 vext.8 q9,q0,q8,#12
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
veor q8,q8,q9 veor q8,q8,q9
vext.8 q9,q0,q9,#12 vext.8 q9,q0,q9,#12
@ -179,7 +180,7 @@ aes_v8_set_encrypt_key:
.Lenc_key_abort: .Lenc_key_abort:
mov r0,r3 @ return value mov r0,r3 @ return value
bx lr bx lr
.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key .size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
@ -205,15 +206,15 @@ aes_v8_set_decrypt_key:
.Loop_imc: .Loop_imc:
vld1.32 {q0},[r2] vld1.32 {q0},[r2]
vld1.32 {q1},[r0] vld1.32 {q1},[r0]
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
vst1.32 {q0},[r0],r4 vst1.32 {q0},[r0],r4
vst1.32 {q1},[r2]! vst1.32 {q1},[r2]!
cmp r0,r2 cmp r0,r2
bhi .Loop_imc bhi .Loop_imc
vld1.32 {q0},[r2] vld1.32 {q0},[r2]
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
vst1.32 {q0},[r0] vst1.32 {q0},[r0]
eor r0,r0,r0 @ return value eor r0,r0,r0 @ return value
@ -231,19 +232,19 @@ aes_v8_encrypt:
vld1.32 {q1},[r2]! vld1.32 {q1},[r2]!
.Loop_enc: .Loop_enc:
.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 .byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
vld1.32 {q0},[r2]! vld1.32 {q0},[r2]!
subs r3,r3,#2 subs r3,r3,#2
.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 .byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
vld1.32 {q1},[r2]! vld1.32 {q1},[r2]!
bgt .Loop_enc bgt .Loop_enc
.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 .byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
vld1.32 {q0},[r2] vld1.32 {q0},[r2]
.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 .byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
veor q2,q2,q0 veor q2,q2,q0
vst1.8 {q2},[r1] vst1.8 {q2},[r1]
@ -260,19 +261,19 @@ aes_v8_decrypt:
vld1.32 {q1},[r2]! vld1.32 {q1},[r2]!
.Loop_dec: .Loop_dec:
.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 .byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
vld1.32 {q0},[r2]! vld1.32 {q0},[r2]!
subs r3,r3,#2 subs r3,r3,#2
.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 .byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
vld1.32 {q1},[r2]! vld1.32 {q1},[r2]!
bgt .Loop_dec bgt .Loop_dec
.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 .byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
vld1.32 {q0},[r2] vld1.32 {q0},[r2]
.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 .byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
veor q2,q2,q0 veor q2,q2,q0
vst1.8 {q2},[r1] vst1.8 {q2},[r1]
@ -283,9 +284,9 @@ aes_v8_decrypt:
.align 5 .align 5
aes_v8_cbc_encrypt: aes_v8_cbc_encrypt:
mov ip,sp mov ip,sp
stmdb sp!,{r4-r8,lr} stmdb sp!,{r4,r5,r6,r7,r8,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load remaining args ldmia ip,{r4,r5} @ load remaining args
subs r2,r2,#16 subs r2,r2,#16
mov r8,#16 mov r8,#16
blo .Lcbc_abort blo .Lcbc_abort
@ -297,13 +298,13 @@ aes_v8_cbc_encrypt:
vld1.8 {q6},[r4] vld1.8 {q6},[r4]
vld1.8 {q0},[r0],r8 vld1.8 {q0},[r0],r8
vld1.32 {q8-q9},[r3] @ load key schedule... vld1.32 {q8,q9},[r3] @ load key schedule...
sub r5,r5,#6 sub r5,r5,#6
add r7,r3,r5,lsl#4 @ pointer to last 7 round keys add r7,r3,r5,lsl#4 @ pointer to last 7 round keys
sub r5,r5,#2 sub r5,r5,#2
vld1.32 {q10-q11},[r7]! vld1.32 {q10,q11},[r7]!
vld1.32 {q12-q13},[r7]! vld1.32 {q12,q13},[r7]!
vld1.32 {q14-q15},[r7]! vld1.32 {q14,q15},[r7]!
vld1.32 {q7},[r7] vld1.32 {q7},[r7]
add r7,r3,#32 add r7,r3,#32
@ -315,62 +316,62 @@ aes_v8_cbc_encrypt:
veor q5,q8,q7 veor q5,q8,q7
beq .Lcbc_enc128 beq .Lcbc_enc128
vld1.32 {q2-q3},[r7] vld1.32 {q2,q3},[r7]
add r7,r3,#16 add r7,r3,#16
add r6,r3,#16*4 add r6,r3,#16*4
add r12,r3,#16*5 add r12,r3,#16*5
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
add r14,r3,#16*6 add r14,r3,#16*6
add r3,r3,#16*7 add r3,r3,#16*7
b .Lenter_cbc_enc b .Lenter_cbc_enc
.align 4 .align 4
.Loop_cbc_enc: .Loop_cbc_enc:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vst1.8 {q6},[r1]! vst1.8 {q6},[r1]!
.Lenter_cbc_enc: .Lenter_cbc_enc:
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 .byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q8},[r6] vld1.32 {q8},[r6]
cmp r5,#4 cmp r5,#4
.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 .byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q9},[r12] vld1.32 {q9},[r12]
beq .Lcbc_enc192 beq .Lcbc_enc192
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q8},[r14] vld1.32 {q8},[r14]
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q9},[r3] vld1.32 {q9},[r3]
nop nop
.Lcbc_enc192: .Lcbc_enc192:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
subs r2,r2,#16 subs r2,r2,#16
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
moveq r8,#0 moveq r8,#0
.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.8 {q8},[r0],r8 vld1.8 {q8},[r0],r8
.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
veor q8,q8,q5 veor q8,q8,q5
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q9},[r7] @ re-pre-load rndkey[1] vld1.32 {q9},[r7] @ re-pre-load rndkey[1]
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
veor q6,q0,q7 veor q6,q0,q7
bhs .Loop_cbc_enc bhs .Loop_cbc_enc
@ -379,36 +380,36 @@ aes_v8_cbc_encrypt:
.align 5 .align 5
.Lcbc_enc128: .Lcbc_enc128:
vld1.32 {q2-q3},[r7] vld1.32 {q2,q3},[r7]
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
b .Lenter_cbc_enc128 b .Lenter_cbc_enc128
.Loop_cbc_enc128: .Loop_cbc_enc128:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vst1.8 {q6},[r1]! vst1.8 {q6},[r1]!
.Lenter_cbc_enc128: .Lenter_cbc_enc128:
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
subs r2,r2,#16 subs r2,r2,#16
.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 .byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
moveq r8,#0 moveq r8,#0
.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 .byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.8 {q8},[r0],r8 vld1.8 {q8},[r0],r8
.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
veor q8,q8,q5 veor q8,q8,q5
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
veor q6,q0,q7 veor q6,q0,q7
bhs .Loop_cbc_enc128 bhs .Loop_cbc_enc128
@ -431,81 +432,81 @@ aes_v8_cbc_encrypt:
vorr q11,q10,q10 vorr q11,q10,q10
.Loop3x_cbc_dec: .Loop3x_cbc_dec:
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q8},[r7]! vld1.32 {q8},[r7]!
subs r6,r6,#2 subs r6,r6,#2
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q9},[r7]! vld1.32 {q9},[r7]!
bgt .Loop3x_cbc_dec bgt .Loop3x_cbc_dec
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q4,q6,q7 veor q4,q6,q7
subs r2,r2,#0x30 subs r2,r2,#0x30
veor q5,q2,q7 veor q5,q2,q7
movlo r6,r2 @ r6, r6, is zero at this point movlo r6,r2 @ r6, r6, is zero at this point
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q9,q3,q7 veor q9,q3,q7
add r0,r0,r6 @ r0 is adjusted in such way that add r0,r0,r6 @ r0 is adjusted in such way that
@ at exit from the loop q1-q10 @ at exit from the loop q1-q10
@ are loaded with last "words" @ are loaded with last "words"
vorr q6,q11,q11 vorr q6,q11,q11
mov r7,r3 mov r7,r3
.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.8 {q2},[r0]! vld1.8 {q2},[r0]!
.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.8 {q3},[r0]! vld1.8 {q3},[r0]!
.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.8 {q11},[r0]! vld1.8 {q11},[r0]!
.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 .byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 .byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
add r6,r5,#2 add r6,r5,#2
veor q4,q4,q0 veor q4,q4,q0
veor q5,q5,q1 veor q5,q5,q1
veor q10,q10,q9 veor q10,q10,q9
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
vst1.8 {q4},[r1]! vst1.8 {q4},[r1]!
vorr q0,q2,q2 vorr q0,q2,q2
vst1.8 {q5},[r1]! vst1.8 {q5},[r1]!
vorr q1,q3,q3 vorr q1,q3,q3
vst1.8 {q10},[r1]! vst1.8 {q10},[r1]!
vorr q10,q11,q11 vorr q10,q11,q11
bhs .Loop3x_cbc_dec bhs .Loop3x_cbc_dec
cmn r2,#0x30 cmn r2,#0x30
@ -513,244 +514,244 @@ aes_v8_cbc_encrypt:
nop nop
.Lcbc_dec_tail: .Lcbc_dec_tail:
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q8},[r7]! vld1.32 {q8},[r7]!
subs r6,r6,#2 subs r6,r6,#2
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q9},[r7]! vld1.32 {q9},[r7]!
bgt .Lcbc_dec_tail bgt .Lcbc_dec_tail
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
cmn r2,#0x20 cmn r2,#0x20
.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q5,q6,q7 veor q5,q6,q7
.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q9,q3,q7 veor q9,q3,q7
.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 .byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 .byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
beq .Lcbc_dec_one beq .Lcbc_dec_one
veor q5,q5,q1 veor q5,q5,q1
veor q9,q9,q10 veor q9,q9,q10
vorr q6,q11,q11 vorr q6,q11,q11
vst1.8 {q5},[r1]! vst1.8 {q5},[r1]!
vst1.8 {q9},[r1]! vst1.8 {q9},[r1]!
b .Lcbc_done b .Lcbc_done
.Lcbc_dec_one: .Lcbc_dec_one:
veor q5,q5,q10 veor q5,q5,q10
vorr q6,q11,q11 vorr q6,q11,q11
vst1.8 {q5},[r1]! vst1.8 {q5},[r1]!
.Lcbc_done: .Lcbc_done:
vst1.8 {q6},[r4] vst1.8 {q6},[r4]
.Lcbc_abort: .Lcbc_abort:
vldmia sp!,{d8-d15} vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
ldmia sp!,{r4-r8,pc} ldmia sp!,{r4,r5,r6,r7,r8,pc}
.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt .size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
.globl aes_v8_ctr32_encrypt_blocks .globl aes_v8_ctr32_encrypt_blocks
.type aes_v8_ctr32_encrypt_blocks,%function .type aes_v8_ctr32_encrypt_blocks,%function
.align 5 .align 5
aes_v8_ctr32_encrypt_blocks: aes_v8_ctr32_encrypt_blocks:
mov ip,sp mov ip,sp
stmdb sp!,{r4-r10,lr} stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
ldr r4, [ip] @ load remaining arg ldr r4, [ip] @ load remaining arg
ldr r5,[r3,#240] ldr r5,[r3,#240]
ldr r8, [r4, #12] ldr r8, [r4, #12]
vld1.32 {q0},[r4] vld1.32 {q0},[r4]
vld1.32 {q8-q9},[r3] @ load key schedule... vld1.32 {q8,q9},[r3] @ load key schedule...
sub r5,r5,#4 sub r5,r5,#4
mov r12,#16 mov r12,#16
cmp r2,#2 cmp r2,#2
add r7,r3,r5,lsl#4 @ pointer to last 5 round keys add r7,r3,r5,lsl#4 @ pointer to last 5 round keys
sub r5,r5,#2 sub r5,r5,#2
vld1.32 {q12-q13},[r7]! vld1.32 {q12,q13},[r7]!
vld1.32 {q14-q15},[r7]! vld1.32 {q14,q15},[r7]!
vld1.32 {q7},[r7] vld1.32 {q7},[r7]
add r7,r3,#32 add r7,r3,#32
mov r6,r5 mov r6,r5
movlo r12,#0 movlo r12,#0
#ifndef __ARMEB__ #ifndef __ARMEB__
rev r8, r8 rev r8, r8
#endif #endif
vorr q1,q0,q0 vorr q1,q0,q0
add r10, r8, #1 add r10, r8, #1
vorr q10,q0,q0 vorr q10,q0,q0
add r8, r8, #2 add r8, r8, #2
vorr q6,q0,q0 vorr q6,q0,q0
rev r10, r10 rev r10, r10
vmov.32 d3[1],r10 vmov.32 d3[1],r10
bls .Lctr32_tail bls .Lctr32_tail
rev r12, r8 rev r12, r8
sub r2,r2,#3 @ bias sub r2,r2,#3 @ bias
vmov.32 d21[1],r12 vmov.32 d21[1],r12
b .Loop3x_ctr32 b .Loop3x_ctr32
.align 4 .align 4
.Loop3x_ctr32: .Loop3x_ctr32:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 .byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.32 {q8},[r7]! vld1.32 {q8},[r7]!
subs r6,r6,#2 subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 .byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.32 {q9},[r7]! vld1.32 {q9},[r7]!
bgt .Loop3x_ctr32 bgt .Loop3x_ctr32
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0 .byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1 .byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1
vld1.8 {q2},[r0]! vld1.8 {q2},[r0]!
vorr q0,q6,q6 vorr q0,q6,q6
.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 .byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.8 {q3},[r0]! vld1.8 {q3},[r0]!
vorr q1,q6,q6 vorr q1,q6,q6
.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9 .byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9 .byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
vld1.8 {q11},[r0]! vld1.8 {q11},[r0]!
mov r7,r3 mov r7,r3
.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10 .byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10
vorr q10,q6,q6 vorr q10,q6,q6
add r9,r8,#1 add r9,r8,#1
.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12 .byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12 .byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
veor q2,q2,q7 veor q2,q2,q7
add r10,r8,#2 add r10,r8,#2
.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12 .byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 .byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
veor q3,q3,q7 veor q3,q3,q7
add r8,r8,#3 add r8,r8,#3
.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13 .byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13 .byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
veor q11,q11,q7 veor q11,q11,q7
rev r9,r9 rev r9,r9
.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13 .byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 .byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d1[1], r9 vmov.32 d1[1], r9
rev r10,r10 rev r10,r10
.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14 .byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14 .byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
vmov.32 d3[1], r10 vmov.32 d3[1], r10
rev r12,r8 rev r12,r8
.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14 .byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 .byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d21[1], r12 vmov.32 d21[1], r12
subs r2,r2,#3 subs r2,r2,#3
.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15 .byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15
.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15 .byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15
.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15 .byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15
veor q2,q2,q4 veor q2,q2,q4
vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
vst1.8 {q2},[r1]! vst1.8 {q2},[r1]!
veor q3,q3,q5 veor q3,q3,q5
mov r6,r5 mov r6,r5
vst1.8 {q3},[r1]! vst1.8 {q3},[r1]!
veor q11,q11,q9 veor q11,q11,q9
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
vst1.8 {q11},[r1]! vst1.8 {q11},[r1]!
bhs .Loop3x_ctr32 bhs .Loop3x_ctr32
adds r2,r2,#3 adds r2,r2,#3
beq .Lctr32_done beq .Lctr32_done
cmp r2,#1 cmp r2,#1
mov r12,#16 mov r12,#16
moveq r12,#0 moveq r12,#0
.Lctr32_tail: .Lctr32_tail:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.32 {q8},[r7]! vld1.32 {q8},[r7]!
subs r6,r6,#2 subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.32 {q9},[r7]! vld1.32 {q9},[r7]!
bgt .Lctr32_tail bgt .Lctr32_tail
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.8 {q2},[r0],r12 vld1.8 {q2},[r0],r12
.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12 .byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.8 {q3},[r0] vld1.8 {q3},[r0]
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13 .byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
veor q2,q2,q7 veor q2,q2,q7
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14 .byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
veor q3,q3,q7 veor q3,q3,q7
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15 .byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15
cmp r2,#1 cmp r2,#1
veor q2,q2,q0 veor q2,q2,q0
veor q3,q3,q1 veor q3,q3,q1
vst1.8 {q2},[r1]! vst1.8 {q2},[r1]!
beq .Lctr32_done beq .Lctr32_done
vst1.8 {q3},[r1] vst1.8 {q3},[r1]
.Lctr32_done: .Lctr32_done:
vldmia sp!,{d8-d15} vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
ldmia sp!,{r4-r10,pc} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks .size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
#endif #endif


@ -3,7 +3,12 @@
#include "arm_arch.h" #include "arm_arch.h"
.text .text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32 .code 32
#endif
.type mul_1x1_ialu,%function .type mul_1x1_ialu,%function
.align 5 .align 5
mul_1x1_ialu: mul_1x1_ialu:
@ -71,11 +76,17 @@ mul_1x1_ialu:
eor r4,r4,r6,lsr#8 eor r4,r4,r6,lsr#8
ldr r6,[sp,r8] @ tab[b >> 30 ] ldr r6,[sp,r8] @ tab[b >> 30 ]
#ifdef __thumb2__
itt ne
#endif
eorne r5,r5,r0,lsl#30 eorne r5,r5,r0,lsl#30
eorne r4,r4,r0,lsr#2 eorne r4,r4,r0,lsr#2
tst r1,#1<<31 tst r1,#1<<31
eor r5,r5,r7,lsl#27 eor r5,r5,r7,lsl#27
eor r4,r4,r7,lsr#5 eor r4,r4,r7,lsr#5
#ifdef __thumb2__
itt ne
#endif
eorne r5,r5,r0,lsl#31 eorne r5,r5,r0,lsl#31
eorne r4,r4,r0,lsr#1 eorne r4,r4,r0,lsr#1
eor r5,r5,r6,lsl#30 eor r5,r5,r6,lsl#30
@ -83,33 +94,46 @@ mul_1x1_ialu:
mov pc,lr mov pc,lr
.size mul_1x1_ialu,.-mul_1x1_ialu .size mul_1x1_ialu,.-mul_1x1_ialu
.global bn_GF2m_mul_2x2 .globl bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,%function .type bn_GF2m_mul_2x2,%function
.align 5 .align 5
bn_GF2m_mul_2x2: bn_GF2m_mul_2x2:
#if __ARM_MAX_ARCH__>=7 #if __ARM_MAX_ARCH__>=7
stmdb sp!,{r10,lr}
ldr r12,.LOPENSSL_armcap ldr r12,.LOPENSSL_armcap
.Lpic: ldr r12,[pc,r12] adr r10,.LOPENSSL_armcap
tst r12,#1 ldr r12,[r12,r10]
bne .LNEON #ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV7_NEON
itt ne
ldrne r10,[sp],#8
bne .LNEON
stmdb sp!,{r4,r5,r6,r7,r8,r9}
#else
stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
#endif #endif
stmdb sp!,{r4-r10,lr}
mov r10,r0 @ reassign 1st argument mov r10,r0 @ reassign 1st argument
mov r0,r3 @ r0=b1 mov r0,r3 @ r0=b1
sub r7,sp,#36
mov r8,sp
and r7,r7,#-32
ldr r3,[sp,#32] @ load b0 ldr r3,[sp,#32] @ load b0
mov r12,#7<<2 mov r12,#7<<2
sub sp,sp,#32 @ allocate tab[8] mov sp,r7 @ allocate tab[8]
str r8,[r7,#32]
bl mul_1x1_ialu @ a1·b1 bl mul_1x1_ialu @ a1·b1
str r5,[r10,#8] str r5,[r10,#8]
str r4,[r10,#12] str r4,[r10,#12]
eor r0,r0,r3 @ flip b0 and b1 eor r0,r0,r3 @ flip b0 and b1
eor r1,r1,r2 @ flip a0 and a1 eor r1,r1,r2 @ flip a0 and a1
eor r3,r3,r0 eor r3,r3,r0
eor r2,r2,r1 eor r2,r2,r1
eor r0,r0,r3 eor r0,r0,r3
eor r1,r1,r2 eor r1,r1,r2
bl mul_1x1_ialu @ a0·b0 bl mul_1x1_ialu @ a0·b0
str r5,[r10] str r5,[r10]
str r4,[r10,#4] str r4,[r10,#4]
@ -117,8 +141,9 @@ bn_GF2m_mul_2x2:
eor r1,r1,r2 eor r1,r1,r2
eor r0,r0,r3 eor r0,r0,r3
bl mul_1x1_ialu @ (a1+a0)·(b1+b0) bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
ldmia r10,{r6-r9} ldmia r10,{r6,r7,r8,r9}
eor r5,r5,r4 eor r5,r5,r4
ldr sp,[sp,#32] @ destroy tab[8]
eor r4,r4,r7 eor r4,r4,r7
eor r5,r5,r6 eor r5,r5,r6
eor r4,r4,r8 eor r4,r4,r8
@ -126,16 +151,15 @@ bn_GF2m_mul_2x2:
eor r4,r4,r9 eor r4,r4,r9
str r4,[r10,#8] str r4,[r10,#8]
eor r5,r5,r4 eor r5,r5,r4
add sp,sp,#32 @ destroy tab[8]
str r5,[r10,#4] str r5,[r10,#4]
#if __ARM_ARCH__>=5 #if __ARM_ARCH__>=5
ldmia sp!,{r4-r10,pc} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
#else #else
ldmia sp!,{r4-r10,lr} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-) .word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif #endif
#if __ARM_MAX_ARCH__>=7 #if __ARM_MAX_ARCH__>=7
.arch armv7-a .arch armv7-a
@ -143,61 +167,62 @@ bn_GF2m_mul_2x2:
.align 5 .align 5
.LNEON: .LNEON:
ldr r12, [sp] @ 5th argument ldr r12, [sp] @ 5th argument
vmov d26, r2, r1 vmov d26, r2, r1
vmov d27, r12, r3 vmov d27, r12, r3
vmov.i64 d28, #0x0000ffffffffffff vmov.i64 d28, #0x0000ffffffffffff
vmov.i64 d29, #0x00000000ffffffff vmov.i64 d29, #0x00000000ffffffff
vmov.i64 d30, #0x000000000000ffff vmov.i64 d30, #0x000000000000ffff
vext.8 d2, d26, d26, #1 @ A1 vext.8 d2, d26, d26, #1 @ A1
vmull.p8 q1, d2, d27 @ F = A1*B vmull.p8 q1, d2, d27 @ F = A1*B
vext.8 d0, d27, d27, #1 @ B1 vext.8 d0, d27, d27, #1 @ B1
vmull.p8 q0, d26, d0 @ E = A*B1 vmull.p8 q0, d26, d0 @ E = A*B1
vext.8 d4, d26, d26, #2 @ A2 vext.8 d4, d26, d26, #2 @ A2
vmull.p8 q2, d4, d27 @ H = A2*B vmull.p8 q2, d4, d27 @ H = A2*B
vext.8 d16, d27, d27, #2 @ B2 vext.8 d16, d27, d27, #2 @ B2
vmull.p8 q8, d26, d16 @ G = A*B2 vmull.p8 q8, d26, d16 @ G = A*B2
vext.8 d6, d26, d26, #3 @ A3 vext.8 d6, d26, d26, #3 @ A3
veor q1, q1, q0 @ L = E + F veor q1, q1, q0 @ L = E + F
vmull.p8 q3, d6, d27 @ J = A3*B vmull.p8 q3, d6, d27 @ J = A3*B
vext.8 d0, d27, d27, #3 @ B3 vext.8 d0, d27, d27, #3 @ B3
veor q2, q2, q8 @ M = G + H veor q2, q2, q8 @ M = G + H
vmull.p8 q0, d26, d0 @ I = A*B3 vmull.p8 q0, d26, d0 @ I = A*B3
veor d2, d2, d3 @ t0 = (L) (P0 + P1) << 8 veor d2, d2, d3 @ t0 = (L) (P0 + P1) << 8
vand d3, d3, d28 vand d3, d3, d28
vext.8 d16, d27, d27, #4 @ B4 vext.8 d16, d27, d27, #4 @ B4
veor d4, d4, d5 @ t1 = (M) (P2 + P3) << 16 veor d4, d4, d5 @ t1 = (M) (P2 + P3) << 16
vand d5, d5, d29 vand d5, d5, d29
vmull.p8 q8, d26, d16 @ K = A*B4 vmull.p8 q8, d26, d16 @ K = A*B4
veor q3, q3, q0 @ N = I + J veor q3, q3, q0 @ N = I + J
veor d2, d2, d3 veor d2, d2, d3
veor d4, d4, d5 veor d4, d4, d5
veor d6, d6, d7 @ t2 = (N) (P4 + P5) << 24 veor d6, d6, d7 @ t2 = (N) (P4 + P5) << 24
vand d7, d7, d30 vand d7, d7, d30
vext.8 q1, q1, q1, #15 vext.8 q1, q1, q1, #15
veor d16, d16, d17 @ t3 = (K) (P6 + P7) << 32 veor d16, d16, d17 @ t3 = (K) (P6 + P7) << 32
vmov.i64 d17, #0 vmov.i64 d17, #0
vext.8 q2, q2, q2, #14 vext.8 q2, q2, q2, #14
veor d6, d6, d7 veor d6, d6, d7
vmull.p8 q0, d26, d27 @ D = A*B vmull.p8 q0, d26, d27 @ D = A*B
vext.8 q8, q8, q8, #12 vext.8 q8, q8, q8, #12
vext.8 q3, q3, q3, #13 vext.8 q3, q3, q3, #13
veor q1, q1, q2 veor q1, q1, q2
veor q3, q3, q8 veor q3, q3, q8
veor q0, q0, q1 veor q0, q0, q1
veor q0, q0, q3 veor q0, q0, q3
vst1.32 {q0}, [r0] vst1.32 {q0}, [r0]
bx lr @ bx lr bx lr @ bx lr
#endif #endif
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_MAX_ARCH__>=7 #if __ARM_MAX_ARCH__>=7
.align 5 .align 5
.LOPENSSL_armcap: .LOPENSSL_armcap:
.word OPENSSL_armcap_P-(.Lpic+8) .word OPENSSL_armcap_P-.
#endif #endif
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>" .byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 5 .align 5
#if __ARM_MAX_ARCH__>=7 #if __ARM_MAX_ARCH__>=7

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -3,11 +3,15 @@
#include "arm_arch.h" #include "arm_arch.h"
.text .text
#if defined(__thumb2__) || defined(__clang__)
.syntax unified
#define ldrplb ldrbpl
#define ldrneb ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code 32 .code 32
#ifdef __clang__
#define ldrplb ldrbpl
#define ldrneb ldrbne
#endif #endif
.type rem_4bit,%object .type rem_4bit,%object
@ -21,22 +25,30 @@ rem_4bit:
.type rem_4bit_get,%function .type rem_4bit_get,%function
rem_4bit_get: rem_4bit_get:
sub r2,pc,#8 #if defined(__thumb2__)
sub r2,r2,#32 @ &rem_4bit adr r2,rem_4bit
#else
sub r2,pc,#8+32 @ &rem_4bit
#endif
b .Lrem_4bit_got b .Lrem_4bit_got
nop nop
nop
.size rem_4bit_get,.-rem_4bit_get .size rem_4bit_get,.-rem_4bit_get
.global gcm_ghash_4bit .globl gcm_ghash_4bit
.type gcm_ghash_4bit,%function .type gcm_ghash_4bit,%function
.align 4
gcm_ghash_4bit: gcm_ghash_4bit:
sub r12,pc,#8 #if defined(__thumb2__)
adr r12,rem_4bit
#else
sub r12,pc,#8+48 @ &rem_4bit
#endif
add r3,r2,r3 @ r3 to point at the end add r3,r2,r3 @ r3 to point at the end
stmdb sp!,{r3-r11,lr} @ save r3/end too stmdb sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr} @ save r3/end too
sub r12,r12,#48 @ &rem_4bit
ldmia r12,{r4-r11} @ copy rem_4bit ... ldmia r12,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy rem_4bit ...
stmdb sp!,{r4-r11} @ ... to stack stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ ... to stack
ldrb r12,[r2,#15] ldrb r12,[r2,#15]
ldrb r14,[r0,#15] ldrb r14,[r0,#15]
@ -47,12 +59,12 @@ gcm_ghash_4bit:
mov r3,#14 mov r3,#14
add r7,r1,r12,lsl#4 add r7,r1,r12,lsl#4
ldmia r7,{r4-r7} @ load Htbl[nlo] ldmia r7,{r4,r5,r6,r7} @ load Htbl[nlo]
add r11,r1,r14 add r11,r1,r14
ldrb r12,[r2,#14] ldrb r12,[r2,#14]
and r14,r4,#0xf @ rem and r14,r4,#0xf @ rem
ldmia r11,{r8-r11} @ load Htbl[nhi] ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi]
add r14,r14,r14 add r14,r14,r14
eor r4,r8,r4,lsr#4 eor r4,r8,r4,lsr#4
ldrh r8,[sp,r14] @ rem_4bit[rem] ldrh r8,[sp,r14] @ rem_4bit[rem]
@ -73,13 +85,16 @@ gcm_ghash_4bit:
and r12,r4,#0xf @ rem and r12,r4,#0xf @ rem
subs r3,r3,#1 subs r3,r3,#1
add r12,r12,r12 add r12,r12,r12
ldmia r11,{r8-r11} @ load Htbl[nlo] ldmia r11,{r8,r9,r10,r11} @ load Htbl[nlo]
eor r4,r8,r4,lsr#4 eor r4,r8,r4,lsr#4
eor r4,r4,r5,lsl#28 eor r4,r4,r5,lsl#28
eor r5,r9,r5,lsr#4 eor r5,r9,r5,lsr#4
eor r5,r5,r6,lsl#28 eor r5,r5,r6,lsl#28
ldrh r8,[sp,r12] @ rem_4bit[rem] ldrh r8,[sp,r12] @ rem_4bit[rem]
eor r6,r10,r6,lsr#4 eor r6,r10,r6,lsr#4
#ifdef __thumb2__
it pl
#endif
ldrplb r12,[r2,r3] ldrplb r12,[r2,r3]
eor r6,r6,r7,lsl#28 eor r6,r6,r7,lsl#28
eor r7,r11,r7,lsr#4 eor r7,r11,r7,lsr#4
@ -88,8 +103,11 @@ gcm_ghash_4bit:
and r14,r4,#0xf @ rem and r14,r4,#0xf @ rem
eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem] eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
add r14,r14,r14 add r14,r14,r14
ldmia r11,{r8-r11} @ load Htbl[nhi] ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi]
eor r4,r8,r4,lsr#4 eor r4,r8,r4,lsr#4
#ifdef __thumb2__
it pl
#endif
ldrplb r8,[r0,r3] ldrplb r8,[r0,r3]
eor r4,r4,r5,lsl#28 eor r4,r4,r5,lsl#28
eor r5,r9,r5,lsr#4 eor r5,r9,r5,lsr#4
@ -97,8 +115,14 @@ gcm_ghash_4bit:
eor r5,r5,r6,lsl#28 eor r5,r5,r6,lsl#28
eor r6,r10,r6,lsr#4 eor r6,r10,r6,lsr#4
eor r6,r6,r7,lsl#28 eor r6,r6,r7,lsl#28
#ifdef __thumb2__
it pl
#endif
eorpl r12,r12,r8 eorpl r12,r12,r8
eor r7,r11,r7,lsr#4 eor r7,r11,r7,lsr#4
#ifdef __thumb2__
itt pl
#endif
andpl r14,r12,#0xf0 andpl r14,r12,#0xf0
andpl r12,r12,#0x0f andpl r12,r12,#0x0f
eor r7,r7,r9,lsl#16 @ ^= rem_4bit[rem] eor r7,r7,r9,lsl#16 @ ^= rem_4bit[rem]
@ -136,6 +160,10 @@ gcm_ghash_4bit:
strb r10,[r0,#8+1] strb r10,[r0,#8+1]
strb r11,[r0,#8] strb r11,[r0,#8]
#endif #endif
#ifdef __thumb2__
it ne
#endif
ldrneb r12,[r2,#15] ldrneb r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__) #if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r6,r6 rev r6,r6
@ -151,7 +179,7 @@ gcm_ghash_4bit:
strb r10,[r0,#4+1] strb r10,[r0,#4+1]
strb r11,[r0,#4] strb r11,[r0,#4]
#endif #endif
#if __ARM_ARCH__>=7 && defined(__ARMEL__) #if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r7,r7 rev r7,r7
str r7,[r0,#0] str r7,[r0,#0]
@ -166,24 +194,24 @@ gcm_ghash_4bit:
strb r10,[r0,#0+1] strb r10,[r0,#0+1]
strb r11,[r0,#0] strb r11,[r0,#0]
#endif #endif
bne .Louter bne .Louter
add sp,sp,#36 add sp,sp,#36
#if __ARM_ARCH__>=5 #if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else #else
ldmia sp!,{r4-r11,lr} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-) .word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif #endif
.size gcm_ghash_4bit,.-gcm_ghash_4bit .size gcm_ghash_4bit,.-gcm_ghash_4bit
.global gcm_gmult_4bit .globl gcm_gmult_4bit
.type gcm_gmult_4bit,%function .type gcm_gmult_4bit,%function
gcm_gmult_4bit: gcm_gmult_4bit:
stmdb sp!,{r4-r11,lr} stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
ldrb r12,[r0,#15] ldrb r12,[r0,#15]
b rem_4bit_get b rem_4bit_get
.Lrem_4bit_got: .Lrem_4bit_got:
@ -192,12 +220,12 @@ gcm_gmult_4bit:
mov r3,#14 mov r3,#14
add r7,r1,r12,lsl#4 add r7,r1,r12,lsl#4
ldmia r7,{r4-r7} @ load Htbl[nlo] ldmia r7,{r4,r5,r6,r7} @ load Htbl[nlo]
ldrb r12,[r0,#14] ldrb r12,[r0,#14]
add r11,r1,r14 add r11,r1,r14
and r14,r4,#0xf @ rem and r14,r4,#0xf @ rem
ldmia r11,{r8-r11} @ load Htbl[nhi] ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi]
add r14,r14,r14 add r14,r14,r14
eor r4,r8,r4,lsr#4 eor r4,r8,r4,lsr#4
ldrh r8,[r2,r14] @ rem_4bit[rem] ldrh r8,[r2,r14] @ rem_4bit[rem]
@ -216,13 +244,16 @@ gcm_gmult_4bit:
and r12,r4,#0xf @ rem and r12,r4,#0xf @ rem
subs r3,r3,#1 subs r3,r3,#1
add r12,r12,r12 add r12,r12,r12
ldmia r11,{r8-r11} @ load Htbl[nlo] ldmia r11,{r8,r9,r10,r11} @ load Htbl[nlo]
eor r4,r8,r4,lsr#4 eor r4,r8,r4,lsr#4
eor r4,r4,r5,lsl#28 eor r4,r4,r5,lsl#28
eor r5,r9,r5,lsr#4 eor r5,r9,r5,lsr#4
eor r5,r5,r6,lsl#28 eor r5,r5,r6,lsl#28
ldrh r8,[r2,r12] @ rem_4bit[rem] ldrh r8,[r2,r12] @ rem_4bit[rem]
eor r6,r10,r6,lsr#4 eor r6,r10,r6,lsr#4
#ifdef __thumb2__
it pl
#endif
ldrplb r12,[r0,r3] ldrplb r12,[r0,r3]
eor r6,r6,r7,lsl#28 eor r6,r6,r7,lsl#28
eor r7,r11,r7,lsr#4 eor r7,r11,r7,lsr#4
@ -231,7 +262,7 @@ gcm_gmult_4bit:
and r14,r4,#0xf @ rem and r14,r4,#0xf @ rem
eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem] eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
add r14,r14,r14 add r14,r14,r14
ldmia r11,{r8-r11} @ load Htbl[nhi] ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi]
eor r4,r8,r4,lsr#4 eor r4,r8,r4,lsr#4
eor r4,r4,r5,lsl#28 eor r4,r4,r5,lsl#28
eor r5,r9,r5,lsr#4 eor r5,r9,r5,lsr#4
@ -240,6 +271,9 @@ gcm_gmult_4bit:
eor r6,r10,r6,lsr#4 eor r6,r10,r6,lsr#4
eor r6,r6,r7,lsl#28 eor r6,r6,r7,lsl#28
eor r7,r11,r7,lsr#4 eor r7,r11,r7,lsr#4
#ifdef __thumb2__
itt pl
#endif
andpl r14,r12,#0xf0 andpl r14,r12,#0xf0
andpl r12,r12,#0x0f andpl r12,r12,#0x0f
eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem] eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
@ -258,7 +292,7 @@ gcm_gmult_4bit:
strb r10,[r0,#12+1] strb r10,[r0,#12+1]
strb r11,[r0,#12] strb r11,[r0,#12]
#endif #endif
#if __ARM_ARCH__>=7 && defined(__ARMEL__) #if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r5,r5 rev r5,r5
str r5,[r0,#8] str r5,[r0,#8]
@ -273,7 +307,7 @@ gcm_gmult_4bit:
strb r10,[r0,#8+1] strb r10,[r0,#8+1]
strb r11,[r0,#8] strb r11,[r0,#8]
#endif #endif
#if __ARM_ARCH__>=7 && defined(__ARMEL__) #if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r6,r6 rev r6,r6
str r6,[r0,#4] str r6,[r0,#4]
@ -288,7 +322,7 @@ gcm_gmult_4bit:
strb r10,[r0,#4+1] strb r10,[r0,#4+1]
strb r11,[r0,#4] strb r11,[r0,#4]
#endif #endif
#if __ARM_ARCH__>=7 && defined(__ARMEL__) #if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r7,r7 rev r7,r7
str r7,[r0,#0] str r7,[r0,#0]
@ -303,228 +337,229 @@ gcm_gmult_4bit:
strb r10,[r0,#0+1] strb r10,[r0,#0+1]
strb r11,[r0,#0] strb r11,[r0,#0]
#endif #endif
#if __ARM_ARCH__>=5 #if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else #else
ldmia sp!,{r4-r11,lr} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-) .word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif #endif
.size gcm_gmult_4bit,.-gcm_gmult_4bit .size gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_MAX_ARCH__>=7 #if __ARM_MAX_ARCH__>=7
.arch armv7-a .arch armv7-a
.fpu neon .fpu neon
.global gcm_init_neon .globl gcm_init_neon
.type gcm_init_neon,%function .type gcm_init_neon,%function
.align 4 .align 4
gcm_init_neon: gcm_init_neon:
vld1.64 d7,[r1,:64]! @ load H vld1.64 d7,[r1]! @ load H
vmov.i8 q8,#0xe1 vmov.i8 q8,#0xe1
vld1.64 d6,[r1,:64] vld1.64 d6,[r1]
vshl.i64 d17,#57 vshl.i64 d17,#57
vshr.u64 d16,#63 @ t0=0xc2....01 vshr.u64 d16,#63 @ t0=0xc2....01
vdup.8 q9,d7[7] vdup.8 q9,d7[7]
vshr.u64 d26,d6,#63 vshr.u64 d26,d6,#63
vshr.s8 q9,#7 @ broadcast carry bit vshr.s8 q9,#7 @ broadcast carry bit
vshl.i64 q3,q3,#1 vshl.i64 q3,q3,#1
vand q8,q8,q9 vand q8,q8,q9
vorr d7,d26 @ H<<<=1 vorr d7,d26 @ H<<<=1
veor q3,q3,q8 @ twisted H veor q3,q3,q8 @ twisted H
vstmia r0,{q3} vstmia r0,{q3}
bx lr @ bx lr bx lr @ bx lr
.size gcm_init_neon,.-gcm_init_neon .size gcm_init_neon,.-gcm_init_neon
.global gcm_gmult_neon .globl gcm_gmult_neon
.type gcm_gmult_neon,%function .type gcm_gmult_neon,%function
.align 4 .align 4
gcm_gmult_neon: gcm_gmult_neon:
vld1.64 d7,[r0,:64]! @ load Xi vld1.64 d7,[r0]! @ load Xi
vld1.64 d6,[r0,:64]! vld1.64 d6,[r0]!
vmov.i64 d29,#0x0000ffffffffffff vmov.i64 d29,#0x0000ffffffffffff
vldmia r1,{d26-d27} @ load twisted H vldmia r1,{d26,d27} @ load twisted H
vmov.i64 d30,#0x00000000ffffffff vmov.i64 d30,#0x00000000ffffffff
#ifdef __ARMEL__ #ifdef __ARMEL__
vrev64.8 q3,q3 vrev64.8 q3,q3
#endif #endif
vmov.i64 d31,#0x000000000000ffff vmov.i64 d31,#0x000000000000ffff
veor d28,d26,d27 @ Karatsuba pre-processing veor d28,d26,d27 @ Karatsuba pre-processing
mov r3,#16 mov r3,#16
b .Lgmult_neon b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon .size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon .globl gcm_ghash_neon
.type gcm_ghash_neon,%function .type gcm_ghash_neon,%function
.align 4 .align 4
gcm_ghash_neon: gcm_ghash_neon:
vld1.64 d1,[r0,:64]! @ load Xi vld1.64 d1,[r0]! @ load Xi
vld1.64 d0,[r0,:64]! vld1.64 d0,[r0]!
vmov.i64 d29,#0x0000ffffffffffff vmov.i64 d29,#0x0000ffffffffffff
vldmia r1,{d26-d27} @ load twisted H vldmia r1,{d26,d27} @ load twisted H
vmov.i64 d30,#0x00000000ffffffff vmov.i64 d30,#0x00000000ffffffff
#ifdef __ARMEL__ #ifdef __ARMEL__
vrev64.8 q0,q0 vrev64.8 q0,q0
#endif #endif
vmov.i64 d31,#0x000000000000ffff vmov.i64 d31,#0x000000000000ffff
veor d28,d26,d27 @ Karatsuba pre-processing veor d28,d26,d27 @ Karatsuba pre-processing
.Loop_neon: .Loop_neon:
vld1.64 d7,[r2]! @ load inp vld1.64 d7,[r2]! @ load inp
vld1.64 d6,[r2]! vld1.64 d6,[r2]!
#ifdef __ARMEL__ #ifdef __ARMEL__
vrev64.8 q3,q3 vrev64.8 q3,q3
#endif #endif
veor q3,q0 @ inp^=Xi veor q3,q0 @ inp^=Xi
.Lgmult_neon: .Lgmult_neon:
vext.8 d16, d26, d26, #1 @ A1 vext.8 d16, d26, d26, #1 @ A1
vmull.p8 q8, d16, d6 @ F = A1*B vmull.p8 q8, d16, d6 @ F = A1*B
vext.8 d0, d6, d6, #1 @ B1 vext.8 d0, d6, d6, #1 @ B1
vmull.p8 q0, d26, d0 @ E = A*B1 vmull.p8 q0, d26, d0 @ E = A*B1
vext.8 d18, d26, d26, #2 @ A2 vext.8 d18, d26, d26, #2 @ A2
vmull.p8 q9, d18, d6 @ H = A2*B vmull.p8 q9, d18, d6 @ H = A2*B
vext.8 d22, d6, d6, #2 @ B2 vext.8 d22, d6, d6, #2 @ B2
vmull.p8 q11, d26, d22 @ G = A*B2 vmull.p8 q11, d26, d22 @ G = A*B2
vext.8 d20, d26, d26, #3 @ A3 vext.8 d20, d26, d26, #3 @ A3
veor q8, q8, q0 @ L = E + F veor q8, q8, q0 @ L = E + F
vmull.p8 q10, d20, d6 @ J = A3*B vmull.p8 q10, d20, d6 @ J = A3*B
vext.8 d0, d6, d6, #3 @ B3 vext.8 d0, d6, d6, #3 @ B3
veor q9, q9, q11 @ M = G + H veor q9, q9, q11 @ M = G + H
vmull.p8 q0, d26, d0 @ I = A*B3 vmull.p8 q0, d26, d0 @ I = A*B3
veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
vand d17, d17, d29 vand d17, d17, d29
vext.8 d22, d6, d6, #4 @ B4 vext.8 d22, d6, d6, #4 @ B4
veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
vand d19, d19, d30 vand d19, d19, d30
vmull.p8 q11, d26, d22 @ K = A*B4 vmull.p8 q11, d26, d22 @ K = A*B4
veor q10, q10, q0 @ N = I + J veor q10, q10, q0 @ N = I + J
veor d16, d16, d17 veor d16, d16, d17
veor d18, d18, d19 veor d18, d18, d19
veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
vand d21, d21, d31 vand d21, d21, d31
vext.8 q8, q8, q8, #15 vext.8 q8, q8, q8, #15
veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
vmov.i64 d23, #0 vmov.i64 d23, #0
vext.8 q9, q9, q9, #14 vext.8 q9, q9, q9, #14
veor d20, d20, d21 veor d20, d20, d21
vmull.p8 q0, d26, d6 @ D = A*B vmull.p8 q0, d26, d6 @ D = A*B
vext.8 q11, q11, q11, #12 vext.8 q11, q11, q11, #12
vext.8 q10, q10, q10, #13 vext.8 q10, q10, q10, #13
veor q8, q8, q9 veor q8, q8, q9
veor q10, q10, q11 veor q10, q10, q11
veor q0, q0, q8 veor q0, q0, q8
veor q0, q0, q10 veor q0, q0, q10
veor d6,d6,d7 @ Karatsuba pre-processing veor d6,d6,d7 @ Karatsuba pre-processing
vext.8 d16, d28, d28, #1 @ A1 vext.8 d16, d28, d28, #1 @ A1
vmull.p8 q8, d16, d6 @ F = A1*B vmull.p8 q8, d16, d6 @ F = A1*B
vext.8 d2, d6, d6, #1 @ B1 vext.8 d2, d6, d6, #1 @ B1
vmull.p8 q1, d28, d2 @ E = A*B1 vmull.p8 q1, d28, d2 @ E = A*B1
vext.8 d18, d28, d28, #2 @ A2 vext.8 d18, d28, d28, #2 @ A2
vmull.p8 q9, d18, d6 @ H = A2*B vmull.p8 q9, d18, d6 @ H = A2*B
vext.8 d22, d6, d6, #2 @ B2 vext.8 d22, d6, d6, #2 @ B2
vmull.p8 q11, d28, d22 @ G = A*B2 vmull.p8 q11, d28, d22 @ G = A*B2
vext.8 d20, d28, d28, #3 @ A3 vext.8 d20, d28, d28, #3 @ A3
veor q8, q8, q1 @ L = E + F veor q8, q8, q1 @ L = E + F
vmull.p8 q10, d20, d6 @ J = A3*B vmull.p8 q10, d20, d6 @ J = A3*B
vext.8 d2, d6, d6, #3 @ B3 vext.8 d2, d6, d6, #3 @ B3
veor q9, q9, q11 @ M = G + H veor q9, q9, q11 @ M = G + H
vmull.p8 q1, d28, d2 @ I = A*B3 vmull.p8 q1, d28, d2 @ I = A*B3
veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
vand d17, d17, d29 vand d17, d17, d29
vext.8 d22, d6, d6, #4 @ B4 vext.8 d22, d6, d6, #4 @ B4
veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
vand d19, d19, d30 vand d19, d19, d30
vmull.p8 q11, d28, d22 @ K = A*B4 vmull.p8 q11, d28, d22 @ K = A*B4
veor q10, q10, q1 @ N = I + J veor q10, q10, q1 @ N = I + J
veor d16, d16, d17 veor d16, d16, d17
veor d18, d18, d19 veor d18, d18, d19
veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
vand d21, d21, d31 vand d21, d21, d31
vext.8 q8, q8, q8, #15 vext.8 q8, q8, q8, #15
veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
vmov.i64 d23, #0 vmov.i64 d23, #0
vext.8 q9, q9, q9, #14 vext.8 q9, q9, q9, #14
veor d20, d20, d21 veor d20, d20, d21
vmull.p8 q1, d28, d6 @ D = A*B vmull.p8 q1, d28, d6 @ D = A*B
vext.8 q11, q11, q11, #12 vext.8 q11, q11, q11, #12
vext.8 q10, q10, q10, #13 vext.8 q10, q10, q10, #13
veor q8, q8, q9 veor q8, q8, q9
veor q10, q10, q11 veor q10, q10, q11
veor q1, q1, q8 veor q1, q1, q8
veor q1, q1, q10 veor q1, q1, q10
vext.8 d16, d27, d27, #1 @ A1 vext.8 d16, d27, d27, #1 @ A1
vmull.p8 q8, d16, d7 @ F = A1*B vmull.p8 q8, d16, d7 @ F = A1*B
vext.8 d4, d7, d7, #1 @ B1 vext.8 d4, d7, d7, #1 @ B1
vmull.p8 q2, d27, d4 @ E = A*B1 vmull.p8 q2, d27, d4 @ E = A*B1
vext.8 d18, d27, d27, #2 @ A2 vext.8 d18, d27, d27, #2 @ A2
vmull.p8 q9, d18, d7 @ H = A2*B vmull.p8 q9, d18, d7 @ H = A2*B
vext.8 d22, d7, d7, #2 @ B2 vext.8 d22, d7, d7, #2 @ B2
vmull.p8 q11, d27, d22 @ G = A*B2 vmull.p8 q11, d27, d22 @ G = A*B2
vext.8 d20, d27, d27, #3 @ A3 vext.8 d20, d27, d27, #3 @ A3
veor q8, q8, q2 @ L = E + F veor q8, q8, q2 @ L = E + F
vmull.p8 q10, d20, d7 @ J = A3*B vmull.p8 q10, d20, d7 @ J = A3*B
vext.8 d4, d7, d7, #3 @ B3 vext.8 d4, d7, d7, #3 @ B3
veor q9, q9, q11 @ M = G + H veor q9, q9, q11 @ M = G + H
vmull.p8 q2, d27, d4 @ I = A*B3 vmull.p8 q2, d27, d4 @ I = A*B3
veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
vand d17, d17, d29 vand d17, d17, d29
vext.8 d22, d7, d7, #4 @ B4 vext.8 d22, d7, d7, #4 @ B4
veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
vand d19, d19, d30 vand d19, d19, d30
vmull.p8 q11, d27, d22 @ K = A*B4 vmull.p8 q11, d27, d22 @ K = A*B4
veor q10, q10, q2 @ N = I + J veor q10, q10, q2 @ N = I + J
veor d16, d16, d17 veor d16, d16, d17
veor d18, d18, d19 veor d18, d18, d19
veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
vand d21, d21, d31 vand d21, d21, d31
vext.8 q8, q8, q8, #15 vext.8 q8, q8, q8, #15
veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
vmov.i64 d23, #0 vmov.i64 d23, #0
vext.8 q9, q9, q9, #14 vext.8 q9, q9, q9, #14
veor d20, d20, d21 veor d20, d20, d21
vmull.p8 q2, d27, d7 @ D = A*B vmull.p8 q2, d27, d7 @ D = A*B
vext.8 q11, q11, q11, #12 vext.8 q11, q11, q11, #12
vext.8 q10, q10, q10, #13 vext.8 q10, q10, q10, #13
veor q8, q8, q9 veor q8, q8, q9
veor q10, q10, q11 veor q10, q10, q11
veor q2, q2, q8 veor q2, q2, q8
veor q2, q2, q10 veor q2, q2, q10
veor q1,q1,q0 @ Karatsuba post-processing veor q1,q1,q0 @ Karatsuba post-processing
veor q1,q1,q2 veor q1,q1,q2
veor d1,d1,d2 veor d1,d1,d2
veor d4,d4,d3 @ Xh|Xl - 256-bit result veor d4,d4,d3 @ Xh|Xl - 256-bit result
@ equivalent of reduction_avx from ghash-x86_64.pl @ equivalent of reduction_avx from ghash-x86_64.pl
vshl.i64 q9,q0,#57 @ 1st phase vshl.i64 q9,q0,#57 @ 1st phase
vshl.i64 q10,q0,#62 vshl.i64 q10,q0,#62
veor q10,q10,q9 @ veor q10,q10,q9 @
vshl.i64 q9,q0,#63 vshl.i64 q9,q0,#63
veor q10, q10, q9 @ veor q10, q10, q9 @
veor d1,d1,d20 @ veor d1,d1,d20 @
veor d4,d4,d21 veor d4,d4,d21
vshr.u64 q10,q0,#1 @ 2nd phase vshr.u64 q10,q0,#1 @ 2nd phase
veor q2,q2,q0 veor q2,q2,q0
veor q0,q0,q10 @ veor q0,q0,q10 @
vshr.u64 q10,q10,#6 vshr.u64 q10,q10,#6
vshr.u64 q0,q0,#1 @ vshr.u64 q0,q0,#1 @
veor q0,q0,q2 @ veor q0,q0,q2 @
veor q0,q0,q10 @ veor q0,q0,q10 @
subs r3,#16 subs r3,#16
bne .Loop_neon bne .Loop_neon
#ifdef __ARMEL__ #ifdef __ARMEL__
vrev64.8 q0,q0 vrev64.8 q0,q0
#endif #endif
sub r0,#16 sub r0,#16
vst1.64 d1,[r0,:64]! @ write out Xi vst1.64 d1,[r0]! @ write out Xi
vst1.64 d0,[r0,:64] vst1.64 d0,[r0]
bx lr @ bx lr bx lr @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon .size gcm_ghash_neon,.-gcm_ghash_neon
#endif #endif
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>" .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2 .align 2
.align 2
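For orientation, the gcm_init_neon/gcm_gmult_neon/gcm_ghash_neon entry points emitted above are driven from C glue in OpenSSL's crypto/modes code. The short C sketch below only illustrates that calling convention as inferred from the register usage in the assembly (r0 = Xi, r1 = Htable, r2 = inp, r3 = len); the u128 stand-in type and the ghash_update/ghash_final helpers are hypothetical and not part of OpenSSL.

#include <stddef.h>
#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;   /* stand-in for OpenSSL's internal u128 */

/* Prototypes inferred from the register usage above: r0=Xi, r1=Htable, r2=inp, r3=len. */
extern void gcm_init_neon(u128 Htable[16], const uint64_t H[2]);
extern void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]);
extern void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16],
                           const uint8_t *inp, size_t len);

/* Hash whole 16-byte blocks into Xi; the gcm_ghash_neon loop consumes
 * 16 bytes per iteration (subs r3,#16), so pass a multiple of 16. */
static void ghash_update(uint64_t Xi[2], const u128 Htable[16],
                         const uint8_t *inp, size_t len)
{
    gcm_ghash_neon(Xi, Htable, inp, len & ~(size_t)15);
}

/* One extra multiply, e.g. after XOR-ing a padded final block or the
 * length block into Xi. */
static void ghash_final(uint64_t Xi[2], const u128 Htable[16])
{
    gcm_gmult_neon(Xi, Htable);
}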
@ -2,231 +2,234 @@
/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */ /* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
#include "arm_arch.h" #include "arm_arch.h"
#if __ARM_MAX_ARCH__>=7
.text .text
.fpu neon .fpu neon
.code 32 .code 32
.global gcm_init_v8 #undef __thumb2__
.globl gcm_init_v8
.type gcm_init_v8,%function .type gcm_init_v8,%function
.align 4 .align 4
gcm_init_v8: gcm_init_v8:
vld1.64 {q9},[r1] @ load input H vld1.64 {q9},[r1] @ load input H
vmov.i8 q11,#0xe1 vmov.i8 q11,#0xe1
vshl.i64 q11,q11,#57 @ 0xc2.0 vshl.i64 q11,q11,#57 @ 0xc2.0
vext.8 q3,q9,q9,#8 vext.8 q3,q9,q9,#8
vshr.u64 q10,q11,#63 vshr.u64 q10,q11,#63
vdup.32 q9,d18[1] vdup.32 q9,d18[1]
vext.8 q8,q10,q11,#8 @ t0=0xc2....01 vext.8 q8,q10,q11,#8 @ t0=0xc2....01
vshr.u64 q10,q3,#63 vshr.u64 q10,q3,#63
vshr.s32 q9,q9,#31 @ broadcast carry bit vshr.s32 q9,q9,#31 @ broadcast carry bit
vand q10,q10,q8 vand q10,q10,q8
vshl.i64 q3,q3,#1 vshl.i64 q3,q3,#1
vext.8 q10,q10,q10,#8 vext.8 q10,q10,q10,#8
vand q8,q8,q9 vand q8,q8,q9
vorr q3,q3,q10 @ H<<<=1 vorr q3,q3,q10 @ H<<<=1
veor q12,q3,q8 @ twisted H veor q12,q3,q8 @ twisted H
vst1.64 {q12},[r0]! @ store Htable[0] vst1.64 {q12},[r0]! @ store Htable[0]
@ calculate H^2 @ calculate H^2
vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing
.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12 .byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12
veor q8,q8,q12 veor q8,q8,q12
.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12 .byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12
.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8 .byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2 veor q10,q0,q2
veor q1,q1,q9 veor q1,q1,q9
veor q1,q1,q10 veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
vmov d4,d3 @ Xh|Xm - 256-bit result vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl vmov d3,d0 @ Xm is rotated Xl
veor q0,q1,q10 veor q0,q1,q10
vext.8 q10,q0,q0,#8 @ 2nd phase vext.8 q10,q0,q0,#8 @ 2nd phase
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2 veor q10,q10,q2
veor q14,q0,q10 veor q14,q0,q10
vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing
veor q9,q9,q14
vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed
vst1.64 {q13-q14},[r0] @ store Htable[1..2]
vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing
veor q9,q9,q14
vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed
vst1.64 {q13,q14},[r0]! @ store Htable[1..2]
bx lr bx lr
.size gcm_init_v8,.-gcm_init_v8 .size gcm_init_v8,.-gcm_init_v8
.global gcm_gmult_v8 .globl gcm_gmult_v8
.type gcm_gmult_v8,%function .type gcm_gmult_v8,%function
.align 4 .align 4
gcm_gmult_v8: gcm_gmult_v8:
vld1.64 {q9},[r0] @ load Xi vld1.64 {q9},[r0] @ load Xi
vmov.i8 q11,#0xe1 vmov.i8 q11,#0xe1
vld1.64 {q12-q13},[r1] @ load twisted H, ... vld1.64 {q12,q13},[r1] @ load twisted H, ...
vshl.u64 q11,q11,#57 vshl.u64 q11,q11,#57
#ifndef __ARMEB__ #ifndef __ARMEB__
vrev64.8 q9,q9 vrev64.8 q9,q9
#endif #endif
vext.8 q3,q9,q9,#8 vext.8 q3,q9,q9,#8
.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo .byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
veor q9,q9,q3 @ Karatsuba pre-processing veor q9,q9,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi .byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) .byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2 veor q10,q0,q2
veor q1,q1,q9 veor q1,q1,q9
veor q1,q1,q10 veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
vmov d4,d3 @ Xh|Xm - 256-bit result vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl vmov d3,d0 @ Xm is rotated Xl
veor q0,q1,q10 veor q0,q1,q10
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2 veor q10,q10,q2
veor q0,q0,q10 veor q0,q0,q10
#ifndef __ARMEB__ #ifndef __ARMEB__
vrev64.8 q0,q0 vrev64.8 q0,q0
#endif #endif
vext.8 q0,q0,q0,#8 vext.8 q0,q0,q0,#8
vst1.64 {q0},[r0] @ write out Xi vst1.64 {q0},[r0] @ write out Xi
bx lr bx lr
.size gcm_gmult_v8,.-gcm_gmult_v8 .size gcm_gmult_v8,.-gcm_gmult_v8
.global gcm_ghash_v8 .globl gcm_ghash_v8
.type gcm_ghash_v8,%function .type gcm_ghash_v8,%function
.align 4 .align 4
gcm_ghash_v8: gcm_ghash_v8:
vstmdb sp!,{d8-d15} @ 32-bit ABI says so vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so
vld1.64 {q0},[r0] @ load [rotated] Xi vld1.64 {q0},[r0] @ load [rotated] Xi
@ "[rotated]" means that @ "[rotated]" means that
@ loaded value would have @ loaded value would have
@ to be rotated in order to @ to be rotated in order to
@ make it appear as in @ make it appear as in
@ alorithm specification @ algorithm specification
subs r3,r3,#32 @ see if r3 is 32 or larger subs r3,r3,#32 @ see if r3 is 32 or larger
mov r12,#16 @ r12 is used as post- mov r12,#16 @ r12 is used as post-
@ increment for input pointer; @ increment for input pointer;
@ as loop is modulo-scheduled @ as loop is modulo-scheduled
@ r12 is zeroed just in time @ r12 is zeroed just in time
@ to preclude oversteping @ to preclude overstepping
@ inp[len], which means that @ inp[len], which means that
@ last block[s] are actually @ last block[s] are actually
@ loaded twice, but last @ loaded twice, but last
@ copy is not processed @ copy is not processed
vld1.64 {q12-q13},[r1]! @ load twisted H, ..., H^2 vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2
vmov.i8 q11,#0xe1 vmov.i8 q11,#0xe1
vld1.64 {q14},[r1] vld1.64 {q14},[r1]
moveq r12,#0 @ is it time to zero r12? moveq r12,#0 @ is it time to zero r12?
vext.8 q0,q0,q0,#8 @ rotate Xi vext.8 q0,q0,q0,#8 @ rotate Xi
vld1.64 {q8},[r2]! @ load [rotated] I[0] vld1.64 {q8},[r2]! @ load [rotated] I[0]
vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant
#ifndef __ARMEB__ #ifndef __ARMEB__
vrev64.8 q8,q8 vrev64.8 q8,q8
vrev64.8 q0,q0 vrev64.8 q0,q0
#endif #endif
vext.8 q3,q8,q8,#8 @ rotate I[0] vext.8 q3,q8,q8,#8 @ rotate I[0]
blo .Lodd_tail_v8 @ r3 was less than 32 blo .Lodd_tail_v8 @ r3 was less than 32
vld1.64 {q9},[r2],r12 @ load [rotated] I[1] vld1.64 {q9},[r2],r12 @ load [rotated] I[1]
#ifndef __ARMEB__ #ifndef __ARMEB__
vrev64.8 q9,q9 vrev64.8 q9,q9
#endif #endif
vext.8 q7,q9,q9,#8 vext.8 q7,q9,q9,#8
veor q3,q3,q0 @ I[i]^=Xi veor q3,q3,q0 @ I[i]^=Xi
.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 .byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
veor q9,q9,q7 @ Karatsuba pre-processing veor q9,q9,q7 @ Karatsuba pre-processing
.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 .byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
b .Loop_mod2x_v8 b .Loop_mod2x_v8
.align 4 .align 4
.Loop_mod2x_v8: .Loop_mod2x_v8:
vext.8 q10,q3,q3,#8 vext.8 q10,q3,q3,#8
subs r3,r3,#32 @ is there more data? subs r3,r3,#32 @ is there more data?
.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo .byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo
movlo r12,#0 @ is it time to zero r12? movlo r12,#0 @ is it time to zero r12?
.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9 .byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9
veor q10,q10,q3 @ Karatsuba pre-processing veor q10,q10,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi .byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi
veor q0,q0,q4 @ accumulate veor q0,q0,q4 @ accumulate
.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) .byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2] vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2]
veor q2,q2,q6 veor q2,q2,q6
moveq r12,#0 @ is it time to zero r12? moveq r12,#0 @ is it time to zero r12?
veor q1,q1,q5 veor q1,q1,q5
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2 veor q10,q0,q2
veor q1,q1,q9 veor q1,q1,q9
vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3] vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3]
#ifndef __ARMEB__ #ifndef __ARMEB__
vrev64.8 q8,q8 vrev64.8 q8,q8
#endif #endif
veor q1,q1,q10 veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
#ifndef __ARMEB__ #ifndef __ARMEB__
vrev64.8 q9,q9 vrev64.8 q9,q9
#endif #endif
vmov d4,d3 @ Xh|Xm - 256-bit result vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl vmov d3,d0 @ Xm is rotated Xl
vext.8 q7,q9,q9,#8 vext.8 q7,q9,q9,#8
vext.8 q3,q8,q8,#8 vext.8 q3,q8,q8,#8
veor q0,q1,q10 veor q0,q1,q10
.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 .byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
veor q3,q3,q2 @ accumulate q3 early veor q3,q3,q2 @ accumulate q3 early
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q3,q3,q10 veor q3,q3,q10
veor q9,q9,q7 @ Karatsuba pre-processing veor q9,q9,q7 @ Karatsuba pre-processing
veor q3,q3,q0 veor q3,q3,q0
.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 .byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
bhs .Loop_mod2x_v8 @ there was at least 32 more bytes bhs .Loop_mod2x_v8 @ there was at least 32 more bytes
veor q2,q2,q10 veor q2,q2,q10
vext.8 q3,q8,q8,#8 @ re-construct q3 vext.8 q3,q8,q8,#8 @ re-construct q3
adds r3,r3,#32 @ re-construct r3 adds r3,r3,#32 @ re-construct r3
veor q0,q0,q2 @ re-construct q0 veor q0,q0,q2 @ re-construct q0
beq .Ldone_v8 @ is r3 zero? beq .Ldone_v8 @ is r3 zero?
.Lodd_tail_v8: .Lodd_tail_v8:
vext.8 q10,q0,q0,#8 vext.8 q10,q0,q0,#8
veor q3,q3,q0 @ inp^=Xi veor q3,q3,q0 @ inp^=Xi
veor q9,q8,q10 @ q9 is rotated inp^Xi veor q9,q8,q10 @ q9 is rotated inp^Xi
.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo .byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
veor q9,q9,q3 @ Karatsuba pre-processing veor q9,q9,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi .byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) .byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2 veor q10,q0,q2
veor q1,q1,q9 veor q1,q1,q9
veor q1,q1,q10 veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
vmov d4,d3 @ Xh|Xm - 256-bit result vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl vmov d3,d0 @ Xm is rotated Xl
veor q0,q1,q10 veor q0,q1,q10
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2 veor q10,q10,q2
veor q0,q0,q10 veor q0,q0,q10
.Ldone_v8: .Ldone_v8:
#ifndef __ARMEB__ #ifndef __ARMEB__
vrev64.8 q0,q0 vrev64.8 q0,q0
#endif #endif
vext.8 q0,q0,q0,#8 vext.8 q0,q0,q0,#8
vst1.64 {q0},[r0] @ write out Xi vst1.64 {q0},[r0] @ write out Xi
vldmia sp!,{d8-d15} @ 32-bit ABI says so vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so
bx lr bx lr
.size gcm_ghash_v8,.-gcm_ghash_v8 .size gcm_ghash_v8,.-gcm_ghash_v8
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>" .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2 .align 2
.align 2
#endif
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -3,23 +3,32 @@
#include "arm_arch.h" #include "arm_arch.h"
.text .text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32 .code 32
#endif
.global sha1_block_data_order .globl sha1_block_data_order
.type sha1_block_data_order,%function .type sha1_block_data_order,%function
.align 5 .align 5
sha1_block_data_order: sha1_block_data_order:
#if __ARM_MAX_ARCH__>=7 #if __ARM_MAX_ARCH__>=7
sub r3,pc,#8 @ sha1_block_data_order .Lsha1_block:
adr r3,.Lsha1_block
ldr r12,.LOPENSSL_armcap ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P ldr r12,[r3,r12] @ OPENSSL_armcap_P
#ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV8_SHA1 tst r12,#ARMV8_SHA1
bne .LARMv8 bne .LARMv8
tst r12,#ARMV7_NEON tst r12,#ARMV7_NEON
bne .LNEON bne .LNEON
#endif #endif
stmdb sp!,{r4-r12,lr} stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
ldmia r0,{r3,r4,r5,r6,r7} ldmia r0,{r3,r4,r5,r6,r7}
.Lloop: .Lloop:
@ -155,7 +164,12 @@ sha1_block_data_order:
eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
str r9,[r14,#-4]! str r9,[r14,#-4]!
add r3,r3,r10 @ E+=F_00_19(B,C,D) add r3,r3,r10 @ E+=F_00_19(B,C,D)
#if defined(__thumb2__)
mov r12,sp
teq r14,r12
#else
teq r14,sp teq r14,sp
#endif
bne .L_00_15 @ [((11+4)*5+2)*3] bne .L_00_15 @ [((11+4)*5+2)*3]
sub sp,sp,#25*4 sub sp,sp,#25*4
#if __ARM_ARCH__<7 #if __ARM_ARCH__<7
@ -195,7 +209,7 @@ sha1_block_data_order:
add r6,r6,r7,ror#27 @ E+=ROR(A,27) add r6,r6,r7,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31 eor r9,r9,r11,ror#31
str r9,[r14,#-4]! str r9,[r14,#-4]!
and r10,r3,r10,ror#2 @ F_xx_xx and r10,r3,r10,ror#2 @ F_xx_xx
@ F_xx_xx @ F_xx_xx
add r6,r6,r9 @ E+=X[i] add r6,r6,r9 @ E+=X[i]
eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
@ -212,7 +226,7 @@ sha1_block_data_order:
add r5,r5,r6,ror#27 @ E+=ROR(A,27) add r5,r5,r6,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31 eor r9,r9,r11,ror#31
str r9,[r14,#-4]! str r9,[r14,#-4]!
and r10,r7,r10,ror#2 @ F_xx_xx and r10,r7,r10,ror#2 @ F_xx_xx
@ F_xx_xx @ F_xx_xx
add r5,r5,r9 @ E+=X[i] add r5,r5,r9 @ E+=X[i]
eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
@ -229,7 +243,7 @@ sha1_block_data_order:
add r4,r4,r5,ror#27 @ E+=ROR(A,27) add r4,r4,r5,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31 eor r9,r9,r11,ror#31
str r9,[r14,#-4]! str r9,[r14,#-4]!
and r10,r6,r10,ror#2 @ F_xx_xx and r10,r6,r10,ror#2 @ F_xx_xx
@ F_xx_xx @ F_xx_xx
add r4,r4,r9 @ E+=X[i] add r4,r4,r9 @ E+=X[i]
eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
@ -246,7 +260,7 @@ sha1_block_data_order:
add r3,r3,r4,ror#27 @ E+=ROR(A,27) add r3,r3,r4,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31 eor r9,r9,r11,ror#31
str r9,[r14,#-4]! str r9,[r14,#-4]!
and r10,r5,r10,ror#2 @ F_xx_xx and r10,r5,r10,ror#2 @ F_xx_xx
@ F_xx_xx @ F_xx_xx
add r3,r3,r9 @ E+=X[i] add r3,r3,r9 @ E+=X[i]
eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
@ -267,7 +281,7 @@ sha1_block_data_order:
add r7,r7,r3,ror#27 @ E+=ROR(A,27) add r7,r7,r3,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31 eor r9,r9,r11,ror#31
str r9,[r14,#-4]! str r9,[r14,#-4]!
eor r10,r4,r10,ror#2 @ F_xx_xx eor r10,r4,r10,ror#2 @ F_xx_xx
@ F_xx_xx @ F_xx_xx
add r7,r7,r9 @ E+=X[i] add r7,r7,r9 @ E+=X[i]
add r7,r7,r10 @ E+=F_20_39(B,C,D) add r7,r7,r10 @ E+=F_20_39(B,C,D)
@ -283,7 +297,7 @@ sha1_block_data_order:
add r6,r6,r7,ror#27 @ E+=ROR(A,27) add r6,r6,r7,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31 eor r9,r9,r11,ror#31
str r9,[r14,#-4]! str r9,[r14,#-4]!
eor r10,r3,r10,ror#2 @ F_xx_xx eor r10,r3,r10,ror#2 @ F_xx_xx
@ F_xx_xx @ F_xx_xx
add r6,r6,r9 @ E+=X[i] add r6,r6,r9 @ E+=X[i]
add r6,r6,r10 @ E+=F_20_39(B,C,D) add r6,r6,r10 @ E+=F_20_39(B,C,D)
@ -299,7 +313,7 @@ sha1_block_data_order:
add r5,r5,r6,ror#27 @ E+=ROR(A,27) add r5,r5,r6,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31 eor r9,r9,r11,ror#31
str r9,[r14,#-4]! str r9,[r14,#-4]!
eor r10,r7,r10,ror#2 @ F_xx_xx eor r10,r7,r10,ror#2 @ F_xx_xx
@ F_xx_xx @ F_xx_xx
add r5,r5,r9 @ E+=X[i] add r5,r5,r9 @ E+=X[i]
add r5,r5,r10 @ E+=F_20_39(B,C,D) add r5,r5,r10 @ E+=F_20_39(B,C,D)
@ -315,7 +329,7 @@ sha1_block_data_order:
add r4,r4,r5,ror#27 @ E+=ROR(A,27) add r4,r4,r5,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31 eor r9,r9,r11,ror#31
str r9,[r14,#-4]! str r9,[r14,#-4]!
eor r10,r6,r10,ror#2 @ F_xx_xx eor r10,r6,r10,ror#2 @ F_xx_xx
@ F_xx_xx @ F_xx_xx
add r4,r4,r9 @ E+=X[i] add r4,r4,r9 @ E+=X[i]
add r4,r4,r10 @ E+=F_20_39(B,C,D) add r4,r4,r10 @ E+=F_20_39(B,C,D)
@ -331,11 +345,16 @@ sha1_block_data_order:
add r3,r3,r4,ror#27 @ E+=ROR(A,27) add r3,r3,r4,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31 eor r9,r9,r11,ror#31
str r9,[r14,#-4]! str r9,[r14,#-4]!
eor r10,r5,r10,ror#2 @ F_xx_xx eor r10,r5,r10,ror#2 @ F_xx_xx
@ F_xx_xx @ F_xx_xx
add r3,r3,r9 @ E+=X[i] add r3,r3,r9 @ E+=X[i]
add r3,r3,r10 @ E+=F_20_39(B,C,D) add r3,r3,r10 @ E+=F_20_39(B,C,D)
#if defined(__thumb2__)
mov r12,sp
teq r14,r12
#else
teq r14,sp @ preserve carry teq r14,sp @ preserve carry
#endif
bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
@ -354,8 +373,8 @@ sha1_block_data_order:
add r7,r7,r3,ror#27 @ E+=ROR(A,27) add r7,r7,r3,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31 eor r9,r9,r11,ror#31
str r9,[r14,#-4]! str r9,[r14,#-4]!
and r10,r4,r10,ror#2 @ F_xx_xx and r10,r4,r10,ror#2 @ F_xx_xx
and r11,r5,r6 @ F_xx_xx and r11,r5,r6 @ F_xx_xx
add r7,r7,r9 @ E+=X[i] add r7,r7,r9 @ E+=X[i]
add r7,r7,r10 @ E+=F_40_59(B,C,D) add r7,r7,r10 @ E+=F_40_59(B,C,D)
add r7,r7,r11,ror#2 add r7,r7,r11,ror#2
@ -371,8 +390,8 @@ sha1_block_data_order:
add r6,r6,r7,ror#27 @ E+=ROR(A,27) add r6,r6,r7,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31 eor r9,r9,r11,ror#31
str r9,[r14,#-4]! str r9,[r14,#-4]!
and r10,r3,r10,ror#2 @ F_xx_xx and r10,r3,r10,ror#2 @ F_xx_xx
and r11,r4,r5 @ F_xx_xx and r11,r4,r5 @ F_xx_xx
add r6,r6,r9 @ E+=X[i] add r6,r6,r9 @ E+=X[i]
add r6,r6,r10 @ E+=F_40_59(B,C,D) add r6,r6,r10 @ E+=F_40_59(B,C,D)
add r6,r6,r11,ror#2 add r6,r6,r11,ror#2
@ -388,8 +407,8 @@ sha1_block_data_order:
add r5,r5,r6,ror#27 @ E+=ROR(A,27) add r5,r5,r6,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31 eor r9,r9,r11,ror#31
str r9,[r14,#-4]! str r9,[r14,#-4]!
and r10,r7,r10,ror#2 @ F_xx_xx and r10,r7,r10,ror#2 @ F_xx_xx
and r11,r3,r4 @ F_xx_xx and r11,r3,r4 @ F_xx_xx
add r5,r5,r9 @ E+=X[i] add r5,r5,r9 @ E+=X[i]
add r5,r5,r10 @ E+=F_40_59(B,C,D) add r5,r5,r10 @ E+=F_40_59(B,C,D)
add r5,r5,r11,ror#2 add r5,r5,r11,ror#2
@ -405,8 +424,8 @@ sha1_block_data_order:
add r4,r4,r5,ror#27 @ E+=ROR(A,27) add r4,r4,r5,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31 eor r9,r9,r11,ror#31
str r9,[r14,#-4]! str r9,[r14,#-4]!
and r10,r6,r10,ror#2 @ F_xx_xx and r10,r6,r10,ror#2 @ F_xx_xx
and r11,r7,r3 @ F_xx_xx and r11,r7,r3 @ F_xx_xx
add r4,r4,r9 @ E+=X[i] add r4,r4,r9 @ E+=X[i]
add r4,r4,r10 @ E+=F_40_59(B,C,D) add r4,r4,r10 @ E+=F_40_59(B,C,D)
add r4,r4,r11,ror#2 add r4,r4,r11,ror#2
@ -422,12 +441,17 @@ sha1_block_data_order:
add r3,r3,r4,ror#27 @ E+=ROR(A,27) add r3,r3,r4,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31 eor r9,r9,r11,ror#31
str r9,[r14,#-4]! str r9,[r14,#-4]!
and r10,r5,r10,ror#2 @ F_xx_xx and r10,r5,r10,ror#2 @ F_xx_xx
and r11,r6,r7 @ F_xx_xx and r11,r6,r7 @ F_xx_xx
add r3,r3,r9 @ E+=X[i] add r3,r3,r9 @ E+=X[i]
add r3,r3,r10 @ E+=F_40_59(B,C,D) add r3,r3,r10 @ E+=F_40_59(B,C,D)
add r3,r3,r11,ror#2 add r3,r3,r11,ror#2
#if defined(__thumb2__)
mov r12,sp
teq r14,r12
#else
teq r14,sp teq r14,sp
#endif
bne .L_40_59 @ [+((12+5)*5+2)*4] bne .L_40_59 @ [+((12+5)*5+2)*4]
ldr r8,.LK_60_79 ldr r8,.LK_60_79
@ -447,25 +471,26 @@ sha1_block_data_order:
bne .Lloop @ [+18], total 1307 bne .Lloop @ [+18], total 1307
#if __ARM_ARCH__>=5 #if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else #else
ldmia sp!,{r4-r12,lr} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-) .word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif #endif
.size sha1_block_data_order,.-sha1_block_data_order .size sha1_block_data_order,.-sha1_block_data_order
.align 5 .align 5
.LK_00_19: .word 0x5a827999 .LK_00_19:.word 0x5a827999
.LK_20_39: .word 0x6ed9eba1 .LK_20_39:.word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc .LK_40_59:.word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6 .LK_60_79:.word 0xca62c1d6
#if __ARM_MAX_ARCH__>=7 #if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap: .LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha1_block_data_order .word OPENSSL_armcap_P-.Lsha1_block
#endif #endif
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>" .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 5 .align 5
#if __ARM_MAX_ARCH__>=7 #if __ARM_MAX_ARCH__>=7
.arch armv7-a .arch armv7-a
@ -475,33 +500,33 @@ sha1_block_data_order:
.align 4 .align 4
sha1_block_data_order_neon: sha1_block_data_order_neon:
.LNEON: .LNEON:
stmdb sp!,{r4-r12,lr} stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
@ dmb @ errata #451034 on early Cortex A8 @ dmb @ errata #451034 on early Cortex A8
@ vstmdb sp!,{d8-d15} @ ABI specification says so @ vstmdb sp!,{d8-d15} @ ABI specification says so
mov r14,sp mov r14,sp
sub sp,sp,#64 @ alloca sub r12,sp,#64
adr r8,.LK_00_19 adr r8,.LK_00_19
bic sp,sp,#15 @ align for 128-bit stores bic r12,r12,#15 @ align for 128-bit stores
ldmia r0,{r3,r4,r5,r6,r7} @ load context ldmia r0,{r3,r4,r5,r6,r7} @ load context
mov r12,sp mov sp,r12 @ alloca
vld1.8 {q0-q1},[r1]! @ handles unaligned vld1.8 {q0,q1},[r1]! @ handles unaligned
veor q15,q15,q15 veor q15,q15,q15
vld1.8 {q2-q3},[r1]! vld1.8 {q2,q3},[r1]!
vld1.32 {d28[],d29[]},[r8,:32]! @ load K_00_19 vld1.32 {d28[],d29[]},[r8,:32]! @ load K_00_19
vrev32.8 q0,q0 @ yes, even on vrev32.8 q0,q0 @ yes, even on
vrev32.8 q1,q1 @ big-endian... vrev32.8 q1,q1 @ big-endian...
vrev32.8 q2,q2 vrev32.8 q2,q2
vadd.i32 q8,q0,q14 vadd.i32 q8,q0,q14
vrev32.8 q3,q3 vrev32.8 q3,q3
vadd.i32 q9,q1,q14 vadd.i32 q9,q1,q14
vst1.32 {q8},[r12,:128]! vst1.32 {q8},[r12,:128]!
vadd.i32 q10,q2,q14 vadd.i32 q10,q2,q14
vst1.32 {q9},[r12,:128]! vst1.32 {q9},[r12,:128]!
vst1.32 {q10},[r12,:128]! vst1.32 {q10},[r12,:128]!
ldr r9,[sp] @ big RAW stall ldr r9,[sp] @ big RAW stall
.Loop_neon: .Loop_neon:
vext.8 q8,q0,q1,#8 vext.8 q8,q0,q1,#8
@ -1178,11 +1203,12 @@ sha1_block_data_order_neon:
sub r12,r12,#64 sub r12,r12,#64
teq r1,r2 teq r1,r2
sub r8,r8,#16 sub r8,r8,#16
it eq
subeq r1,r1,#64 subeq r1,r1,#64
vld1.8 {q0-q1},[r1]! vld1.8 {q0,q1},[r1]!
ldr r9,[sp,#4] ldr r9,[sp,#4]
eor r11,r10,r6 eor r11,r10,r6
vld1.8 {q2-q3},[r1]! vld1.8 {q2,q3},[r1]!
add r3,r3,r4,ror#27 add r3,r3,r4,ror#27
mov r5,r5,ror#2 mov r5,r5,ror#2
vld1.32 {d28[],d29[]},[r8,:32]! vld1.32 {d28[],d29[]},[r8,:32]!
@ -1307,23 +1333,33 @@ sha1_block_data_order_neon:
add r4,r4,r10 add r4,r4,r10
add r5,r5,r11 add r5,r5,r11
add r6,r6,r12 add r6,r6,r12
it eq
moveq sp,r14 moveq sp,r14
add r7,r7,r9 add r7,r7,r9
it ne
ldrne r9,[sp] ldrne r9,[sp]
stmia r0,{r3,r4,r5,r6,r7} stmia r0,{r3,r4,r5,r6,r7}
itt ne
addne r12,sp,#3*16 addne r12,sp,#3*16
bne .Loop_neon bne .Loop_neon
@ vldmia sp!,{d8-d15} @ vldmia sp!,{d8-d15}
ldmia sp!,{r4-r12,pc} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
.size sha1_block_data_order_neon,.-sha1_block_data_order_neon .size sha1_block_data_order_neon,.-sha1_block_data_order_neon
#endif #endif
#if __ARM_MAX_ARCH__>=7 #if __ARM_MAX_ARCH__>=7
# if defined(__thumb2__)
# define INST(a,b,c,d) .byte c,d|0xf,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d|0x10
# endif
.type sha1_block_data_order_armv8,%function .type sha1_block_data_order_armv8,%function
.align 5 .align 5
sha1_block_data_order_armv8: sha1_block_data_order_armv8:
.LARMv8: .LARMv8:
vstmdb sp!,{d8-d15} @ ABI specification says so vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
veor q1,q1,q1 veor q1,q1,q1
adr r3,.LK_00_19 adr r3,.LK_00_19
@ -1336,119 +1372,119 @@ sha1_block_data_order_armv8:
vld1.32 {d22[],d23[]},[r3,:32] vld1.32 {d22[],d23[]},[r3,:32]
.Loop_v8: .Loop_v8:
vld1.8 {q4-q5},[r1]! vld1.8 {q4,q5},[r1]!
vld1.8 {q6-q7},[r1]! vld1.8 {q6,q7},[r1]!
vrev32.8 q4,q4 vrev32.8 q4,q4
vrev32.8 q5,q5 vrev32.8 q5,q5
vadd.i32 q12,q8,q4 vadd.i32 q12,q8,q4
vrev32.8 q6,q6 vrev32.8 q6,q6
vmov q14,q0 @ offload vmov q14,q0 @ offload
subs r2,r2,#1 subs r2,r2,#1
vadd.i32 q13,q8,q5 vadd.i32 q13,q8,q5
vrev32.8 q7,q7 vrev32.8 q7,q7
.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 0 INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 0
.byte 0x68,0x0c,0x02,0xf2 @ sha1c q0,q1,q12 INST(0x68,0x0c,0x02,0xe2) @ sha1c q0,q1,q12
vadd.i32 q12,q8,q6 vadd.i32 q12,q8,q6
.byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 1 INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 1
.byte 0x6a,0x0c,0x06,0xf2 @ sha1c q0,q3,q13 INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13
vadd.i32 q13,q8,q7 vadd.i32 q13,q8,q7
.byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
.byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 2 INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 2
.byte 0x68,0x0c,0x04,0xf2 @ sha1c q0,q2,q12 INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12
vadd.i32 q12,q8,q4 vadd.i32 q12,q8,q4
.byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
.byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 3 INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 3
.byte 0x6a,0x0c,0x06,0xf2 @ sha1c q0,q3,q13 INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13
vadd.i32 q13,q9,q5 vadd.i32 q13,q9,q5
.byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
.byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 4 INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 4
.byte 0x68,0x0c,0x04,0xf2 @ sha1c q0,q2,q12 INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12
vadd.i32 q12,q9,q6 vadd.i32 q12,q9,q6
.byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
.byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 5 INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 5
.byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
vadd.i32 q13,q9,q7 vadd.i32 q13,q9,q7
.byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
.byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 6 INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 6
.byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
vadd.i32 q12,q9,q4 vadd.i32 q12,q9,q4
.byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
.byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 7 INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 7
.byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
vadd.i32 q13,q9,q5 vadd.i32 q13,q9,q5
.byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
.byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 8 INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 8
.byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
vadd.i32 q12,q10,q6 vadd.i32 q12,q10,q6
.byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
.byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 9 INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 9
.byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
vadd.i32 q13,q10,q7 vadd.i32 q13,q10,q7
.byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
.byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 10 INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 10
.byte 0x68,0x0c,0x24,0xf2 @ sha1m q0,q2,q12 INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12
vadd.i32 q12,q10,q4 vadd.i32 q12,q10,q4
.byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
.byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 11 INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 11
.byte 0x6a,0x0c,0x26,0xf2 @ sha1m q0,q3,q13 INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13
vadd.i32 q13,q10,q5 vadd.i32 q13,q10,q5
.byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
.byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 12 INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 12
.byte 0x68,0x0c,0x24,0xf2 @ sha1m q0,q2,q12 INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12
vadd.i32 q12,q10,q6 vadd.i32 q12,q10,q6
.byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
.byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 13 INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 13
.byte 0x6a,0x0c,0x26,0xf2 @ sha1m q0,q3,q13 INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13
vadd.i32 q13,q11,q7 vadd.i32 q13,q11,q7
.byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
.byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 14 INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 14
.byte 0x68,0x0c,0x24,0xf2 @ sha1m q0,q2,q12 INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12
vadd.i32 q12,q11,q4 vadd.i32 q12,q11,q4
.byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
.byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 15 INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 15
.byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
vadd.i32 q13,q11,q5 vadd.i32 q13,q11,q5
.byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
.byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 16 INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 16
.byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
vadd.i32 q12,q11,q6 vadd.i32 q12,q11,q6
.byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 17 INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 17
.byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
vadd.i32 q13,q11,q7 vadd.i32 q13,q11,q7
.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 18 INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 18
.byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 19 INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 19
.byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
vadd.i32 q1,q1,q2 vadd.i32 q1,q1,q2
vadd.i32 q0,q0,q14 vadd.i32 q0,q0,q14
bne .Loop_v8 bne .Loop_v8
vst1.32 {q0},[r0]! vst1.32 {q0},[r0]!
vst1.32 {d2[0]},[r0] vst1.32 {d2[0]},[r0]
vldmia sp!,{d8-d15} vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
bx lr @ bx lr bx lr @ bx lr
.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8 .size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
#endif #endif
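The scalar entry point above picks its code path at run time from the OPENSSL_armcap_P capability word (tst r12,#ARMV8_SHA1, then tst r12,#ARMV7_NEON). The C below is only a sketch of that dispatch: the flag values follow arm_arch.h but are stated here as assumptions, and the three leaf functions are hypothetical stand-ins for the .LARMv8, .LNEON and integer-only bodies.

#include <stddef.h>
#include <stdint.h>

/* Capability bits as assumed from arm_arch.h; treat the exact values as assumptions. */
#define ARMV7_NEON  (1u << 0)
#define ARMV8_SHA1  (1u << 3)

extern unsigned int OPENSSL_armcap_P;   /* filled in once at library start-up */

/* Hypothetical placeholders for the three code paths emitted above. */
extern void sha1_blocks_armv8(uint32_t state[5], const uint8_t *inp, size_t nblk);
extern void sha1_blocks_neon(uint32_t state[5], const uint8_t *inp, size_t nblk);
extern void sha1_blocks_scalar(uint32_t state[5], const uint8_t *inp, size_t nblk);

void sha1_dispatch(uint32_t state[5], const uint8_t *inp, size_t nblk)
{
    if (OPENSSL_armcap_P & ARMV8_SHA1)
        sha1_blocks_armv8(state, inp, nblk);    /* .LARMv8: SHA-1 instructions */
    else if (OPENSSL_armcap_P & ARMV7_NEON)
        sha1_blocks_neon(state, inp, nblk);     /* .LNEON: vectorised message schedule */
    else
        sha1_blocks_scalar(state, inp, nblk);   /* plain ARM/Thumb integer code */
}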
@ -1,5 +1,12 @@
/* $FreeBSD$ */ /* $FreeBSD$ */
/* Do not modify. This file is auto-generated from sha256-armv4.pl. */ /* Do not modify. This file is auto-generated from sha256-armv4.pl. */
@ Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
@
@ Licensed under the OpenSSL license (the "License"). You may not use
@ this file except in compliance with the License. You can obtain a copy
@ in the file LICENSE in the source distribution or at
@ https://www.openssl.org/source/license.html
@ ==================================================================== @ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ -46,15 +53,11 @@
#endif #endif
.text .text
#if __ARM_ARCH__<7 #if defined(__thumb2__)
.code 32 .syntax unified
#else
.syntax unified
# ifdef __thumb2__
.thumb .thumb
# else #else
.code 32 .code 32
# endif
#endif #endif
.type K256,%object .type K256,%object
@ -80,21 +83,25 @@ K256:
.word 0 @ terminator .word 0 @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap: .LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha256_block_data_order .word OPENSSL_armcap_P-.Lsha256_block_data_order
#endif #endif
.align 5 .align 5
.global sha256_block_data_order .globl sha256_block_data_order
.type sha256_block_data_order,%function .type sha256_block_data_order,%function
sha256_block_data_order: sha256_block_data_order:
#if __ARM_ARCH__<7 .Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
sub r3,pc,#8 @ sha256_block_data_order sub r3,pc,#8 @ sha256_block_data_order
#else #else
adr r3,. adr r3,.Lsha256_block_data_order
#endif #endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P ldr r12,[r3,r12] @ OPENSSL_armcap_P
#ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV8_SHA256 tst r12,#ARMV8_SHA256
bne .LARMv8 bne .LARMv8
tst r12,#ARMV7_NEON tst r12,#ARMV7_NEON
@ -121,7 +128,9 @@ sha256_block_data_order:
eor r0,r8,r8,ror#5 eor r0,r8,r8,ror#5
add r4,r4,r12 @ h+=Maj(a,b,c) from the past add r4,r4,r12 @ h+=Maj(a,b,c) from the past
eor r0,r0,r8,ror#19 @ Sigma1(e) eor r0,r0,r8,ror#19 @ Sigma1(e)
# ifndef __ARMEB__
rev r2,r2 rev r2,r2
# endif
#else #else
@ ldrb r2,[r1,#3] @ 0 @ ldrb r2,[r1,#3] @ 0
add r4,r4,r12 @ h+=Maj(a,b,c) from the past add r4,r4,r12 @ h+=Maj(a,b,c) from the past
@ -177,7 +186,9 @@ sha256_block_data_order:
eor r0,r7,r7,ror#5 eor r0,r7,r7,ror#5
add r11,r11,r3 @ h+=Maj(a,b,c) from the past add r11,r11,r3 @ h+=Maj(a,b,c) from the past
eor r0,r0,r7,ror#19 @ Sigma1(e) eor r0,r0,r7,ror#19 @ Sigma1(e)
# ifndef __ARMEB__
rev r2,r2 rev r2,r2
# endif
#else #else
@ ldrb r2,[r1,#3] @ 1 @ ldrb r2,[r1,#3] @ 1
add r11,r11,r3 @ h+=Maj(a,b,c) from the past add r11,r11,r3 @ h+=Maj(a,b,c) from the past
@ -233,7 +244,9 @@ sha256_block_data_order:
eor r0,r6,r6,ror#5 eor r0,r6,r6,ror#5
add r10,r10,r12 @ h+=Maj(a,b,c) from the past add r10,r10,r12 @ h+=Maj(a,b,c) from the past
eor r0,r0,r6,ror#19 @ Sigma1(e) eor r0,r0,r6,ror#19 @ Sigma1(e)
# ifndef __ARMEB__
rev r2,r2 rev r2,r2
# endif
#else #else
@ ldrb r2,[r1,#3] @ 2 @ ldrb r2,[r1,#3] @ 2
add r10,r10,r12 @ h+=Maj(a,b,c) from the past add r10,r10,r12 @ h+=Maj(a,b,c) from the past
@ -289,7 +302,9 @@ sha256_block_data_order:
eor r0,r5,r5,ror#5 eor r0,r5,r5,ror#5
add r9,r9,r3 @ h+=Maj(a,b,c) from the past add r9,r9,r3 @ h+=Maj(a,b,c) from the past
eor r0,r0,r5,ror#19 @ Sigma1(e) eor r0,r0,r5,ror#19 @ Sigma1(e)
# ifndef __ARMEB__
rev r2,r2 rev r2,r2
# endif
#else #else
@ ldrb r2,[r1,#3] @ 3 @ ldrb r2,[r1,#3] @ 3
add r9,r9,r3 @ h+=Maj(a,b,c) from the past add r9,r9,r3 @ h+=Maj(a,b,c) from the past
@ -345,7 +360,9 @@ sha256_block_data_order:
eor r0,r4,r4,ror#5 eor r0,r4,r4,ror#5
add r8,r8,r12 @ h+=Maj(a,b,c) from the past add r8,r8,r12 @ h+=Maj(a,b,c) from the past
eor r0,r0,r4,ror#19 @ Sigma1(e) eor r0,r0,r4,ror#19 @ Sigma1(e)
# ifndef __ARMEB__
rev r2,r2 rev r2,r2
# endif
#else #else
@ ldrb r2,[r1,#3] @ 4 @ ldrb r2,[r1,#3] @ 4
add r8,r8,r12 @ h+=Maj(a,b,c) from the past add r8,r8,r12 @ h+=Maj(a,b,c) from the past
@ -401,7 +418,9 @@ sha256_block_data_order:
eor r0,r11,r11,ror#5 eor r0,r11,r11,ror#5
add r7,r7,r3 @ h+=Maj(a,b,c) from the past add r7,r7,r3 @ h+=Maj(a,b,c) from the past
eor r0,r0,r11,ror#19 @ Sigma1(e) eor r0,r0,r11,ror#19 @ Sigma1(e)
# ifndef __ARMEB__
rev r2,r2 rev r2,r2
# endif
#else #else
@ ldrb r2,[r1,#3] @ 5 @ ldrb r2,[r1,#3] @ 5
add r7,r7,r3 @ h+=Maj(a,b,c) from the past add r7,r7,r3 @ h+=Maj(a,b,c) from the past
@ -457,7 +476,9 @@ sha256_block_data_order:
eor r0,r10,r10,ror#5 eor r0,r10,r10,ror#5
add r6,r6,r12 @ h+=Maj(a,b,c) from the past add r6,r6,r12 @ h+=Maj(a,b,c) from the past
eor r0,r0,r10,ror#19 @ Sigma1(e) eor r0,r0,r10,ror#19 @ Sigma1(e)
# ifndef __ARMEB__
rev r2,r2 rev r2,r2
# endif
#else #else
@ ldrb r2,[r1,#3] @ 6 @ ldrb r2,[r1,#3] @ 6
add r6,r6,r12 @ h+=Maj(a,b,c) from the past add r6,r6,r12 @ h+=Maj(a,b,c) from the past
@ -513,7 +534,9 @@ sha256_block_data_order:
eor r0,r9,r9,ror#5 eor r0,r9,r9,ror#5
add r5,r5,r3 @ h+=Maj(a,b,c) from the past add r5,r5,r3 @ h+=Maj(a,b,c) from the past
eor r0,r0,r9,ror#19 @ Sigma1(e) eor r0,r0,r9,ror#19 @ Sigma1(e)
# ifndef __ARMEB__
rev r2,r2 rev r2,r2
# endif
#else #else
@ ldrb r2,[r1,#3] @ 7 @ ldrb r2,[r1,#3] @ 7
add r5,r5,r3 @ h+=Maj(a,b,c) from the past add r5,r5,r3 @ h+=Maj(a,b,c) from the past
@ -569,7 +592,9 @@ sha256_block_data_order:
eor r0,r8,r8,ror#5 eor r0,r8,r8,ror#5
add r4,r4,r12 @ h+=Maj(a,b,c) from the past add r4,r4,r12 @ h+=Maj(a,b,c) from the past
eor r0,r0,r8,ror#19 @ Sigma1(e) eor r0,r0,r8,ror#19 @ Sigma1(e)
# ifndef __ARMEB__
rev r2,r2 rev r2,r2
# endif
#else #else
@ ldrb r2,[r1,#3] @ 8 @ ldrb r2,[r1,#3] @ 8
add r4,r4,r12 @ h+=Maj(a,b,c) from the past add r4,r4,r12 @ h+=Maj(a,b,c) from the past
@ -625,7 +650,9 @@ sha256_block_data_order:
eor r0,r7,r7,ror#5 eor r0,r7,r7,ror#5
add r11,r11,r3 @ h+=Maj(a,b,c) from the past add r11,r11,r3 @ h+=Maj(a,b,c) from the past
eor r0,r0,r7,ror#19 @ Sigma1(e) eor r0,r0,r7,ror#19 @ Sigma1(e)
# ifndef __ARMEB__
rev r2,r2 rev r2,r2
# endif
#else #else
@ ldrb r2,[r1,#3] @ 9 @ ldrb r2,[r1,#3] @ 9
add r11,r11,r3 @ h+=Maj(a,b,c) from the past add r11,r11,r3 @ h+=Maj(a,b,c) from the past
@ -681,7 +708,9 @@ sha256_block_data_order:
eor r0,r6,r6,ror#5 eor r0,r6,r6,ror#5
add r10,r10,r12 @ h+=Maj(a,b,c) from the past add r10,r10,r12 @ h+=Maj(a,b,c) from the past
eor r0,r0,r6,ror#19 @ Sigma1(e) eor r0,r0,r6,ror#19 @ Sigma1(e)
# ifndef __ARMEB__
rev r2,r2 rev r2,r2
# endif
#else #else
@ ldrb r2,[r1,#3] @ 10 @ ldrb r2,[r1,#3] @ 10
add r10,r10,r12 @ h+=Maj(a,b,c) from the past add r10,r10,r12 @ h+=Maj(a,b,c) from the past
@ -737,7 +766,9 @@ sha256_block_data_order:
eor r0,r5,r5,ror#5 eor r0,r5,r5,ror#5
add r9,r9,r3 @ h+=Maj(a,b,c) from the past add r9,r9,r3 @ h+=Maj(a,b,c) from the past
eor r0,r0,r5,ror#19 @ Sigma1(e) eor r0,r0,r5,ror#19 @ Sigma1(e)
# ifndef __ARMEB__
rev r2,r2 rev r2,r2
# endif
#else #else
@ ldrb r2,[r1,#3] @ 11 @ ldrb r2,[r1,#3] @ 11
add r9,r9,r3 @ h+=Maj(a,b,c) from the past add r9,r9,r3 @ h+=Maj(a,b,c) from the past
@ -793,7 +824,9 @@ sha256_block_data_order:
eor r0,r4,r4,ror#5 eor r0,r4,r4,ror#5
add r8,r8,r12 @ h+=Maj(a,b,c) from the past add r8,r8,r12 @ h+=Maj(a,b,c) from the past
eor r0,r0,r4,ror#19 @ Sigma1(e) eor r0,r0,r4,ror#19 @ Sigma1(e)
# ifndef __ARMEB__
rev r2,r2 rev r2,r2
# endif
#else #else
@ ldrb r2,[r1,#3] @ 12 @ ldrb r2,[r1,#3] @ 12
add r8,r8,r12 @ h+=Maj(a,b,c) from the past add r8,r8,r12 @ h+=Maj(a,b,c) from the past
@ -849,7 +882,9 @@ sha256_block_data_order:
eor r0,r11,r11,ror#5 eor r0,r11,r11,ror#5
add r7,r7,r3 @ h+=Maj(a,b,c) from the past add r7,r7,r3 @ h+=Maj(a,b,c) from the past
eor r0,r0,r11,ror#19 @ Sigma1(e) eor r0,r0,r11,ror#19 @ Sigma1(e)
# ifndef __ARMEB__
rev r2,r2 rev r2,r2
# endif
#else #else
@ ldrb r2,[r1,#3] @ 13 @ ldrb r2,[r1,#3] @ 13
add r7,r7,r3 @ h+=Maj(a,b,c) from the past add r7,r7,r3 @ h+=Maj(a,b,c) from the past
@ -905,7 +940,9 @@ sha256_block_data_order:
eor r0,r10,r10,ror#5 eor r0,r10,r10,ror#5
add r6,r6,r12 @ h+=Maj(a,b,c) from the past add r6,r6,r12 @ h+=Maj(a,b,c) from the past
eor r0,r0,r10,ror#19 @ Sigma1(e) eor r0,r0,r10,ror#19 @ Sigma1(e)
# ifndef __ARMEB__
rev r2,r2 rev r2,r2
# endif
#else #else
@ ldrb r2,[r1,#3] @ 14 @ ldrb r2,[r1,#3] @ 14
add r6,r6,r12 @ h+=Maj(a,b,c) from the past add r6,r6,r12 @ h+=Maj(a,b,c) from the past
@ -961,7 +998,9 @@ sha256_block_data_order:
eor r0,r9,r9,ror#5 eor r0,r9,r9,ror#5
add r5,r5,r3 @ h+=Maj(a,b,c) from the past add r5,r5,r3 @ h+=Maj(a,b,c) from the past
eor r0,r0,r9,ror#19 @ Sigma1(e) eor r0,r0,r9,ror#19 @ Sigma1(e)
# ifndef __ARMEB__
rev r2,r2 rev r2,r2
# endif
#else #else
@ ldrb r2,[r1,#3] @ 15 @ ldrb r2,[r1,#3] @ 15
add r5,r5,r3 @ h+=Maj(a,b,c) from the past add r5,r5,r3 @ h+=Maj(a,b,c) from the past
@ -1794,7 +1833,7 @@ sha256_block_data_order:
eor r12,r12,r6 @ Maj(a,b,c) eor r12,r12,r6 @ Maj(a,b,c)
add r4,r4,r0,ror#2 @ h+=Sigma0(a) add r4,r4,r0,ror#2 @ h+=Sigma0(a)
@ add r4,r4,r12 @ h+=Maj(a,b,c) @ add r4,r4,r12 @ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7 #ifdef __thumb2__
ite eq @ Thumb2 thing, sanity check in ARM ite eq @ Thumb2 thing, sanity check in ARM
#endif #endif
ldreq r3,[sp,#16*4] @ pull ctx ldreq r3,[sp,#16*4] @ pull ctx
@ -1826,24 +1865,25 @@ sha256_block_data_order:
add sp,sp,#19*4 @ destroy frame add sp,sp,#19*4 @ destroy frame
#if __ARM_ARCH__>=5 #if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else #else
ldmia sp!,{r4-r11,lr} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-) .word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif #endif
.size sha256_block_data_order,.-sha256_block_data_order .size sha256_block_data_order,.-sha256_block_data_order
#if __ARM_MAX_ARCH__>=7 #if __ARM_MAX_ARCH__>=7
.arch armv7-a .arch armv7-a
.fpu neon .fpu neon
.global sha256_block_data_order_neon .globl sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function .type sha256_block_data_order_neon,%function
.align 4 .align 5
.skip 16
sha256_block_data_order_neon: sha256_block_data_order_neon:
.LNEON: .LNEON:
stmdb sp!,{r4-r12,lr} stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
sub r11,sp,#16*4+16 sub r11,sp,#16*4+16
adr r14,K256 adr r14,K256
@ -1852,38 +1892,38 @@ sha256_block_data_order_neon:
mov sp,r11 @ alloca mov sp,r11 @ alloca
add r2,r1,r2,lsl#6 @ len to point at the end of inp add r2,r1,r2,lsl#6 @ len to point at the end of inp
vld1.8 {q0},[r1]! vld1.8 {q0},[r1]!
vld1.8 {q1},[r1]! vld1.8 {q1},[r1]!
vld1.8 {q2},[r1]! vld1.8 {q2},[r1]!
vld1.8 {q3},[r1]! vld1.8 {q3},[r1]!
vld1.32 {q8},[r14,:128]! vld1.32 {q8},[r14,:128]!
vld1.32 {q9},[r14,:128]! vld1.32 {q9},[r14,:128]!
vld1.32 {q10},[r14,:128]! vld1.32 {q10},[r14,:128]!
vld1.32 {q11},[r14,:128]! vld1.32 {q11},[r14,:128]!
vrev32.8 q0,q0 @ yes, even on vrev32.8 q0,q0 @ yes, even on
str r0,[sp,#64] str r0,[sp,#64]
vrev32.8 q1,q1 @ big-endian vrev32.8 q1,q1 @ big-endian
str r1,[sp,#68] str r1,[sp,#68]
mov r1,sp mov r1,sp
vrev32.8 q2,q2 vrev32.8 q2,q2
str r2,[sp,#72] str r2,[sp,#72]
vrev32.8 q3,q3 vrev32.8 q3,q3
str r12,[sp,#76] @ save original sp str r12,[sp,#76] @ save original sp
vadd.i32 q8,q8,q0 vadd.i32 q8,q8,q0
vadd.i32 q9,q9,q1 vadd.i32 q9,q9,q1
vst1.32 {q8},[r1,:128]! vst1.32 {q8},[r1,:128]!
vadd.i32 q10,q10,q2 vadd.i32 q10,q10,q2
vst1.32 {q9},[r1,:128]! vst1.32 {q9},[r1,:128]!
vadd.i32 q11,q11,q3 vadd.i32 q11,q11,q3
vst1.32 {q10},[r1,:128]! vst1.32 {q10},[r1,:128]!
vst1.32 {q11},[r1,:128]! vst1.32 {q11},[r1,:128]!
ldmia r0,{r4-r11} ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
sub r1,r1,#64 sub r1,r1,#64
ldr r2,[sp,#0] ldr r2,[sp,#0]
eor r12,r12,r12 eor r12,r12,r12
eor r3,r5,r6 eor r3,r5,r6
b .L_00_48 b .L_00_48
.align 4 .align 4
.L_00_48: .L_00_48:
@ -2284,19 +2324,19 @@ sha256_block_data_order_neon:
sub r1,r1,#64 sub r1,r1,#64
bne .L_00_48 bne .L_00_48
ldr r1,[sp,#68] ldr r1,[sp,#68]
ldr r0,[sp,#72] ldr r0,[sp,#72]
sub r14,r14,#256 @ rewind r14 sub r14,r14,#256 @ rewind r14
teq r1,r0 teq r1,r0
it eq it eq
subeq r1,r1,#64 @ avoid SEGV subeq r1,r1,#64 @ avoid SEGV
vld1.8 {q0},[r1]! @ load next input block vld1.8 {q0},[r1]! @ load next input block
vld1.8 {q1},[r1]! vld1.8 {q1},[r1]!
vld1.8 {q2},[r1]! vld1.8 {q2},[r1]!
vld1.8 {q3},[r1]! vld1.8 {q3},[r1]!
it ne it ne
strne r1,[sp,#68] strne r1,[sp,#68]
mov r1,sp mov r1,sp
add r11,r11,r2 add r11,r11,r2
eor r2,r9,r10 eor r2,r9,r10
eor r0,r8,r8,ror#5 eor r0,r8,r8,ror#5
@ -2606,7 +2646,7 @@ sha256_block_data_order_neon:
str r6,[r2],#4 str r6,[r2],#4
add r11,r11,r1 add r11,r11,r1
str r7,[r2],#4 str r7,[r2],#4
stmia r2,{r8-r11} stmia r2,{r8,r9,r10,r11}
ittte ne ittte ne
movne r1,sp movne r1,sp
@ -2617,12 +2657,12 @@ sha256_block_data_order_neon:
eorne r3,r5,r6 eorne r3,r5,r6
bne .L_00_48 bne .L_00_48
ldmia sp!,{r4-r12,pc} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon .size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif #endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# ifdef __thumb2__ # if defined(__thumb2__)
# define INST(a,b,c,d) .byte c,d|0xc,a,b # define INST(a,b,c,d) .byte c,d|0xc,a,b
# else # else
# define INST(a,b,c,d) .byte a,b,c,d # define INST(a,b,c,d) .byte a,b,c,d
@ -2633,145 +2673,143 @@ sha256_block_data_order_neon:
sha256_block_data_order_armv8: sha256_block_data_order_armv8:
.LARMv8: .LARMv8:
vld1.32 {q0,q1},[r0] vld1.32 {q0,q1},[r0]
# ifdef __thumb2__
adr r3,.LARMv8
sub r3,r3,#.LARMv8-K256
# else
sub r3,r3,#256+32 sub r3,r3,#256+32
# endif
add r2,r1,r2,lsl#6 @ len to point at the end of inp add r2,r1,r2,lsl#6 @ len to point at the end of inp
b .Loop_v8
.align 4
.Loop_v8: .Loop_v8:
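@ one 64-byte block per iteration: q8-q11 carry the message words, and the
@ INST() byte sequences below emit the ARMv8 sha256h/sha256h2/sha256su0/
@ sha256su1 opcodes directly (mnemonics given in the trailing @ comments)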
vld1.8 {q8-q9},[r1]! vld1.8 {q8,q9},[r1]!
vld1.8 {q10-q11},[r1]! vld1.8 {q10,q11},[r1]!
vld1.32 {q12},[r3]! vld1.32 {q12},[r3]!
vrev32.8 q8,q8 vrev32.8 q8,q8
vrev32.8 q9,q9 vrev32.8 q9,q9
vrev32.8 q10,q10 vrev32.8 q10,q10
vrev32.8 q11,q11 vrev32.8 q11,q11
vmov q14,q0 @ offload vmov q14,q0 @ offload
vmov q15,q1 vmov q15,q1
teq r1,r2 teq r1,r2
vld1.32 {q13},[r3]! vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8 vadd.i32 q12,q12,q8
INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
vmov q2,q0 vmov q2,q0
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
vld1.32 {q12},[r3]! vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9 vadd.i32 q13,q13,q9
INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
vmov q2,q0 vmov q2,q0
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
vld1.32 {q13},[r3]! vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q10 vadd.i32 q12,q12,q10
INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
vmov q2,q0 vmov q2,q0
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
vld1.32 {q12},[r3]! vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q11 vadd.i32 q13,q13,q11
INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
vmov q2,q0 vmov q2,q0
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
vld1.32 {q13},[r3]! vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8 vadd.i32 q12,q12,q8
INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
vmov q2,q0 vmov q2,q0
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
vld1.32 {q12},[r3]! vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9 vadd.i32 q13,q13,q9
INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
vmov q2,q0 vmov q2,q0
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
vld1.32 {q13},[r3]! vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q10 vadd.i32 q12,q12,q10
INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
vmov q2,q0 vmov q2,q0
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
vld1.32 {q12},[r3]! vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q11 vadd.i32 q13,q13,q11
INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
vmov q2,q0 vmov q2,q0
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
vld1.32 {q13},[r3]! vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8 vadd.i32 q12,q12,q8
INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
vmov q2,q0 vmov q2,q0
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
vld1.32 {q12},[r3]! vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9 vadd.i32 q13,q13,q9
INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
vmov q2,q0 vmov q2,q0
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
vld1.32 {q13},[r3]! vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q10 vadd.i32 q12,q12,q10
INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
vmov q2,q0 vmov q2,q0
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
vld1.32 {q12},[r3]! vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q11 vadd.i32 q13,q13,q11
INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
vmov q2,q0 vmov q2,q0
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
vld1.32 {q13},[r3]! vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8 vadd.i32 q12,q12,q8
vmov q2,q0 vmov q2,q0
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
vld1.32 {q12},[r3]! vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9 vadd.i32 q13,q13,q9
vmov q2,q0 vmov q2,q0
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
vld1.32 {q13},[r3] vld1.32 {q13},[r3]
vadd.i32 q12,q12,q10 vadd.i32 q12,q12,q10
sub r3,r3,#256-16 @ rewind sub r3,r3,#256-16 @ rewind
vmov q2,q0 vmov q2,q0
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
vadd.i32 q13,q13,q11 vadd.i32 q13,q13,q11
vmov q2,q0 vmov q2,q0
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
vadd.i32 q0,q0,q14 vadd.i32 q0,q0,q14
vadd.i32 q1,q1,q15 vadd.i32 q1,q1,q15
it ne it ne
bne .Loop_v8 bne .Loop_v8
vst1.32 {q0,q1},[r0] vst1.32 {q0,q1},[r0]
bx lr @ bx lr bx lr @ bx lr
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 .size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif #endif
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>" .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2 .align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4 .comm OPENSSL_armcap_P,4,4
#endif #endif

File diff suppressed because it is too large