Regen X86 assembly files after r364822.

This commit is contained in:
Jung-uk Kim 2020-08-26 16:56:44 +00:00
parent 63c1bb5162
commit 3971092e11
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=364823
22 changed files with 44046 additions and 93 deletions

View File

@ -2,20 +2,790 @@
/* Do not modify. This file is auto-generated from aesni-gcm-x86_64.pl. */
.text
# NOTE(review): the aesni_gcm_encrypt label just below falls straight through
# into _aesni_ctr32_ghash_6x, and a matching ".size aesni_gcm_encrypt" appears
# next to this function's own .size at the end.  This looks like an artifact of
# the diff rendering (old stub lines interleaved with the regenerated code) --
# verify against the actual generated aesni-gcm-x86_64.S.
.globl aesni_gcm_encrypt
.type aesni_gcm_encrypt,@function
aesni_gcm_encrypt:
# _aesni_ctr32_ghash_6x -- inner loop shared by the GCM entry points.  Each
# iteration encrypts 6 counter blocks with AES-NI while folding 6 previously
# buffered blocks into the GHASH state with vpclmulqdq ("stitched" mode).
# Apparent register roles (inferred from the code below -- confirm against
# aesni-gcm-x86_64.pl): %rcx = key schedule (round keys at offset-128),
# %ebp = AES round count, %rdx = blocks remaining, %rdi = in, %rsi = out,
# %r8 = counter block in memory, %r9 = Htable (hash key powers),
# %r11 = constant pool (.Lbswap_mask area), %r14 = GHASH input pointer,
# %xmm1 = current counter, %xmm8 = GHASH accumulator Xi.
.type _aesni_ctr32_ghash_6x,@function
.align 32
_aesni_ctr32_ghash_6x:
.cfi_startproc
xorl %eax,%eax
vmovdqu 32(%r11),%xmm2
subq $6,%rdx
vpxor %xmm4,%xmm4,%xmm4
vmovdqu 0-128(%rcx),%xmm15
# Derive counters +1..+5 from %xmm1 by byte adds (fast path, no carry).
vpaddb %xmm2,%xmm1,%xmm10
vpaddb %xmm2,%xmm10,%xmm11
vpaddb %xmm2,%xmm11,%xmm12
vpaddb %xmm2,%xmm12,%xmm13
vpaddb %xmm2,%xmm13,%xmm14
vpxor %xmm15,%xmm1,%xmm9
vmovdqu %xmm4,16+8(%rsp)
jmp .Loop6x

.align 32
# Main stitched loop: one iteration = 6 AES-CTR blocks + 6 GHASH folds.
.Loop6x:
# 100663296 = 0x06000000: add 6 to the byte-reversed low counter word; a
# carry means the 32-bit counter wraps inside this batch -> slow path.
addl $100663296,%ebx
jc .Lhandle_ctr32
vmovdqu 0-32(%r9),%xmm3
vpaddb %xmm2,%xmm14,%xmm1
vpxor %xmm15,%xmm10,%xmm10
vpxor %xmm15,%xmm11,%xmm11
.Lresume_ctr32:
# Store the next counter block back for the caller; start AES round 1 and
# the pclmul folds of the stacked hash inputs in parallel.
vmovdqu %xmm1,(%r8)
vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
vpxor %xmm15,%xmm12,%xmm12
vmovups 16-128(%rcx),%xmm2
vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
# cmpq/setnc/negq/andq build %r12 = 0x60 while %r15 >= %r14 (more GHASH
# input ahead), else 0; leaq below advances %r14 conditionally so the last
# block keeps being re-read instead of running off the buffer -- presumably;
# confirm against aesni-gcm-x86_64.pl.
xorq %r12,%r12
cmpq %r14,%r15
vaesenc %xmm2,%xmm9,%xmm9
vmovdqu 48+8(%rsp),%xmm0
vpxor %xmm15,%xmm13,%xmm13
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
vaesenc %xmm2,%xmm10,%xmm10
vpxor %xmm15,%xmm14,%xmm14
setnc %r12b
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
vaesenc %xmm2,%xmm11,%xmm11
vmovdqu 16-32(%r9),%xmm3
negq %r12
vaesenc %xmm2,%xmm12,%xmm12
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
vpxor %xmm4,%xmm8,%xmm8
vaesenc %xmm2,%xmm13,%xmm13
vpxor %xmm5,%xmm1,%xmm4
andq $0x60,%r12
vmovups 32-128(%rcx),%xmm15
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
vaesenc %xmm2,%xmm14,%xmm14
vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
leaq (%r14,%r12,1),%r14
vaesenc %xmm15,%xmm9,%xmm9
vpxor 16+8(%rsp),%xmm8,%xmm8
vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
vmovdqu 64+8(%rsp),%xmm0
vaesenc %xmm15,%xmm10,%xmm10
# movbe: load hash-input words byte-swapped and stash them on the stack as
# the next iteration's GHASH operands.
movbeq 88(%r14),%r13
vaesenc %xmm15,%xmm11,%xmm11
movbeq 80(%r14),%r12
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,32+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,40+8(%rsp)
vmovdqu 48-32(%r9),%xmm5
vaesenc %xmm15,%xmm14,%xmm14
vmovups 48-128(%rcx),%xmm15
vpxor %xmm1,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm2,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
vaesenc %xmm15,%xmm10,%xmm10
vpxor %xmm3,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
vaesenc %xmm15,%xmm11,%xmm11
vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
vmovdqu 80+8(%rsp),%xmm0
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vpxor %xmm1,%xmm4,%xmm4
vmovdqu 64-32(%r9),%xmm1
vaesenc %xmm15,%xmm14,%xmm14
vmovups 64-128(%rcx),%xmm15
vpxor %xmm2,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm3,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
vaesenc %xmm15,%xmm10,%xmm10
movbeq 72(%r14),%r13
vpxor %xmm5,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
vaesenc %xmm15,%xmm11,%xmm11
movbeq 64(%r14),%r12
vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
vmovdqu 96+8(%rsp),%xmm0
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,48+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,56+8(%rsp)
vpxor %xmm2,%xmm4,%xmm4
vmovdqu 96-32(%r9),%xmm2
vaesenc %xmm15,%xmm14,%xmm14
vmovups 80-128(%rcx),%xmm15
vpxor %xmm3,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
vaesenc %xmm15,%xmm10,%xmm10
movbeq 56(%r14),%r13
vpxor %xmm1,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
vpxor 112+8(%rsp),%xmm8,%xmm8
vaesenc %xmm15,%xmm11,%xmm11
movbeq 48(%r14),%r12
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,64+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,72+8(%rsp)
vpxor %xmm3,%xmm4,%xmm4
vmovdqu 112-32(%r9),%xmm3
vaesenc %xmm15,%xmm14,%xmm14
vmovups 96-128(%rcx),%xmm15
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm1,%xmm6,%xmm6
vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
vaesenc %xmm15,%xmm10,%xmm10
movbeq 40(%r14),%r13
vpxor %xmm2,%xmm7,%xmm7
vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
vaesenc %xmm15,%xmm11,%xmm11
movbeq 32(%r14),%r12
vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,80+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,88+8(%rsp)
vpxor %xmm5,%xmm6,%xmm6
vaesenc %xmm15,%xmm14,%xmm14
vpxor %xmm1,%xmm6,%xmm6
vmovups 112-128(%rcx),%xmm15
# Begin the GHASH reduction: split the middle product (xmm6) across the
# low/high halves and do the first fold with the poly constant at 16(%r11).
vpslldq $8,%xmm6,%xmm5
vpxor %xmm2,%xmm4,%xmm4
vmovdqu 16(%r11),%xmm3
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm8,%xmm7,%xmm7
vaesenc %xmm15,%xmm10,%xmm10
vpxor %xmm5,%xmm4,%xmm4
movbeq 24(%r14),%r13
vaesenc %xmm15,%xmm11,%xmm11
movbeq 16(%r14),%r12
vpalignr $8,%xmm4,%xmm4,%xmm0
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
movq %r13,96+8(%rsp)
vaesenc %xmm15,%xmm12,%xmm12
movq %r12,104+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
vmovups 128-128(%rcx),%xmm1
vaesenc %xmm15,%xmm14,%xmm14
vaesenc %xmm1,%xmm9,%xmm9
vmovups 144-128(%rcx),%xmm15
vaesenc %xmm1,%xmm10,%xmm10
vpsrldq $8,%xmm6,%xmm6
vaesenc %xmm1,%xmm11,%xmm11
vpxor %xmm6,%xmm7,%xmm7
vaesenc %xmm1,%xmm12,%xmm12
vpxor %xmm0,%xmm4,%xmm4
movbeq 8(%r14),%r13
vaesenc %xmm1,%xmm13,%xmm13
movbeq 0(%r14),%r12
vaesenc %xmm1,%xmm14,%xmm14
vmovups 160-128(%rcx),%xmm1
# 10 rounds issued: AES-128 goes straight to the tail, AES-192/256 need the
# extra round pairs below (%ebp = total round count).
cmpl $11,%ebp
jb .Lenc_tail
vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
vaesenc %xmm15,%xmm11,%xmm11
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vaesenc %xmm15,%xmm14,%xmm14
vaesenc %xmm1,%xmm9,%xmm9
vaesenc %xmm1,%xmm10,%xmm10
vaesenc %xmm1,%xmm11,%xmm11
vaesenc %xmm1,%xmm12,%xmm12
vaesenc %xmm1,%xmm13,%xmm13
vmovups 176-128(%rcx),%xmm15
vaesenc %xmm1,%xmm14,%xmm14
vmovups 192-128(%rcx),%xmm1
je .Lenc_tail
vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
vaesenc %xmm15,%xmm11,%xmm11
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vaesenc %xmm15,%xmm14,%xmm14
vaesenc %xmm1,%xmm9,%xmm9
vaesenc %xmm1,%xmm10,%xmm10
vaesenc %xmm1,%xmm11,%xmm11
vaesenc %xmm1,%xmm12,%xmm12
vaesenc %xmm1,%xmm13,%xmm13
vmovups 208-128(%rcx),%xmm15
vaesenc %xmm1,%xmm14,%xmm14
vmovups 224-128(%rcx),%xmm1
jmp .Lenc_tail

.align 32
# Slow path: the low counter word overflowed.  Redo the 6 increments in
# native (byte-swapped) order with 32-bit vpaddd, then swap back.
.Lhandle_ctr32:
vmovdqu (%r11),%xmm0
vpshufb %xmm0,%xmm1,%xmm6
vmovdqu 48(%r11),%xmm5
vpaddd 64(%r11),%xmm6,%xmm10
vpaddd %xmm5,%xmm6,%xmm11
vmovdqu 0-32(%r9),%xmm3
vpaddd %xmm5,%xmm10,%xmm12
vpshufb %xmm0,%xmm10,%xmm10
vpaddd %xmm5,%xmm11,%xmm13
vpshufb %xmm0,%xmm11,%xmm11
vpxor %xmm15,%xmm10,%xmm10
vpaddd %xmm5,%xmm12,%xmm14
vpshufb %xmm0,%xmm12,%xmm12
vpxor %xmm15,%xmm11,%xmm11
vpaddd %xmm5,%xmm13,%xmm1
vpshufb %xmm0,%xmm13,%xmm13
vpshufb %xmm0,%xmm14,%xmm14
vpshufb %xmm0,%xmm1,%xmm1
jmp .Lresume_ctr32

.align 32
# Last AES round for the 6 blocks: vaesenclast merges the final round-key
# XOR with the input XOR, and the next 6 counter blocks are prepared so the
# loop can continue immediately.
.Lenc_tail:
vaesenc %xmm15,%xmm9,%xmm9
vmovdqu %xmm7,16+8(%rsp)
vpalignr $8,%xmm4,%xmm4,%xmm8
vaesenc %xmm15,%xmm10,%xmm10
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
vpxor 0(%rdi),%xmm1,%xmm2
vaesenc %xmm15,%xmm11,%xmm11
vpxor 16(%rdi),%xmm1,%xmm0
vaesenc %xmm15,%xmm12,%xmm12
vpxor 32(%rdi),%xmm1,%xmm5
vaesenc %xmm15,%xmm13,%xmm13
vpxor 48(%rdi),%xmm1,%xmm6
vaesenc %xmm15,%xmm14,%xmm14
vpxor 64(%rdi),%xmm1,%xmm7
vpxor 80(%rdi),%xmm1,%xmm3
vmovdqu (%r8),%xmm1
vaesenclast %xmm2,%xmm9,%xmm9
vmovdqu 32(%r11),%xmm2
vaesenclast %xmm0,%xmm10,%xmm10
vpaddb %xmm2,%xmm1,%xmm0
movq %r13,112+8(%rsp)
leaq 96(%rdi),%rdi
vaesenclast %xmm5,%xmm11,%xmm11
vpaddb %xmm2,%xmm0,%xmm5
movq %r12,120+8(%rsp)
leaq 96(%rsi),%rsi
vmovdqu 0-128(%rcx),%xmm15
vaesenclast %xmm6,%xmm12,%xmm12
vpaddb %xmm2,%xmm5,%xmm6
vaesenclast %xmm7,%xmm13,%xmm13
vpaddb %xmm2,%xmm6,%xmm7
vaesenclast %xmm3,%xmm14,%xmm14
vpaddb %xmm2,%xmm7,%xmm3
addq $0x60,%r10
subq $0x6,%rdx
jc .L6x_done
# Write the 6 output blocks and rotate the freshly incremented counters
# into the working registers for the next iteration.
vmovups %xmm9,-96(%rsi)
vpxor %xmm15,%xmm1,%xmm9
vmovups %xmm10,-80(%rsi)
vmovdqa %xmm0,%xmm10
vmovups %xmm11,-64(%rsi)
vmovdqa %xmm5,%xmm11
vmovups %xmm12,-48(%rsi)
vmovdqa %xmm6,%xmm12
vmovups %xmm13,-32(%rsi)
vmovdqa %xmm7,%xmm13
vmovups %xmm14,-16(%rsi)
vmovdqa %xmm3,%xmm14
vmovdqu 32+8(%rsp),%xmm7
jmp .Loop6x
# Fold the pending hash words into %xmm8 (Xi) and return.
.L6x_done:
vpxor 16+8(%rsp),%xmm8,%xmm8
vpxor %xmm4,%xmm8,%xmm8
# rep ret (0xf3 0xc3)
.byte 0xf3,0xc3
.cfi_endproc
# NOTE(review): stale .size for aesni_gcm_encrypt -- see note at top.
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
# aesni_gcm_decrypt -- SysV AMD64 entry point.  Apparent signature (from the
# perlasm convention -- confirm): (in=%rdi, out=%rsi, len=%rdx, key=%rcx,
# iv=%r8, Xi=%r9).  Requires at least 0x60 (96) bytes, i.e. 6 AES blocks;
# otherwise returns 0 immediately.  Returns the number of bytes processed
# in %rax (accumulated in %r10 by the inner loop -- presumably; confirm).
.globl aesni_gcm_decrypt
.type aesni_gcm_decrypt,@function
.align 32
aesni_gcm_decrypt:
.cfi_startproc
xorl %eax,%eax
xorq %r10,%r10
cmpq $0x60,%rdx
jb .Lgcm_dec_abort
# Save callee-saved registers; %rax keeps the original %rsp for unwinding.
leaq (%rsp),%rax
.cfi_def_cfa_register %rax
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
vzeroupper
vmovdqu (%r8),%xmm1
addq $-128,%rsp
movl 12(%r8),%ebx
leaq .Lbswap_mask(%rip),%r11
leaq -128(%rcx),%r14
movq $0xf80,%r15
vmovdqu (%r9),%xmm8
andq $-128,%rsp
vmovdqu (%r11),%xmm0
leaq 128(%rcx),%rcx
leaq 32+32(%r9),%r9
movl 240-128(%rcx),%ebp
vpshufb %xmm0,%xmm8,%xmm8
# Key-schedule vs. stack anti-aliasing: if the key and the frame fall into
# the same cache-set window (< 768 bytes apart within a 0xf80 stride),
# lower %rsp by the difference to keep them apart.
andq %r15,%r14
andq %rsp,%r15
subq %r14,%r15
jc .Ldec_no_key_aliasing
cmpq $768,%r15
jnc .Ldec_no_key_aliasing
subq %r15,%rsp
.Ldec_no_key_aliasing:
# Preload and byte-swap the last 6 ciphertext blocks into the stack buffer:
# for decryption GHASH is computed over the ciphertext itself, so the
# stitched loop can hash ahead of the cipher.
vmovdqu 80(%rdi),%xmm7
leaq (%rdi),%r14
vmovdqu 64(%rdi),%xmm4
leaq -192(%rdi,%rdx,1),%r15
vmovdqu 48(%rdi),%xmm5
shrq $4,%rdx
xorq %r10,%r10
vmovdqu 32(%rdi),%xmm6
vpshufb %xmm0,%xmm7,%xmm7
vmovdqu 16(%rdi),%xmm2
vpshufb %xmm0,%xmm4,%xmm4
vmovdqu (%rdi),%xmm3
vpshufb %xmm0,%xmm5,%xmm5
vmovdqu %xmm4,48(%rsp)
vpshufb %xmm0,%xmm6,%xmm6
vmovdqu %xmm5,64(%rsp)
vpshufb %xmm0,%xmm2,%xmm2
vmovdqu %xmm6,80(%rsp)
vpshufb %xmm0,%xmm3,%xmm3
vmovdqu %xmm2,96(%rsp)
vmovdqu %xmm3,112(%rsp)
call _aesni_ctr32_ghash_6x
# Flush the final 6 plaintext blocks and write the updated hash value back
# (Xi lives at -64(%r9) because %r9 was advanced by 64 above).
vmovups %xmm9,-96(%rsi)
vmovups %xmm10,-80(%rsi)
vmovups %xmm11,-64(%rsi)
vmovups %xmm12,-48(%rsi)
vmovups %xmm13,-32(%rsi)
vmovups %xmm14,-16(%rsi)
vpshufb (%r11),%xmm8,%xmm8
vmovdqu %xmm8,-64(%r9)
vzeroupper
movq -48(%rax),%r15
.cfi_restore %r15
movq -40(%rax),%r14
.cfi_restore %r14
movq -32(%rax),%r13
.cfi_restore %r13
movq -24(%rax),%r12
.cfi_restore %r12
movq -16(%rax),%rbp
.cfi_restore %rbp
movq -8(%rax),%rbx
.cfi_restore %rbx
leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lgcm_dec_abort:
movq %r10,%rax
# rep ret (0xf3 0xc3)
.byte 0xf3,0xc3
.cfi_endproc
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
# _aesni_ctr32_6x -- encrypt 6 counter blocks and XOR them with 96 bytes of
# input at (%rdi), writing 96 bytes of output at (%rsi); both pointers are
# advanced by 96.  Used by aesni_gcm_encrypt to produce ciphertext before
# the stitched GHASH loop catches up.  Expects %rcx = key schedule (offset
# -128), %rbp = round count, %xmm1 = counter, %r11 = constant pool,
# %ebx = byte-reversed low counter word (as in _aesni_ctr32_ghash_6x).
.type _aesni_ctr32_6x,@function
.align 32
_aesni_ctr32_6x:
.cfi_startproc
vmovdqu 0-128(%rcx),%xmm4
vmovdqu 32(%r11),%xmm2
leaq -1(%rbp),%r13
vmovups 16-128(%rcx),%xmm15
leaq 32-128(%rcx),%r12
vpxor %xmm4,%xmm1,%xmm9
# 100663296 = 0x06000000: carry here means the 32-bit counter wraps inside
# this batch of 6 -> take the vpaddd slow path below.
addl $100663296,%ebx
jc .Lhandle_ctr32_2
vpaddb %xmm2,%xmm1,%xmm10
vpaddb %xmm2,%xmm10,%xmm11
vpxor %xmm4,%xmm10,%xmm10
vpaddb %xmm2,%xmm11,%xmm12
vpxor %xmm4,%xmm11,%xmm11
vpaddb %xmm2,%xmm12,%xmm13
vpxor %xmm4,%xmm12,%xmm12
vpaddb %xmm2,%xmm13,%xmm14
vpxor %xmm4,%xmm13,%xmm13
vpaddb %xmm2,%xmm14,%xmm1
vpxor %xmm4,%xmm14,%xmm14
jmp .Loop_ctr32

.align 16
# Middle AES rounds: one round key per iteration, %r13d = rounds-1.
.Loop_ctr32:
vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
vaesenc %xmm15,%xmm11,%xmm11
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vaesenc %xmm15,%xmm14,%xmm14
vmovups (%r12),%xmm15
leaq 16(%r12),%r12
decl %r13d
jnz .Loop_ctr32
# Final round: pre-XOR the last round key (%xmm3) with the input so
# vaesenclast yields ciphertext directly.
vmovdqu (%r12),%xmm3
vaesenc %xmm15,%xmm9,%xmm9
vpxor 0(%rdi),%xmm3,%xmm4
vaesenc %xmm15,%xmm10,%xmm10
vpxor 16(%rdi),%xmm3,%xmm5
vaesenc %xmm15,%xmm11,%xmm11
vpxor 32(%rdi),%xmm3,%xmm6
vaesenc %xmm15,%xmm12,%xmm12
vpxor 48(%rdi),%xmm3,%xmm8
vaesenc %xmm15,%xmm13,%xmm13
vpxor 64(%rdi),%xmm3,%xmm2
vaesenc %xmm15,%xmm14,%xmm14
vpxor 80(%rdi),%xmm3,%xmm3
leaq 96(%rdi),%rdi
vaesenclast %xmm4,%xmm9,%xmm9
vaesenclast %xmm5,%xmm10,%xmm10
vaesenclast %xmm6,%xmm11,%xmm11
vaesenclast %xmm8,%xmm12,%xmm12
vaesenclast %xmm2,%xmm13,%xmm13
vaesenclast %xmm3,%xmm14,%xmm14
vmovups %xmm9,0(%rsi)
vmovups %xmm10,16(%rsi)
vmovups %xmm11,32(%rsi)
vmovups %xmm12,48(%rsi)
vmovups %xmm13,64(%rsi)
vmovups %xmm14,80(%rsi)
leaq 96(%rsi),%rsi
# rep ret (0xf3 0xc3)
.byte 0xf3,0xc3
.align 32
# Counter-wrap slow path: increment in native (byte-swapped) order with
# 32-bit vpaddd, then byte-swap back and rejoin the round loop.
.Lhandle_ctr32_2:
vpshufb %xmm0,%xmm1,%xmm6
vmovdqu 48(%r11),%xmm5
vpaddd 64(%r11),%xmm6,%xmm10
vpaddd %xmm5,%xmm6,%xmm11
vpaddd %xmm5,%xmm10,%xmm12
vpshufb %xmm0,%xmm10,%xmm10
vpaddd %xmm5,%xmm11,%xmm13
vpshufb %xmm0,%xmm11,%xmm11
vpxor %xmm4,%xmm10,%xmm10
vpaddd %xmm5,%xmm12,%xmm14
vpshufb %xmm0,%xmm12,%xmm12
vpxor %xmm4,%xmm11,%xmm11
vpaddd %xmm5,%xmm13,%xmm1
vpshufb %xmm0,%xmm13,%xmm13
vpxor %xmm4,%xmm12,%xmm12
vpshufb %xmm0,%xmm14,%xmm14
vpxor %xmm4,%xmm13,%xmm13
vpshufb %xmm0,%xmm1,%xmm1
vpxor %xmm4,%xmm14,%xmm14
jmp .Loop_ctr32
.cfi_endproc
.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
# aesni_gcm_encrypt -- SysV AMD64 entry point.  Apparent signature (confirm
# against aesni-gcm-x86_64.pl): (in=%rdi, out=%rsi, len=%rdx, key=%rcx,
# iv=%r8, Xi=%r9).  Requires len >= 288 (18 blocks): 12 blocks are
# encrypted up front by two _aesni_ctr32_6x calls so that GHASH always lags
# the cipher, then _aesni_ctr32_ghash_6x handles the remainder, and the
# long tail below folds the final ciphertext blocks into Xi.  Returns the
# number of bytes processed in %rax (from %r10 -- presumably; confirm).
.globl aesni_gcm_encrypt
.type aesni_gcm_encrypt,@function
.align 32
aesni_gcm_encrypt:
.cfi_startproc
xorq %r10,%r10
cmpq $288,%rdx
jb .Lgcm_enc_abort
# Save callee-saved registers; %rax keeps the original %rsp.
leaq (%rsp),%rax
.cfi_def_cfa_register %rax
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
vzeroupper
vmovdqu (%r8),%xmm1
addq $-128,%rsp
movl 12(%r8),%ebx
leaq .Lbswap_mask(%rip),%r11
leaq -128(%rcx),%r14
movq $0xf80,%r15
leaq 128(%rcx),%rcx
vmovdqu (%r11),%xmm0
andq $-128,%rsp
movl 240-128(%rcx),%ebp
# Key-schedule vs. stack anti-aliasing, same scheme as aesni_gcm_decrypt.
andq %r15,%r14
andq %rsp,%r15
subq %r14,%r15
jc .Lenc_no_key_aliasing
cmpq $768,%r15
jnc .Lenc_no_key_aliasing
subq %r15,%rsp
.Lenc_no_key_aliasing:
leaq (%rsi),%r14
leaq -192(%rsi,%rdx,1),%r15
shrq $4,%rdx
# Encrypt blocks 1-6; the ciphertext (GHASH input for encryption) is
# byte-swapped onto the stack below for later hashing.
call _aesni_ctr32_6x
vpshufb %xmm0,%xmm9,%xmm8
vpshufb %xmm0,%xmm10,%xmm2
vmovdqu %xmm8,112(%rsp)
vpshufb %xmm0,%xmm11,%xmm4
vmovdqu %xmm2,96(%rsp)
vpshufb %xmm0,%xmm12,%xmm5
vmovdqu %xmm4,80(%rsp)
vpshufb %xmm0,%xmm13,%xmm6
vmovdqu %xmm5,64(%rsp)
vpshufb %xmm0,%xmm14,%xmm7
vmovdqu %xmm6,48(%rsp)
# Encrypt blocks 7-12 while the first six hashes are still pending.
call _aesni_ctr32_6x
vmovdqu (%r9),%xmm8
leaq 32+32(%r9),%r9
subq $12,%rdx
movq $192,%r10
vpshufb %xmm0,%xmm8,%xmm8
# Stitched main loop over the remaining blocks.
call _aesni_ctr32_ghash_6x
# GHASH tail: write out the final 6 ciphertext blocks, then fold them plus
# the buffered stack entries into Xi.  The vpclmulqdq chains below are
# schoolbook/Karatsuba accumulation across the hash key powers in (%r9),
# interleaved with the output stores.
vmovdqu 32(%rsp),%xmm7
vmovdqu (%r11),%xmm0
vmovdqu 0-32(%r9),%xmm3
vpunpckhqdq %xmm7,%xmm7,%xmm1
vmovdqu 32-32(%r9),%xmm15
vmovups %xmm9,-96(%rsi)
vpshufb %xmm0,%xmm9,%xmm9
vpxor %xmm7,%xmm1,%xmm1
vmovups %xmm10,-80(%rsi)
vpshufb %xmm0,%xmm10,%xmm10
vmovups %xmm11,-64(%rsi)
vpshufb %xmm0,%xmm11,%xmm11
vmovups %xmm12,-48(%rsi)
vpshufb %xmm0,%xmm12,%xmm12
vmovups %xmm13,-32(%rsi)
vpshufb %xmm0,%xmm13,%xmm13
vmovups %xmm14,-16(%rsi)
vpshufb %xmm0,%xmm14,%xmm14
vmovdqu %xmm9,16(%rsp)
vmovdqu 48(%rsp),%xmm6
vmovdqu 16-32(%r9),%xmm0
vpunpckhqdq %xmm6,%xmm6,%xmm2
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
vpxor %xmm6,%xmm2,%xmm2
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
vmovdqu 64(%rsp),%xmm9
vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
vmovdqu 48-32(%r9),%xmm3
vpxor %xmm5,%xmm4,%xmm4
vpunpckhqdq %xmm9,%xmm9,%xmm5
vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
vpxor %xmm9,%xmm5,%xmm5
vpxor %xmm7,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
vmovdqu 80-32(%r9),%xmm15
vpxor %xmm1,%xmm2,%xmm2
vmovdqu 80(%rsp),%xmm1
vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
vmovdqu 64-32(%r9),%xmm0
vpxor %xmm4,%xmm7,%xmm7
vpunpckhqdq %xmm1,%xmm1,%xmm4
vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
vpxor %xmm1,%xmm4,%xmm4
vpxor %xmm6,%xmm9,%xmm9
vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
vpxor %xmm2,%xmm5,%xmm5
vmovdqu 96(%rsp),%xmm2
vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
vmovdqu 96-32(%r9),%xmm3
vpxor %xmm7,%xmm6,%xmm6
vpunpckhqdq %xmm2,%xmm2,%xmm7
vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
vpxor %xmm2,%xmm7,%xmm7
vpxor %xmm9,%xmm1,%xmm1
vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
vmovdqu 128-32(%r9),%xmm15
vpxor %xmm5,%xmm4,%xmm4
vpxor 112(%rsp),%xmm8,%xmm8
vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
vmovdqu 112-32(%r9),%xmm0
vpunpckhqdq %xmm8,%xmm8,%xmm9
vpxor %xmm6,%xmm5,%xmm5
vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
vpxor %xmm8,%xmm9,%xmm9
vpxor %xmm1,%xmm2,%xmm2
vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
vpxor %xmm4,%xmm7,%xmm4
vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
vmovdqu 0-32(%r9),%xmm3
vpunpckhqdq %xmm14,%xmm14,%xmm1
vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
vpxor %xmm14,%xmm1,%xmm1
vpxor %xmm5,%xmm6,%xmm5
vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
vmovdqu 32-32(%r9),%xmm15
vpxor %xmm2,%xmm8,%xmm7
vpxor %xmm4,%xmm9,%xmm6
vmovdqu 16-32(%r9),%xmm0
vpxor %xmm5,%xmm7,%xmm9
vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
vpxor %xmm9,%xmm6,%xmm6
vpunpckhqdq %xmm13,%xmm13,%xmm2
vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
vpxor %xmm13,%xmm2,%xmm2
vpslldq $8,%xmm6,%xmm9
vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
vpxor %xmm9,%xmm5,%xmm8
vpsrldq $8,%xmm6,%xmm6
vpxor %xmm6,%xmm7,%xmm7
# Second accumulation pass: fold the last 6 (byte-swapped) ciphertext
# blocks (%xmm10..%xmm14 and %xmm8) against the remaining key powers,
# with the polynomial reductions (constant at 16(%r11)) interleaved.
vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
vmovdqu 48-32(%r9),%xmm3
vpxor %xmm4,%xmm5,%xmm5
vpunpckhqdq %xmm12,%xmm12,%xmm9
vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
vpxor %xmm12,%xmm9,%xmm9
vpxor %xmm14,%xmm13,%xmm13
vpalignr $8,%xmm8,%xmm8,%xmm14
vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
vmovdqu 80-32(%r9),%xmm15
vpxor %xmm1,%xmm2,%xmm2
vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
vmovdqu 64-32(%r9),%xmm0
vpxor %xmm5,%xmm4,%xmm4
vpunpckhqdq %xmm11,%xmm11,%xmm1
vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
vpxor %xmm11,%xmm1,%xmm1
vpxor %xmm13,%xmm12,%xmm12
vxorps 16(%rsp),%xmm7,%xmm7
vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
vpxor %xmm2,%xmm9,%xmm9
vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
vxorps %xmm14,%xmm8,%xmm8
vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
vmovdqu 96-32(%r9),%xmm3
vpxor %xmm4,%xmm5,%xmm5
vpunpckhqdq %xmm10,%xmm10,%xmm2
vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
vpxor %xmm10,%xmm2,%xmm2
vpalignr $8,%xmm8,%xmm8,%xmm14
vpxor %xmm12,%xmm11,%xmm11
vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
vmovdqu 128-32(%r9),%xmm15
vpxor %xmm9,%xmm1,%xmm1
vxorps %xmm7,%xmm14,%xmm14
vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
vxorps %xmm14,%xmm8,%xmm8
vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
vmovdqu 112-32(%r9),%xmm0
vpxor %xmm5,%xmm4,%xmm4
vpunpckhqdq %xmm8,%xmm8,%xmm9
vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
vpxor %xmm8,%xmm9,%xmm9
vpxor %xmm11,%xmm10,%xmm10
vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
vpxor %xmm1,%xmm2,%xmm2
vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
vpxor %xmm4,%xmm5,%xmm5
vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
vpxor %xmm10,%xmm7,%xmm7
vpxor %xmm2,%xmm6,%xmm6
# Final 256->128-bit reduction: two vpalignr/vpclmulqdq folds with .Lpoly.
vpxor %xmm5,%xmm7,%xmm4
vpxor %xmm4,%xmm6,%xmm6
vpslldq $8,%xmm6,%xmm1
vmovdqu 16(%r11),%xmm3
vpsrldq $8,%xmm6,%xmm6
vpxor %xmm1,%xmm5,%xmm8
vpxor %xmm6,%xmm7,%xmm7
vpalignr $8,%xmm8,%xmm8,%xmm2
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
vpxor %xmm2,%xmm8,%xmm8
vpalignr $8,%xmm8,%xmm8,%xmm2
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
vpxor %xmm7,%xmm2,%xmm2
vpxor %xmm2,%xmm8,%xmm8
# Byte-swap the final Xi and store it back (Xi is at -64(%r9) because %r9
# was advanced by 64 earlier).
vpshufb (%r11),%xmm8,%xmm8
vmovdqu %xmm8,-64(%r9)
vzeroupper
movq -48(%rax),%r15
.cfi_restore %r15
movq -40(%rax),%r14
.cfi_restore %r14
movq -32(%rax),%r13
.cfi_restore %r13
movq -24(%rax),%r12
.cfi_restore %r12
movq -16(%rax),%rbp
.cfi_restore %rbp
movq -8(%rax),%rbx
.cfi_restore %rbx
leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lgcm_enc_abort:
movq %r10,%rax
# rep ret (0xf3 0xc3)
.byte 0xf3,0xc3
.cfi_endproc
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
.align 64
# vpshufb mask reversing byte order within a 128-bit lane.
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
# GHASH reduction polynomial constant (0xc2 in the top byte).
.Lpoly:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
# Counter-increment constants: +1 in the most-significant byte (big-endian
# counter), +2 and +1 in the least-significant byte (byte-swapped form).
.Lone_msb:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
# ASCII banner: "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64

View File

@ -9,6 +9,14 @@
.align 32
aesni_multi_cbc_encrypt:
.cfi_startproc
cmpl $2,%edx
jb .Lenc_non_avx
movl OPENSSL_ia32cap_P+4(%rip),%ecx
testl $268435456,%ecx
jnz _avx_cbc_enc_shortcut
jmp .Lenc_non_avx
.align 16
.Lenc_non_avx:
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
@ -283,6 +291,14 @@ aesni_multi_cbc_encrypt:
.align 32
aesni_multi_cbc_decrypt:
.cfi_startproc
cmpl $2,%edx
jb .Ldec_non_avx
movl OPENSSL_ia32cap_P+4(%rip),%ecx
testl $268435456,%ecx
jnz _avx_cbc_dec_shortcut
jmp .Ldec_non_avx
.align 16
.Ldec_non_avx:
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
@ -542,3 +558,952 @@ aesni_multi_cbc_decrypt:
.byte 0xf3,0xc3
.cfi_endproc
.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
# aesni_multi_cbc_encrypt_avx (entered via _avx_cbc_enc_shortcut from
# aesni_multi_cbc_encrypt when AVX is available) -- multi-buffer AES-CBC
# encryption of up to 8 independent streams in lockstep.
# Apparent arguments (inferred from the offsets below -- confirm against
# aesni-mb-x86_64.pl): %rdi = array of per-stream descriptors with 40-byte
# stride {in ptr, out ptr, 32-bit block count, 16-byte IV}, %rsi = shared
# key schedule, %edx = stream count.  Finished/absent streams have their
# pointers clamped to %rsp (harmless dummy reads) so all 8 lanes always
# execute.
.type aesni_multi_cbc_encrypt_avx,@function
.align 32
aesni_multi_cbc_encrypt_avx:
.cfi_startproc
_avx_cbc_enc_shortcut:
# Prologue: save callee-saved registers, carve a 128-byte-aligned frame;
# the original %rsp is kept at 16(%rsp) for the epilogue.
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
subq $192,%rsp
andq $-128,%rsp
movq %rax,16(%rsp)
.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
.Lenc8x_body:
vzeroupper
vmovdqu (%rsi),%xmm15
leaq 120(%rsi),%rsi
leaq 160(%rdi),%rdi
# NOTE(review): %edx is overwritten immediately below; presumably the
# original stream count is consumed elsewhere in the generated file.
shrl $1,%edx
.Lenc8x_loop_grande:
# Per-stream setup (8 identical stanzas): load each stream's block count,
# input pointer and IV; %edx becomes the maximum count; inactive streams
# (count <= 0) get their pointer clamped to %rsp via cmovleq.  Counts are
# stored at 32..60(%rsp); the out-in pointer deltas at 64..120(%rsp).
xorl %edx,%edx
movl -144(%rdi),%ecx
movq -160(%rdi),%r8
cmpl %edx,%ecx
movq -152(%rdi),%rbx
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu -136(%rdi),%xmm2
movl %ecx,32(%rsp)
cmovleq %rsp,%r8
subq %r8,%rbx
movq %rbx,64(%rsp)
movl -104(%rdi),%ecx
movq -120(%rdi),%r9
cmpl %edx,%ecx
movq -112(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu -96(%rdi),%xmm3
movl %ecx,36(%rsp)
cmovleq %rsp,%r9
subq %r9,%rbp
movq %rbp,72(%rsp)
movl -64(%rdi),%ecx
movq -80(%rdi),%r10
cmpl %edx,%ecx
movq -72(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu -56(%rdi),%xmm4
movl %ecx,40(%rsp)
cmovleq %rsp,%r10
subq %r10,%rbp
movq %rbp,80(%rsp)
movl -24(%rdi),%ecx
movq -40(%rdi),%r11
cmpl %edx,%ecx
movq -32(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu -16(%rdi),%xmm5
movl %ecx,44(%rsp)
cmovleq %rsp,%r11
subq %r11,%rbp
movq %rbp,88(%rsp)
movl 16(%rdi),%ecx
movq 0(%rdi),%r12
cmpl %edx,%ecx
movq 8(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu 24(%rdi),%xmm6
movl %ecx,48(%rsp)
cmovleq %rsp,%r12
subq %r12,%rbp
movq %rbp,96(%rsp)
movl 56(%rdi),%ecx
movq 40(%rdi),%r13
cmpl %edx,%ecx
movq 48(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu 64(%rdi),%xmm7
movl %ecx,52(%rsp)
cmovleq %rsp,%r13
subq %r13,%rbp
movq %rbp,104(%rsp)
movl 96(%rdi),%ecx
movq 80(%rdi),%r14
cmpl %edx,%ecx
movq 88(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu 104(%rdi),%xmm8
movl %ecx,56(%rsp)
cmovleq %rsp,%r14
subq %r14,%rbp
movq %rbp,112(%rsp)
movl 136(%rdi),%ecx
movq 120(%rdi),%r15
cmpl %edx,%ecx
movq 128(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu 144(%rdi),%xmm9
movl %ecx,60(%rsp)
cmovleq %rsp,%r15
subq %r15,%rbp
movq %rbp,120(%rsp)
# All streams empty -> nothing to do.
testl %edx,%edx
jz .Lenc8x_done
# Load round keys 1-2; CBC input for each stream = IV XOR first plaintext
# block (whitened with round key 0 in %xmm15).
vmovups 16-120(%rsi),%xmm1
vmovups 32-120(%rsi),%xmm0
movl 240-120(%rsi),%eax
vpxor (%r8),%xmm15,%xmm10
leaq 128(%rsp),%rbp
vpxor (%r9),%xmm15,%xmm11
vpxor (%r10),%xmm15,%xmm12
vpxor (%r11),%xmm15,%xmm13
vpxor %xmm10,%xmm2,%xmm2
vpxor (%r12),%xmm15,%xmm10
vpxor %xmm11,%xmm3,%xmm3
vpxor (%r13),%xmm15,%xmm11
vpxor %xmm12,%xmm4,%xmm4
vpxor (%r14),%xmm15,%xmm12
vpxor %xmm13,%xmm5,%xmm5
vpxor (%r15),%xmm15,%xmm13
vpxor %xmm10,%xmm6,%xmm6
movl $1,%ecx
vpxor %xmm11,%xmm7,%xmm7
vpxor %xmm12,%xmm8,%xmm8
vpxor %xmm13,%xmm9,%xmm9
jmp .Loop_enc8x

.align 32
# Main loop: one AES round for all 8 streams per round key (alternating
# %xmm1/%xmm0), interleaved per stream with: comparing %ecx (blocks done)
# against the per-stream count, clamping finished streams to %rsp,
# prefetching, loading the next whitened input block, and advancing the
# input pointer.
.Loop_enc8x:
vaesenc %xmm1,%xmm2,%xmm2
cmpl 32+0(%rsp),%ecx
vaesenc %xmm1,%xmm3,%xmm3
prefetcht0 31(%r8)
vaesenc %xmm1,%xmm4,%xmm4
vaesenc %xmm1,%xmm5,%xmm5
leaq (%r8,%rbx,1),%rbx
cmovgeq %rsp,%r8
vaesenc %xmm1,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesenc %xmm1,%xmm7,%xmm7
subq %r8,%rbx
vaesenc %xmm1,%xmm8,%xmm8
vpxor 16(%r8),%xmm15,%xmm10
movq %rbx,64+0(%rsp)
vaesenc %xmm1,%xmm9,%xmm9
vmovups -72(%rsi),%xmm1
leaq 16(%r8,%rbx,1),%r8
vmovdqu %xmm10,0(%rbp)
vaesenc %xmm0,%xmm2,%xmm2
cmpl 32+4(%rsp),%ecx
movq 64+8(%rsp),%rbx
vaesenc %xmm0,%xmm3,%xmm3
prefetcht0 31(%r9)
vaesenc %xmm0,%xmm4,%xmm4
vaesenc %xmm0,%xmm5,%xmm5
leaq (%r9,%rbx,1),%rbx
cmovgeq %rsp,%r9
vaesenc %xmm0,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesenc %xmm0,%xmm7,%xmm7
subq %r9,%rbx
vaesenc %xmm0,%xmm8,%xmm8
vpxor 16(%r9),%xmm15,%xmm11
movq %rbx,64+8(%rsp)
vaesenc %xmm0,%xmm9,%xmm9
vmovups -56(%rsi),%xmm0
leaq 16(%r9,%rbx,1),%r9
vmovdqu %xmm11,16(%rbp)
vaesenc %xmm1,%xmm2,%xmm2
cmpl 32+8(%rsp),%ecx
movq 64+16(%rsp),%rbx
vaesenc %xmm1,%xmm3,%xmm3
prefetcht0 31(%r10)
vaesenc %xmm1,%xmm4,%xmm4
prefetcht0 15(%r8)
vaesenc %xmm1,%xmm5,%xmm5
leaq (%r10,%rbx,1),%rbx
cmovgeq %rsp,%r10
vaesenc %xmm1,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesenc %xmm1,%xmm7,%xmm7
subq %r10,%rbx
vaesenc %xmm1,%xmm8,%xmm8
vpxor 16(%r10),%xmm15,%xmm12
movq %rbx,64+16(%rsp)
vaesenc %xmm1,%xmm9,%xmm9
vmovups -40(%rsi),%xmm1
leaq 16(%r10,%rbx,1),%r10
vmovdqu %xmm12,32(%rbp)
vaesenc %xmm0,%xmm2,%xmm2
cmpl 32+12(%rsp),%ecx
movq 64+24(%rsp),%rbx
vaesenc %xmm0,%xmm3,%xmm3
prefetcht0 31(%r11)
vaesenc %xmm0,%xmm4,%xmm4
prefetcht0 15(%r9)
vaesenc %xmm0,%xmm5,%xmm5
leaq (%r11,%rbx,1),%rbx
cmovgeq %rsp,%r11
vaesenc %xmm0,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesenc %xmm0,%xmm7,%xmm7
subq %r11,%rbx
vaesenc %xmm0,%xmm8,%xmm8
vpxor 16(%r11),%xmm15,%xmm13
movq %rbx,64+24(%rsp)
vaesenc %xmm0,%xmm9,%xmm9
vmovups -24(%rsi),%xmm0
leaq 16(%r11,%rbx,1),%r11
vmovdqu %xmm13,48(%rbp)
vaesenc %xmm1,%xmm2,%xmm2
cmpl 32+16(%rsp),%ecx
movq 64+32(%rsp),%rbx
vaesenc %xmm1,%xmm3,%xmm3
prefetcht0 31(%r12)
vaesenc %xmm1,%xmm4,%xmm4
prefetcht0 15(%r10)
vaesenc %xmm1,%xmm5,%xmm5
leaq (%r12,%rbx,1),%rbx
cmovgeq %rsp,%r12
vaesenc %xmm1,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesenc %xmm1,%xmm7,%xmm7
subq %r12,%rbx
vaesenc %xmm1,%xmm8,%xmm8
vpxor 16(%r12),%xmm15,%xmm10
movq %rbx,64+32(%rsp)
vaesenc %xmm1,%xmm9,%xmm9
vmovups -8(%rsi),%xmm1
leaq 16(%r12,%rbx,1),%r12
vaesenc %xmm0,%xmm2,%xmm2
cmpl 32+20(%rsp),%ecx
movq 64+40(%rsp),%rbx
vaesenc %xmm0,%xmm3,%xmm3
prefetcht0 31(%r13)
vaesenc %xmm0,%xmm4,%xmm4
prefetcht0 15(%r11)
vaesenc %xmm0,%xmm5,%xmm5
leaq (%rbx,%r13,1),%rbx
cmovgeq %rsp,%r13
vaesenc %xmm0,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesenc %xmm0,%xmm7,%xmm7
subq %r13,%rbx
vaesenc %xmm0,%xmm8,%xmm8
vpxor 16(%r13),%xmm15,%xmm11
movq %rbx,64+40(%rsp)
vaesenc %xmm0,%xmm9,%xmm9
vmovups 8(%rsi),%xmm0
leaq 16(%r13,%rbx,1),%r13
vaesenc %xmm1,%xmm2,%xmm2
cmpl 32+24(%rsp),%ecx
movq 64+48(%rsp),%rbx
vaesenc %xmm1,%xmm3,%xmm3
prefetcht0 31(%r14)
vaesenc %xmm1,%xmm4,%xmm4
prefetcht0 15(%r12)
vaesenc %xmm1,%xmm5,%xmm5
leaq (%r14,%rbx,1),%rbx
cmovgeq %rsp,%r14
vaesenc %xmm1,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesenc %xmm1,%xmm7,%xmm7
subq %r14,%rbx
vaesenc %xmm1,%xmm8,%xmm8
vpxor 16(%r14),%xmm15,%xmm12
movq %rbx,64+48(%rsp)
vaesenc %xmm1,%xmm9,%xmm9
vmovups 24(%rsi),%xmm1
leaq 16(%r14,%rbx,1),%r14
vaesenc %xmm0,%xmm2,%xmm2
cmpl 32+28(%rsp),%ecx
movq 64+56(%rsp),%rbx
vaesenc %xmm0,%xmm3,%xmm3
prefetcht0 31(%r15)
vaesenc %xmm0,%xmm4,%xmm4
prefetcht0 15(%r13)
vaesenc %xmm0,%xmm5,%xmm5
leaq (%r15,%rbx,1),%rbx
cmovgeq %rsp,%r15
vaesenc %xmm0,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesenc %xmm0,%xmm7,%xmm7
subq %r15,%rbx
vaesenc %xmm0,%xmm8,%xmm8
vpxor 16(%r15),%xmm15,%xmm13
movq %rbx,64+56(%rsp)
vaesenc %xmm0,%xmm9,%xmm9
vmovups 40(%rsi),%xmm0
leaq 16(%r15,%rbx,1),%r15
vmovdqu 32(%rsp),%xmm14
prefetcht0 15(%r14)
prefetcht0 15(%r15)
# 10 rounds issued for AES-128; AES-192/256 need the extra pairs below
# (%eax = round count from the key schedule).
cmpl $11,%eax
jb .Lenc8x_tail
vaesenc %xmm1,%xmm2,%xmm2
vaesenc %xmm1,%xmm3,%xmm3
vaesenc %xmm1,%xmm4,%xmm4
vaesenc %xmm1,%xmm5,%xmm5
vaesenc %xmm1,%xmm6,%xmm6
vaesenc %xmm1,%xmm7,%xmm7
vaesenc %xmm1,%xmm8,%xmm8
vaesenc %xmm1,%xmm9,%xmm9
vmovups 176-120(%rsi),%xmm1
vaesenc %xmm0,%xmm2,%xmm2
vaesenc %xmm0,%xmm3,%xmm3
vaesenc %xmm0,%xmm4,%xmm4
vaesenc %xmm0,%xmm5,%xmm5
vaesenc %xmm0,%xmm6,%xmm6
vaesenc %xmm0,%xmm7,%xmm7
vaesenc %xmm0,%xmm8,%xmm8
vaesenc %xmm0,%xmm9,%xmm9
vmovups 192-120(%rsi),%xmm0
je .Lenc8x_tail
vaesenc %xmm1,%xmm2,%xmm2
vaesenc %xmm1,%xmm3,%xmm3
vaesenc %xmm1,%xmm4,%xmm4
vaesenc %xmm1,%xmm5,%xmm5
vaesenc %xmm1,%xmm6,%xmm6
vaesenc %xmm1,%xmm7,%xmm7
vaesenc %xmm1,%xmm8,%xmm8
vaesenc %xmm1,%xmm9,%xmm9
vmovups 208-120(%rsi),%xmm1
vaesenc %xmm0,%xmm2,%xmm2
vaesenc %xmm0,%xmm3,%xmm3
vaesenc %xmm0,%xmm4,%xmm4
vaesenc %xmm0,%xmm5,%xmm5
vaesenc %xmm0,%xmm6,%xmm6
vaesenc %xmm0,%xmm7,%xmm7
vaesenc %xmm0,%xmm8,%xmm8
vaesenc %xmm0,%xmm9,%xmm9
vmovups 224-120(%rsi),%xmm0
# Final rounds: vpcmpgtd/vpaddd refresh the per-stream progress vectors at
# 32(%rsp)/48(%rsp) used by the cmpl checks above, vaesenclast produces the
# 8 ciphertext blocks, which are stored and become the next CBC chaining
# values (XORed with the whitened inputs buffered at (%rbp)).
.Lenc8x_tail:
vaesenc %xmm1,%xmm2,%xmm2
vpxor %xmm15,%xmm15,%xmm15
vaesenc %xmm1,%xmm3,%xmm3
vaesenc %xmm1,%xmm4,%xmm4
vpcmpgtd %xmm15,%xmm14,%xmm15
vaesenc %xmm1,%xmm5,%xmm5
vaesenc %xmm1,%xmm6,%xmm6
vpaddd %xmm14,%xmm15,%xmm15
vmovdqu 48(%rsp),%xmm14
vaesenc %xmm1,%xmm7,%xmm7
movq 64(%rsp),%rbx
vaesenc %xmm1,%xmm8,%xmm8
vaesenc %xmm1,%xmm9,%xmm9
vmovups 16-120(%rsi),%xmm1
vaesenclast %xmm0,%xmm2,%xmm2
vmovdqa %xmm15,32(%rsp)
vpxor %xmm15,%xmm15,%xmm15
vaesenclast %xmm0,%xmm3,%xmm3
vaesenclast %xmm0,%xmm4,%xmm4
vpcmpgtd %xmm15,%xmm14,%xmm15
vaesenclast %xmm0,%xmm5,%xmm5
vaesenclast %xmm0,%xmm6,%xmm6
vpaddd %xmm15,%xmm14,%xmm14
vmovdqu -120(%rsi),%xmm15
vaesenclast %xmm0,%xmm7,%xmm7
vaesenclast %xmm0,%xmm8,%xmm8
vmovdqa %xmm14,48(%rsp)
vaesenclast %xmm0,%xmm9,%xmm9
vmovups 32-120(%rsi),%xmm0
# Store each stream's ciphertext block via its input pointer plus the saved
# out-in delta (the subq recovers the output address), then XOR in the
# buffered next-block inputs to continue the CBC chain.
vmovups %xmm2,-16(%r8)
subq %rbx,%r8
vpxor 0(%rbp),%xmm2,%xmm2
vmovups %xmm3,-16(%r9)
subq 72(%rsp),%r9
vpxor 16(%rbp),%xmm3,%xmm3
vmovups %xmm4,-16(%r10)
subq 80(%rsp),%r10
vpxor 32(%rbp),%xmm4,%xmm4
vmovups %xmm5,-16(%r11)
subq 88(%rsp),%r11
vpxor 48(%rbp),%xmm5,%xmm5
vmovups %xmm6,-16(%r12)
subq 96(%rsp),%r12
vpxor %xmm10,%xmm6,%xmm6
vmovups %xmm7,-16(%r13)
subq 104(%rsp),%r13
vpxor %xmm11,%xmm7,%xmm7
vmovups %xmm8,-16(%r14)
subq 112(%rsp),%r14
vpxor %xmm12,%xmm8,%xmm8
vmovups %xmm9,-16(%r15)
subq 120(%rsp),%r15
vpxor %xmm13,%xmm9,%xmm9
decl %edx
jnz .Loop_enc8x
movq 16(%rsp),%rax
.cfi_def_cfa %rax,8
# Epilogue: restore callee-saved registers from the saved original %rsp.
.Lenc8x_done:
vzeroupper
movq -48(%rax),%r15
.cfi_restore %r15
movq -40(%rax),%r14
.cfi_restore %r14
movq -32(%rax),%r13
.cfi_restore %r13
movq -24(%rax),%r12
.cfi_restore %r12
movq -16(%rax),%rbp
.cfi_restore %rbp
movq -8(%rax),%rbx
.cfi_restore %rbx
leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lenc8x_epilogue:
# rep ret (0xf3 0xc3)
.byte 0xf3,0xc3
.cfi_endproc
.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
.type aesni_multi_cbc_decrypt_avx,@function
.align 32
aesni_multi_cbc_decrypt_avx:
.cfi_startproc
_avx_cbc_dec_shortcut:
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
subq $256,%rsp
andq $-256,%rsp
subq $192,%rsp
movq %rax,16(%rsp)
.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
.Ldec8x_body:
vzeroupper
vmovdqu (%rsi),%xmm15
leaq 120(%rsi),%rsi
leaq 160(%rdi),%rdi
shrl $1,%edx
.Ldec8x_loop_grande:
xorl %edx,%edx
movl -144(%rdi),%ecx
movq -160(%rdi),%r8
cmpl %edx,%ecx
movq -152(%rdi),%rbx
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu -136(%rdi),%xmm2
movl %ecx,32(%rsp)
cmovleq %rsp,%r8
subq %r8,%rbx
movq %rbx,64(%rsp)
vmovdqu %xmm2,192(%rsp)
movl -104(%rdi),%ecx
movq -120(%rdi),%r9
cmpl %edx,%ecx
movq -112(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu -96(%rdi),%xmm3
movl %ecx,36(%rsp)
cmovleq %rsp,%r9
subq %r9,%rbp
movq %rbp,72(%rsp)
vmovdqu %xmm3,208(%rsp)
movl -64(%rdi),%ecx
movq -80(%rdi),%r10
cmpl %edx,%ecx
movq -72(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu -56(%rdi),%xmm4
movl %ecx,40(%rsp)
cmovleq %rsp,%r10
subq %r10,%rbp
movq %rbp,80(%rsp)
vmovdqu %xmm4,224(%rsp)
movl -24(%rdi),%ecx
movq -40(%rdi),%r11
cmpl %edx,%ecx
movq -32(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu -16(%rdi),%xmm5
movl %ecx,44(%rsp)
cmovleq %rsp,%r11
subq %r11,%rbp
movq %rbp,88(%rsp)
vmovdqu %xmm5,240(%rsp)
movl 16(%rdi),%ecx
movq 0(%rdi),%r12
cmpl %edx,%ecx
movq 8(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu 24(%rdi),%xmm6
movl %ecx,48(%rsp)
cmovleq %rsp,%r12
subq %r12,%rbp
movq %rbp,96(%rsp)
vmovdqu %xmm6,256(%rsp)
movl 56(%rdi),%ecx
movq 40(%rdi),%r13
cmpl %edx,%ecx
movq 48(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu 64(%rdi),%xmm7
movl %ecx,52(%rsp)
cmovleq %rsp,%r13
subq %r13,%rbp
movq %rbp,104(%rsp)
vmovdqu %xmm7,272(%rsp)
movl 96(%rdi),%ecx
movq 80(%rdi),%r14
cmpl %edx,%ecx
movq 88(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu 104(%rdi),%xmm8
movl %ecx,56(%rsp)
cmovleq %rsp,%r14
subq %r14,%rbp
movq %rbp,112(%rsp)
vmovdqu %xmm8,288(%rsp)
movl 136(%rdi),%ecx
movq 120(%rdi),%r15
cmpl %edx,%ecx
movq 128(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu 144(%rdi),%xmm9
movl %ecx,60(%rsp)
cmovleq %rsp,%r15
subq %r15,%rbp
movq %rbp,120(%rsp)
vmovdqu %xmm9,304(%rsp)
testl %edx,%edx
jz .Ldec8x_done
vmovups 16-120(%rsi),%xmm1
vmovups 32-120(%rsi),%xmm0
movl 240-120(%rsi),%eax
leaq 192+128(%rsp),%rbp
vmovdqu (%r8),%xmm2
vmovdqu (%r9),%xmm3
vmovdqu (%r10),%xmm4
vmovdqu (%r11),%xmm5
vmovdqu (%r12),%xmm6
vmovdqu (%r13),%xmm7
vmovdqu (%r14),%xmm8
vmovdqu (%r15),%xmm9
vmovdqu %xmm2,0(%rbp)
vpxor %xmm15,%xmm2,%xmm2
vmovdqu %xmm3,16(%rbp)
vpxor %xmm15,%xmm3,%xmm3
vmovdqu %xmm4,32(%rbp)
vpxor %xmm15,%xmm4,%xmm4
vmovdqu %xmm5,48(%rbp)
vpxor %xmm15,%xmm5,%xmm5
vmovdqu %xmm6,64(%rbp)
vpxor %xmm15,%xmm6,%xmm6
vmovdqu %xmm7,80(%rbp)
vpxor %xmm15,%xmm7,%xmm7
vmovdqu %xmm8,96(%rbp)
vpxor %xmm15,%xmm8,%xmm8
vmovdqu %xmm9,112(%rbp)
vpxor %xmm15,%xmm9,%xmm9
xorq $0x80,%rbp
movl $1,%ecx
jmp .Loop_dec8x
.align 32
.Loop_dec8x:
vaesdec %xmm1,%xmm2,%xmm2
cmpl 32+0(%rsp),%ecx
vaesdec %xmm1,%xmm3,%xmm3
prefetcht0 31(%r8)
vaesdec %xmm1,%xmm4,%xmm4
vaesdec %xmm1,%xmm5,%xmm5
leaq (%r8,%rbx,1),%rbx
cmovgeq %rsp,%r8
vaesdec %xmm1,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesdec %xmm1,%xmm7,%xmm7
subq %r8,%rbx
vaesdec %xmm1,%xmm8,%xmm8
vmovdqu 16(%r8),%xmm10
movq %rbx,64+0(%rsp)
vaesdec %xmm1,%xmm9,%xmm9
vmovups -72(%rsi),%xmm1
leaq 16(%r8,%rbx,1),%r8
vmovdqu %xmm10,128(%rsp)
vaesdec %xmm0,%xmm2,%xmm2
cmpl 32+4(%rsp),%ecx
movq 64+8(%rsp),%rbx
vaesdec %xmm0,%xmm3,%xmm3
prefetcht0 31(%r9)
vaesdec %xmm0,%xmm4,%xmm4
vaesdec %xmm0,%xmm5,%xmm5
leaq (%r9,%rbx,1),%rbx
cmovgeq %rsp,%r9
vaesdec %xmm0,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesdec %xmm0,%xmm7,%xmm7
subq %r9,%rbx
vaesdec %xmm0,%xmm8,%xmm8
vmovdqu 16(%r9),%xmm11
movq %rbx,64+8(%rsp)
vaesdec %xmm0,%xmm9,%xmm9
vmovups -56(%rsi),%xmm0
leaq 16(%r9,%rbx,1),%r9
vmovdqu %xmm11,144(%rsp)
vaesdec %xmm1,%xmm2,%xmm2
cmpl 32+8(%rsp),%ecx
movq 64+16(%rsp),%rbx
vaesdec %xmm1,%xmm3,%xmm3
prefetcht0 31(%r10)
vaesdec %xmm1,%xmm4,%xmm4
prefetcht0 15(%r8)
vaesdec %xmm1,%xmm5,%xmm5
leaq (%r10,%rbx,1),%rbx
cmovgeq %rsp,%r10
vaesdec %xmm1,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesdec %xmm1,%xmm7,%xmm7
subq %r10,%rbx
vaesdec %xmm1,%xmm8,%xmm8
vmovdqu 16(%r10),%xmm12
movq %rbx,64+16(%rsp)
vaesdec %xmm1,%xmm9,%xmm9
vmovups -40(%rsi),%xmm1
leaq 16(%r10,%rbx,1),%r10
vmovdqu %xmm12,160(%rsp)
vaesdec %xmm0,%xmm2,%xmm2
cmpl 32+12(%rsp),%ecx
movq 64+24(%rsp),%rbx
vaesdec %xmm0,%xmm3,%xmm3
prefetcht0 31(%r11)
vaesdec %xmm0,%xmm4,%xmm4
prefetcht0 15(%r9)
vaesdec %xmm0,%xmm5,%xmm5
leaq (%r11,%rbx,1),%rbx
cmovgeq %rsp,%r11
vaesdec %xmm0,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesdec %xmm0,%xmm7,%xmm7
subq %r11,%rbx
vaesdec %xmm0,%xmm8,%xmm8
vmovdqu 16(%r11),%xmm13
movq %rbx,64+24(%rsp)
vaesdec %xmm0,%xmm9,%xmm9
vmovups -24(%rsi),%xmm0
leaq 16(%r11,%rbx,1),%r11
vmovdqu %xmm13,176(%rsp)
vaesdec %xmm1,%xmm2,%xmm2
cmpl 32+16(%rsp),%ecx
movq 64+32(%rsp),%rbx
vaesdec %xmm1,%xmm3,%xmm3
prefetcht0 31(%r12)
vaesdec %xmm1,%xmm4,%xmm4
prefetcht0 15(%r10)
vaesdec %xmm1,%xmm5,%xmm5
leaq (%r12,%rbx,1),%rbx
cmovgeq %rsp,%r12
vaesdec %xmm1,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesdec %xmm1,%xmm7,%xmm7
subq %r12,%rbx
vaesdec %xmm1,%xmm8,%xmm8
vmovdqu 16(%r12),%xmm10
movq %rbx,64+32(%rsp)
vaesdec %xmm1,%xmm9,%xmm9
vmovups -8(%rsi),%xmm1
leaq 16(%r12,%rbx,1),%r12
vaesdec %xmm0,%xmm2,%xmm2
cmpl 32+20(%rsp),%ecx
movq 64+40(%rsp),%rbx
vaesdec %xmm0,%xmm3,%xmm3
prefetcht0 31(%r13)
vaesdec %xmm0,%xmm4,%xmm4
prefetcht0 15(%r11)
vaesdec %xmm0,%xmm5,%xmm5
leaq (%rbx,%r13,1),%rbx
cmovgeq %rsp,%r13
vaesdec %xmm0,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesdec %xmm0,%xmm7,%xmm7
subq %r13,%rbx
vaesdec %xmm0,%xmm8,%xmm8
vmovdqu 16(%r13),%xmm11
movq %rbx,64+40(%rsp)
vaesdec %xmm0,%xmm9,%xmm9
vmovups 8(%rsi),%xmm0
leaq 16(%r13,%rbx,1),%r13
vaesdec %xmm1,%xmm2,%xmm2
cmpl 32+24(%rsp),%ecx
movq 64+48(%rsp),%rbx
vaesdec %xmm1,%xmm3,%xmm3
prefetcht0 31(%r14)
vaesdec %xmm1,%xmm4,%xmm4
prefetcht0 15(%r12)
vaesdec %xmm1,%xmm5,%xmm5
leaq (%r14,%rbx,1),%rbx
cmovgeq %rsp,%r14
vaesdec %xmm1,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesdec %xmm1,%xmm7,%xmm7
subq %r14,%rbx
vaesdec %xmm1,%xmm8,%xmm8
vmovdqu 16(%r14),%xmm12
movq %rbx,64+48(%rsp)
vaesdec %xmm1,%xmm9,%xmm9
vmovups 24(%rsi),%xmm1
leaq 16(%r14,%rbx,1),%r14
vaesdec %xmm0,%xmm2,%xmm2
cmpl 32+28(%rsp),%ecx
movq 64+56(%rsp),%rbx
vaesdec %xmm0,%xmm3,%xmm3
prefetcht0 31(%r15)
vaesdec %xmm0,%xmm4,%xmm4
prefetcht0 15(%r13)
vaesdec %xmm0,%xmm5,%xmm5
leaq (%r15,%rbx,1),%rbx
cmovgeq %rsp,%r15
vaesdec %xmm0,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesdec %xmm0,%xmm7,%xmm7
subq %r15,%rbx
vaesdec %xmm0,%xmm8,%xmm8
vmovdqu 16(%r15),%xmm13
movq %rbx,64+56(%rsp)
vaesdec %xmm0,%xmm9,%xmm9
vmovups 40(%rsi),%xmm0
leaq 16(%r15,%rbx,1),%r15
vmovdqu 32(%rsp),%xmm14
prefetcht0 15(%r14)
prefetcht0 15(%r15)
cmpl $11,%eax
jb .Ldec8x_tail
vaesdec %xmm1,%xmm2,%xmm2
vaesdec %xmm1,%xmm3,%xmm3
vaesdec %xmm1,%xmm4,%xmm4
vaesdec %xmm1,%xmm5,%xmm5
vaesdec %xmm1,%xmm6,%xmm6
vaesdec %xmm1,%xmm7,%xmm7
vaesdec %xmm1,%xmm8,%xmm8
vaesdec %xmm1,%xmm9,%xmm9
vmovups 176-120(%rsi),%xmm1
vaesdec %xmm0,%xmm2,%xmm2
vaesdec %xmm0,%xmm3,%xmm3
vaesdec %xmm0,%xmm4,%xmm4
vaesdec %xmm0,%xmm5,%xmm5
vaesdec %xmm0,%xmm6,%xmm6
vaesdec %xmm0,%xmm7,%xmm7
vaesdec %xmm0,%xmm8,%xmm8
vaesdec %xmm0,%xmm9,%xmm9
vmovups 192-120(%rsi),%xmm0
je .Ldec8x_tail
vaesdec %xmm1,%xmm2,%xmm2
vaesdec %xmm1,%xmm3,%xmm3
vaesdec %xmm1,%xmm4,%xmm4
vaesdec %xmm1,%xmm5,%xmm5
vaesdec %xmm1,%xmm6,%xmm6
vaesdec %xmm1,%xmm7,%xmm7
vaesdec %xmm1,%xmm8,%xmm8
vaesdec %xmm1,%xmm9,%xmm9
vmovups 208-120(%rsi),%xmm1
vaesdec %xmm0,%xmm2,%xmm2
vaesdec %xmm0,%xmm3,%xmm3
vaesdec %xmm0,%xmm4,%xmm4
vaesdec %xmm0,%xmm5,%xmm5
vaesdec %xmm0,%xmm6,%xmm6
vaesdec %xmm0,%xmm7,%xmm7
vaesdec %xmm0,%xmm8,%xmm8
vaesdec %xmm0,%xmm9,%xmm9
vmovups 224-120(%rsi),%xmm0
.Ldec8x_tail:
vaesdec %xmm1,%xmm2,%xmm2
vpxor %xmm15,%xmm15,%xmm15
vaesdec %xmm1,%xmm3,%xmm3
vaesdec %xmm1,%xmm4,%xmm4
vpcmpgtd %xmm15,%xmm14,%xmm15
vaesdec %xmm1,%xmm5,%xmm5
vaesdec %xmm1,%xmm6,%xmm6
vpaddd %xmm14,%xmm15,%xmm15
vmovdqu 48(%rsp),%xmm14
vaesdec %xmm1,%xmm7,%xmm7
movq 64(%rsp),%rbx
vaesdec %xmm1,%xmm8,%xmm8
vaesdec %xmm1,%xmm9,%xmm9
vmovups 16-120(%rsi),%xmm1
vaesdeclast %xmm0,%xmm2,%xmm2
vmovdqa %xmm15,32(%rsp)
vpxor %xmm15,%xmm15,%xmm15
vaesdeclast %xmm0,%xmm3,%xmm3
vpxor 0(%rbp),%xmm2,%xmm2
vaesdeclast %xmm0,%xmm4,%xmm4
vpxor 16(%rbp),%xmm3,%xmm3
vpcmpgtd %xmm15,%xmm14,%xmm15
vaesdeclast %xmm0,%xmm5,%xmm5
vpxor 32(%rbp),%xmm4,%xmm4
vaesdeclast %xmm0,%xmm6,%xmm6
vpxor 48(%rbp),%xmm5,%xmm5
vpaddd %xmm15,%xmm14,%xmm14
vmovdqu -120(%rsi),%xmm15
vaesdeclast %xmm0,%xmm7,%xmm7
vpxor 64(%rbp),%xmm6,%xmm6
vaesdeclast %xmm0,%xmm8,%xmm8
vpxor 80(%rbp),%xmm7,%xmm7
vmovdqa %xmm14,48(%rsp)
vaesdeclast %xmm0,%xmm9,%xmm9
vpxor 96(%rbp),%xmm8,%xmm8
vmovups 32-120(%rsi),%xmm0
vmovups %xmm2,-16(%r8)
subq %rbx,%r8
vmovdqu 128+0(%rsp),%xmm2
vpxor 112(%rbp),%xmm9,%xmm9
vmovups %xmm3,-16(%r9)
subq 72(%rsp),%r9
vmovdqu %xmm2,0(%rbp)
vpxor %xmm15,%xmm2,%xmm2
vmovdqu 128+16(%rsp),%xmm3
vmovups %xmm4,-16(%r10)
subq 80(%rsp),%r10
vmovdqu %xmm3,16(%rbp)
vpxor %xmm15,%xmm3,%xmm3
vmovdqu 128+32(%rsp),%xmm4
vmovups %xmm5,-16(%r11)
subq 88(%rsp),%r11
vmovdqu %xmm4,32(%rbp)
vpxor %xmm15,%xmm4,%xmm4
vmovdqu 128+48(%rsp),%xmm5
vmovups %xmm6,-16(%r12)
subq 96(%rsp),%r12
vmovdqu %xmm5,48(%rbp)
vpxor %xmm15,%xmm5,%xmm5
vmovdqu %xmm10,64(%rbp)
vpxor %xmm10,%xmm15,%xmm6
vmovups %xmm7,-16(%r13)
subq 104(%rsp),%r13
vmovdqu %xmm11,80(%rbp)
vpxor %xmm11,%xmm15,%xmm7
vmovups %xmm8,-16(%r14)
subq 112(%rsp),%r14
vmovdqu %xmm12,96(%rbp)
vpxor %xmm12,%xmm15,%xmm8
vmovups %xmm9,-16(%r15)
subq 120(%rsp),%r15
vmovdqu %xmm13,112(%rbp)
vpxor %xmm13,%xmm15,%xmm9
xorq $128,%rbp
decl %edx
jnz .Loop_dec8x
movq 16(%rsp),%rax
.cfi_def_cfa %rax,8
.Ldec8x_done:
vzeroupper
movq -48(%rax),%r15
.cfi_restore %r15
movq -40(%rax),%r14
.cfi_restore %r14
movq -32(%rax),%r13
.cfi_restore %r13
movq -24(%rax),%r12
.cfi_restore %r12
movq -16(%rax),%rbp
.cfi_restore %rbp
movq -8(%rax),%rbx
.cfi_restore %rbx
leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Ldec8x_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1304,7 +1304,108 @@ gcm_ghash_clmul:
.align 32
#-----------------------------------------------------------------------
# gcm_init_avx — precompute the GHASH key-power table using AVX+PCLMULQDQ.
# C-equivalent (per the usual OpenSSL gcm128 convention — confirm against
# gcm128.c): void gcm_init_avx(u128 Htable[16], const u64 H[2]);
# ABI:   SysV AMD64
# In:    rdi = Htable (output), rsi = H (hash subkey, 2x64-bit)
# Out:   Htable filled with H^1..H^8 plus precomputed Karatsuba halves
#        (three 16-byte entries per iteration, 48-byte stride).
# Clobb: r10, xmm0-xmm6, flags.
# NOTE(review): the unconditional `jmp .L_init_clmul` below makes the AVX
# body unreachable as written — this looks like diff-merge residue where the
# pre-AVX stub and the new AVX implementation were flattened into one listing;
# confirm against the ghash-x86_64.pl generated output.
#-----------------------------------------------------------------------
gcm_init_avx:
.cfi_startproc
jmp .L_init_clmul
vzeroupper
# Load H and convert it into the bit-reflected form used by PCLMULQDQ GHASH:
# H is doubled (<<1) modulo the GCM polynomial; the conditional reduction is
# computed branchlessly via vpcmpgtd + vpand with .L0x1c2_polynomial.
vmovdqu (%rsi),%xmm2
vpshufd $78,%xmm2,%xmm2
vpshufd $255,%xmm2,%xmm4
vpsrlq $63,%xmm2,%xmm3
vpsllq $1,%xmm2,%xmm2
vpxor %xmm5,%xmm5,%xmm5
vpcmpgtd %xmm4,%xmm5,%xmm5
vpslldq $8,%xmm3,%xmm3
vpor %xmm3,%xmm2,%xmm2
vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
vpxor %xmm5,%xmm2,%xmm2
# xmm2 = H, xmm6 = (H.hi ^ H.lo) — the precomputed Karatsuba middle term.
vpunpckhqdq %xmm2,%xmm2,%xmm6
vmovdqa %xmm2,%xmm0
vpxor %xmm2,%xmm6,%xmm6
movq $4,%r10                            # 4 iterations, 2 powers each => H^1..H^8
jmp .Linit_start_avx
.align 32
.Linit_loop_avx:
# Store the combined Karatsuba halves of the previous pair of powers.
vpalignr $8,%xmm3,%xmm4,%xmm5
vmovdqu %xmm5,-16(%rdi)
# One GHASH multiply: xmm0 = xmm0 * H (Karatsuba: $0x11 hi, $0x00 lo, middle
# via the folded halves in xmm6), followed by reduction mod the GCM polynomial.
vpunpckhqdq %xmm0,%xmm0,%xmm3
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
vpxor %xmm0,%xmm1,%xmm4
vpxor %xmm4,%xmm3,%xmm3
vpslldq $8,%xmm3,%xmm4
vpsrldq $8,%xmm3,%xmm3
vpxor %xmm4,%xmm0,%xmm0
vpxor %xmm3,%xmm1,%xmm1
# Reduction: multiply by x^63+x^62+x^57 via shifts, then two folding steps.
vpsllq $57,%xmm0,%xmm3
vpsllq $62,%xmm0,%xmm4
vpxor %xmm3,%xmm4,%xmm4
vpsllq $63,%xmm0,%xmm3
vpxor %xmm3,%xmm4,%xmm4
vpslldq $8,%xmm4,%xmm3
vpsrldq $8,%xmm4,%xmm4
vpxor %xmm3,%xmm0,%xmm0
vpxor %xmm4,%xmm1,%xmm1
vpsrlq $1,%xmm0,%xmm4
vpxor %xmm0,%xmm1,%xmm1
vpxor %xmm4,%xmm0,%xmm0
vpsrlq $5,%xmm4,%xmm4
vpxor %xmm4,%xmm0,%xmm0
vpsrlq $1,%xmm0,%xmm0
vpxor %xmm1,%xmm0,%xmm0
.Linit_start_avx:
vmovdqa %xmm0,%xmm5                     # save H^(2k-1) before squaring-by-H
# Second multiply of the pair (identical Karatsuba + reduction sequence).
vpunpckhqdq %xmm0,%xmm0,%xmm3
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
vpxor %xmm0,%xmm1,%xmm4
vpxor %xmm4,%xmm3,%xmm3
vpslldq $8,%xmm3,%xmm4
vpsrldq $8,%xmm3,%xmm3
vpxor %xmm4,%xmm0,%xmm0
vpxor %xmm3,%xmm1,%xmm1
vpsllq $57,%xmm0,%xmm3
vpsllq $62,%xmm0,%xmm4
vpxor %xmm3,%xmm4,%xmm4
vpsllq $63,%xmm0,%xmm3
vpxor %xmm3,%xmm4,%xmm4
vpslldq $8,%xmm4,%xmm3
vpsrldq $8,%xmm4,%xmm4
vpxor %xmm3,%xmm0,%xmm0
vpxor %xmm4,%xmm1,%xmm1
vpsrlq $1,%xmm0,%xmm4
vpxor %xmm0,%xmm1,%xmm1
vpxor %xmm4,%xmm0,%xmm0
vpsrlq $5,%xmm4,%xmm4
vpxor %xmm4,%xmm0,%xmm0
vpsrlq $1,%xmm0,%xmm0
vpxor %xmm1,%xmm0,%xmm0
# Emit the pair (xmm5 = odd power, xmm0 = even power) and their folded
# high^low halves (xmm3/xmm4) for the next iteration's -16(%rdi) store.
vpshufd $78,%xmm5,%xmm3
vpshufd $78,%xmm0,%xmm4
vpxor %xmm5,%xmm3,%xmm3
vmovdqu %xmm5,0(%rdi)
vpxor %xmm0,%xmm4,%xmm4
vmovdqu %xmm0,16(%rdi)
leaq 48(%rdi),%rdi                      # 3 x 16-byte entries per pair
subq $1,%r10
jnz .Linit_loop_avx
vpalignr $8,%xmm4,%xmm3,%xmm5
vmovdqu %xmm5,-16(%rdi)                 # folded halves of the final pair
vzeroupper
.byte 0xf3,0xc3                         # rep; ret
.cfi_endproc
.size gcm_init_avx,.-gcm_init_avx
.globl gcm_gmult_avx
@ -1320,7 +1421,377 @@ gcm_gmult_avx:
.align 32
#-----------------------------------------------------------------------
# gcm_ghash_avx — GHASH over a byte stream using AVX + PCLMULQDQ, 8 blocks
# per iteration. C-equivalent (per the usual OpenSSL gcm128 convention —
# confirm against gcm128.c):
#   void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16],
#                      const u8 *inp, size_t len);
# ABI:   SysV AMD64
# In:    rdi = Xi (accumulator, updated in place), rsi = Htable (from
#        gcm_init_avx), rdx = input, rcx = length in bytes (multiple of 16
#        presumed — TODO confirm with caller contract).
# Clobb: r10, rcx, rdx, xmm0-xmm15, flags.
# NOTE(review): the unconditional `jmp .L_ghash_clmul` below makes the AVX
# body unreachable as written — apparent diff-merge residue (old stub + new
# AVX body flattened together); confirm against the generated file.
#-----------------------------------------------------------------------
gcm_ghash_avx:
.cfi_startproc
jmp .L_ghash_clmul
vzeroupper
vmovdqu (%rdi),%xmm10                   # xmm10 = Xi
leaq .L0x1c2_polynomial(%rip),%r10      # r10 -> reduction polynomial
leaq 64(%rsi),%rsi                      # bias Htable pointer (entries at k-64)
vmovdqu .Lbswap_mask(%rip),%xmm13       # byte-swap mask: wire order -> bit-reflected
vpshufb %xmm13,%xmm10,%xmm10
cmpq $0x80,%rcx
jb .Lshort_avx                          # < 128 bytes: one-block-at-a-time path
subq $0x80,%rcx
# ---- Prologue for the 8-blocks-per-iteration path: process blocks 7..0 of
# the first 128-byte chunk, accumulating partial products in xmm0/xmm1/xmm2
# (lo/hi/middle) against increasing powers of H from the table.
vmovdqu 112(%rdx),%xmm14
vmovdqu 0-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm14
vmovdqu 32-64(%rsi),%xmm7
vpunpckhqdq %xmm14,%xmm14,%xmm9
vmovdqu 96(%rdx),%xmm15
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm14,%xmm9,%xmm9
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 16-64(%rsi),%xmm6
vpunpckhqdq %xmm15,%xmm15,%xmm8
vmovdqu 80(%rdx),%xmm14
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm15,%xmm8,%xmm8
vpshufb %xmm13,%xmm14,%xmm14
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 48-64(%rsi),%xmm6
vpxor %xmm14,%xmm9,%xmm9
vmovdqu 64(%rdx),%xmm15
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 80-64(%rsi),%xmm7
vpshufb %xmm13,%xmm15,%xmm15
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm1,%xmm4,%xmm4
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 64-64(%rsi),%xmm6
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm15,%xmm8,%xmm8
vmovdqu 48(%rdx),%xmm14
vpxor %xmm3,%xmm0,%xmm0
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpxor %xmm4,%xmm1,%xmm1
vpshufb %xmm13,%xmm14,%xmm14
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 96-64(%rsi),%xmm6
vpxor %xmm5,%xmm2,%xmm2
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 128-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vmovdqu 32(%rdx),%xmm15
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm1,%xmm4,%xmm4
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 112-64(%rsi),%xmm6
vpxor %xmm2,%xmm5,%xmm5
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm15,%xmm8,%xmm8
vmovdqu 16(%rdx),%xmm14
vpxor %xmm3,%xmm0,%xmm0
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpxor %xmm4,%xmm1,%xmm1
vpshufb %xmm13,%xmm14,%xmm14
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 144-64(%rsi),%xmm6
vpxor %xmm5,%xmm2,%xmm2
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 176-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vmovdqu (%rdx),%xmm15
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm1,%xmm4,%xmm4
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 160-64(%rsi),%xmm6
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
leaq 128(%rdx),%rdx
cmpq $0x80,%rcx
jb .Ltail_avx                           # exactly one 128-byte chunk: finish up
vpxor %xmm10,%xmm15,%xmm15              # fold Xi into block 0
subq $0x80,%rcx
jmp .Loop8x_avx
.align 32
# ---- Main loop: multiply 8 blocks by H^8..H^1 while simultaneously folding
# and reducing the previous iteration's accumulated product (xmm10/xmm11/xmm12),
# interleaving the two reduction steps with the multiplies for throughput.
.Loop8x_avx:
vpunpckhqdq %xmm15,%xmm15,%xmm8
vmovdqu 112(%rdx),%xmm14
vpxor %xmm0,%xmm3,%xmm3
vpxor %xmm15,%xmm8,%xmm8
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
vpshufb %xmm13,%xmm14,%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
vmovdqu 0-64(%rsi),%xmm6
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
vmovdqu 32-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vmovdqu 96(%rdx),%xmm15
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm3,%xmm10,%xmm10
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vxorps %xmm4,%xmm11,%xmm11
vmovdqu 16-64(%rsi),%xmm6
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm5,%xmm12,%xmm12
vxorps %xmm15,%xmm8,%xmm8
vmovdqu 80(%rdx),%xmm14
vpxor %xmm10,%xmm12,%xmm12
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpxor %xmm11,%xmm12,%xmm12
vpslldq $8,%xmm12,%xmm9
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vpsrldq $8,%xmm12,%xmm12
vpxor %xmm9,%xmm10,%xmm10
vmovdqu 48-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm14
vxorps %xmm12,%xmm11,%xmm11
vpxor %xmm1,%xmm4,%xmm4
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 80-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vpxor %xmm2,%xmm5,%xmm5
vmovdqu 64(%rdx),%xmm15
vpalignr $8,%xmm10,%xmm10,%xmm12
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpshufb %xmm13,%xmm15,%xmm15
vpxor %xmm3,%xmm0,%xmm0
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 64-64(%rsi),%xmm6
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm4,%xmm1,%xmm1
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vxorps %xmm15,%xmm8,%xmm8
vpxor %xmm5,%xmm2,%xmm2
vmovdqu 48(%rdx),%xmm14
vpclmulqdq $0x10,(%r10),%xmm10,%xmm10   # reduction step 1 (x poly)
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpshufb %xmm13,%xmm14,%xmm14
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 96-64(%rsi),%xmm6
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 128-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vpxor %xmm2,%xmm5,%xmm5
vmovdqu 32(%rdx),%xmm15
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpshufb %xmm13,%xmm15,%xmm15
vpxor %xmm3,%xmm0,%xmm0
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 112-64(%rsi),%xmm6
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm4,%xmm1,%xmm1
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm15,%xmm8,%xmm8
vpxor %xmm5,%xmm2,%xmm2
vxorps %xmm12,%xmm10,%xmm10
vmovdqu 16(%rdx),%xmm14
vpalignr $8,%xmm10,%xmm10,%xmm12
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpshufb %xmm13,%xmm14,%xmm14
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 144-64(%rsi),%xmm6
vpclmulqdq $0x10,(%r10),%xmm10,%xmm10   # reduction step 2
vxorps %xmm11,%xmm12,%xmm12
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 176-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vpxor %xmm2,%xmm5,%xmm5
vmovdqu (%rdx),%xmm15
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 160-64(%rsi),%xmm6
vpxor %xmm12,%xmm15,%xmm15
vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
vpxor %xmm10,%xmm15,%xmm15              # fold reduced Xi into next block 0
leaq 128(%rdx),%rdx
subq $0x80,%rcx
jnc .Loop8x_avx
addq $0x80,%rcx
jmp .Ltail_no_xor_avx
.align 32
# ---- Short path: remaining data processed 16 bytes at a time, multiplying by
# successive table powers; runs of identical 14-instruction segments, one per
# remaining block, falling into .Ltail_avx when the count hits zero.
.Lshort_avx:
vmovdqu -16(%rdx,%rcx,1),%xmm14         # last block first (highest power of H)
leaq (%rdx,%rcx,1),%rdx
vmovdqu 0-64(%rsi),%xmm6
vmovdqu 32-64(%rsi),%xmm7
vpshufb %xmm13,%xmm14,%xmm15
vmovdqa %xmm0,%xmm3
vmovdqa %xmm1,%xmm4
vmovdqa %xmm2,%xmm5
subq $0x10,%rcx
jz .Ltail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -32(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 16-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vpsrldq $8,%xmm7,%xmm7
subq $0x10,%rcx
jz .Ltail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -48(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 48-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vmovdqu 80-64(%rsi),%xmm7
subq $0x10,%rcx
jz .Ltail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -64(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 64-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vpsrldq $8,%xmm7,%xmm7
subq $0x10,%rcx
jz .Ltail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -80(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 96-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vmovdqu 128-64(%rsi),%xmm7
subq $0x10,%rcx
jz .Ltail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -96(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 112-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vpsrldq $8,%xmm7,%xmm7
subq $0x10,%rcx
jz .Ltail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -112(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 144-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vmovq 184-64(%rsi),%xmm7
subq $0x10,%rcx
jmp .Ltail_avx
.align 32
# ---- Tail: final Karatsuba combine of the three accumulators and the full
# two-step reduction modulo the GCM polynomial (loaded from (%r10)).
.Ltail_avx:
vpxor %xmm10,%xmm15,%xmm15              # fold Xi into the last block
.Ltail_no_xor_avx:
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vmovdqu (%r10),%xmm12
vpxor %xmm0,%xmm3,%xmm10
vpxor %xmm1,%xmm4,%xmm11
vpxor %xmm2,%xmm5,%xmm5
vpxor %xmm10,%xmm5,%xmm5
vpxor %xmm11,%xmm5,%xmm5
vpslldq $8,%xmm5,%xmm9
vpsrldq $8,%xmm5,%xmm5
vpxor %xmm9,%xmm10,%xmm10
vpxor %xmm5,%xmm11,%xmm11
vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
vpalignr $8,%xmm10,%xmm10,%xmm10
vpxor %xmm9,%xmm10,%xmm10
vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
vpalignr $8,%xmm10,%xmm10,%xmm10
vpxor %xmm11,%xmm10,%xmm10
vpxor %xmm9,%xmm10,%xmm10
cmpq $0,%rcx
jne .Lshort_avx                         # more data left: loop back
vpshufb %xmm13,%xmm10,%xmm10
vmovdqu %xmm10,(%rdi)                   # write updated Xi back
vzeroupper
.byte 0xf3,0xc3                         # rep; ret
.cfi_endproc
.size gcm_ghash_avx,.-gcm_ghash_avx
.align 64

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -35,6 +35,10 @@ rsaz_512_sqr:
movq (%rsi),%rdx
movq 8(%rsi),%rax
movq %rcx,128(%rsp)
movl $0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl $0x80100,%r11d
je .Loop_sqrx
jmp .Loop_sqr
.align 32
@ -405,6 +409,282 @@ rsaz_512_sqr:
decl %r8d
jnz .Loop_sqr
jmp .Lsqr_tail
.align 32
.Loop_sqrx:
movl %r8d,128+8(%rsp)
.byte 102,72,15,110,199
mulxq %rax,%r8,%r9
movq %rax,%rbx
mulxq 16(%rsi),%rcx,%r10
xorq %rbp,%rbp
mulxq 24(%rsi),%rax,%r11
adcxq %rcx,%r9
.byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00
adcxq %rax,%r10
.byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00
adcxq %rcx,%r11
mulxq 48(%rsi),%rcx,%r14
adcxq %rax,%r12
adcxq %rcx,%r13
mulxq 56(%rsi),%rax,%r15
adcxq %rax,%r14
adcxq %rbp,%r15
mulxq %rdx,%rax,%rdi
movq %rbx,%rdx
xorq %rcx,%rcx
adoxq %r8,%r8
adcxq %rdi,%r8
adoxq %rbp,%rcx
adcxq %rbp,%rcx
movq %rax,(%rsp)
movq %r8,8(%rsp)
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00
adoxq %rax,%r10
adcxq %rbx,%r11
mulxq 24(%rsi),%rdi,%r8
adoxq %rdi,%r11
.byte 0x66
adcxq %r8,%r12
mulxq 32(%rsi),%rax,%rbx
adoxq %rax,%r12
adcxq %rbx,%r13
mulxq 40(%rsi),%rdi,%r8
adoxq %rdi,%r13
adcxq %r8,%r14
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
adoxq %rax,%r14
adcxq %rbx,%r15
.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00
adoxq %rdi,%r15
adcxq %rbp,%r8
mulxq %rdx,%rax,%rdi
adoxq %rbp,%r8
.byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00
xorq %rbx,%rbx
adoxq %r9,%r9
adcxq %rcx,%rax
adoxq %r10,%r10
adcxq %rax,%r9
adoxq %rbp,%rbx
adcxq %rdi,%r10
adcxq %rbp,%rbx
movq %r9,16(%rsp)
.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00
mulxq 24(%rsi),%rdi,%r9
adoxq %rdi,%r12
adcxq %r9,%r13
mulxq 32(%rsi),%rax,%rcx
adoxq %rax,%r13
adcxq %rcx,%r14
.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00
adoxq %rdi,%r14
adcxq %r9,%r15
.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00
adoxq %rax,%r15
adcxq %rcx,%r8
mulxq 56(%rsi),%rdi,%r9
adoxq %rdi,%r8
adcxq %rbp,%r9
mulxq %rdx,%rax,%rdi
adoxq %rbp,%r9
movq 24(%rsi),%rdx
xorq %rcx,%rcx
adoxq %r11,%r11
adcxq %rbx,%rax
adoxq %r12,%r12
adcxq %rax,%r11
adoxq %rbp,%rcx
adcxq %rdi,%r12
adcxq %rbp,%rcx
movq %r11,32(%rsp)
movq %r12,40(%rsp)
mulxq 32(%rsi),%rax,%rbx
adoxq %rax,%r14
adcxq %rbx,%r15
mulxq 40(%rsi),%rdi,%r10
adoxq %rdi,%r15
adcxq %r10,%r8
mulxq 48(%rsi),%rax,%rbx
adoxq %rax,%r8
adcxq %rbx,%r9
mulxq 56(%rsi),%rdi,%r10
adoxq %rdi,%r9
adcxq %rbp,%r10
mulxq %rdx,%rax,%rdi
adoxq %rbp,%r10
movq 32(%rsi),%rdx
xorq %rbx,%rbx
adoxq %r13,%r13
adcxq %rcx,%rax
adoxq %r14,%r14
adcxq %rax,%r13
adoxq %rbp,%rbx
adcxq %rdi,%r14
adcxq %rbp,%rbx
movq %r13,48(%rsp)
movq %r14,56(%rsp)
mulxq 40(%rsi),%rdi,%r11
adoxq %rdi,%r8
adcxq %r11,%r9
mulxq 48(%rsi),%rax,%rcx
adoxq %rax,%r9
adcxq %rcx,%r10
mulxq 56(%rsi),%rdi,%r11
adoxq %rdi,%r10
adcxq %rbp,%r11
mulxq %rdx,%rax,%rdi
movq 40(%rsi),%rdx
adoxq %rbp,%r11
xorq %rcx,%rcx
adoxq %r15,%r15
adcxq %rbx,%rax
adoxq %r8,%r8
adcxq %rax,%r15
adoxq %rbp,%rcx
adcxq %rdi,%r8
adcxq %rbp,%rcx
movq %r15,64(%rsp)
movq %r8,72(%rsp)
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
adoxq %rax,%r10
adcxq %rbx,%r11
.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00
adoxq %rdi,%r11
adcxq %rbp,%r12
mulxq %rdx,%rax,%rdi
adoxq %rbp,%r12
movq 48(%rsi),%rdx
xorq %rbx,%rbx
adoxq %r9,%r9
adcxq %rcx,%rax
adoxq %r10,%r10
adcxq %rax,%r9
adcxq %rdi,%r10
adoxq %rbp,%rbx
adcxq %rbp,%rbx
movq %r9,80(%rsp)
movq %r10,88(%rsp)
.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00
adoxq %rax,%r12
adoxq %rbp,%r13
mulxq %rdx,%rax,%rdi
xorq %rcx,%rcx
movq 56(%rsi),%rdx
adoxq %r11,%r11
adcxq %rbx,%rax
adoxq %r12,%r12
adcxq %rax,%r11
adoxq %rbp,%rcx
adcxq %rdi,%r12
adcxq %rbp,%rcx
.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00
.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00
mulxq %rdx,%rax,%rdx
xorq %rbx,%rbx
adoxq %r13,%r13
adcxq %rcx,%rax
adoxq %rbp,%rbx
adcxq %r13,%rax
adcxq %rdx,%rbx
.byte 102,72,15,126,199
.byte 102,72,15,126,205
movq 128(%rsp),%rdx
movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15
movq %rax,112(%rsp)
movq %rbx,120(%rsp)
call __rsaz_512_reducex
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
adcq 88(%rsp),%r11
adcq 96(%rsp),%r12
adcq 104(%rsp),%r13
adcq 112(%rsp),%r14
adcq 120(%rsp),%r15
sbbq %rcx,%rcx
call __rsaz_512_subtract
movq %r8,%rdx
movq %r9,%rax
movl 128+8(%rsp),%r8d
movq %rdi,%rsi
decl %r8d
jnz .Loop_sqrx
.Lsqr_tail:
leaq 128+24+48(%rsp),%rax
.cfi_def_cfa %rax,8
@ -456,6 +736,10 @@ rsaz_512_mul:
.byte 102,72,15,110,199
.byte 102,72,15,110,201
movq %r8,128(%rsp)
movl $0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl $0x80100,%r11d
je .Lmulx
movq (%rdx),%rbx
movq %rdx,%rbp
call __rsaz_512_mul
@ -473,6 +757,29 @@ rsaz_512_mul:
movq 56(%rsp),%r15
call __rsaz_512_reduce
jmp .Lmul_tail
.align 32
.Lmulx:
movq %rdx,%rbp
movq (%rdx),%rdx
call __rsaz_512_mulx
.byte 102,72,15,126,199
.byte 102,72,15,126,205
movq 128(%rsp),%rdx
movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15
call __rsaz_512_reducex
.Lmul_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@ -586,6 +893,10 @@ rsaz_512_mul_gather4:
por %xmm9,%xmm8
pshufd $0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
movl $0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl $0x80100,%r11d
je .Lmulx_gather
.byte 102,76,15,126,195
movq %r8,128(%rsp)
@ -766,6 +1077,142 @@ rsaz_512_mul_gather4:
movq 56(%rsp),%r15
call __rsaz_512_reduce
jmp .Lmul_gather_tail
.align 32
.Lmulx_gather:
.byte 102,76,15,126,194
movq %r8,128(%rsp)
movq %rdi,128+8(%rsp)
movq %rcx,128+16(%rsp)
mulxq (%rsi),%rbx,%r8
movq %rbx,(%rsp)
xorl %edi,%edi
mulxq 8(%rsi),%rax,%r9
mulxq 16(%rsi),%rbx,%r10
adcxq %rax,%r8
mulxq 24(%rsi),%rax,%r11
adcxq %rbx,%r9
mulxq 32(%rsi),%rbx,%r12
adcxq %rax,%r10
mulxq 40(%rsi),%rax,%r13
adcxq %rbx,%r11
mulxq 48(%rsi),%rbx,%r14
adcxq %rax,%r12
mulxq 56(%rsi),%rax,%r15
adcxq %rbx,%r13
adcxq %rax,%r14
.byte 0x67
movq %r8,%rbx
adcxq %rdi,%r15
movq $-7,%rcx
jmp .Loop_mulx_gather
.align 32
.Loop_mulx_gather:
movdqa 0(%rbp),%xmm8
movdqa 16(%rbp),%xmm9
movdqa 32(%rbp),%xmm10
movdqa 48(%rbp),%xmm11
pand %xmm0,%xmm8
movdqa 64(%rbp),%xmm12
pand %xmm1,%xmm9
movdqa 80(%rbp),%xmm13
pand %xmm2,%xmm10
movdqa 96(%rbp),%xmm14
pand %xmm3,%xmm11
movdqa 112(%rbp),%xmm15
leaq 128(%rbp),%rbp
pand %xmm4,%xmm12
pand %xmm5,%xmm13
pand %xmm6,%xmm14
pand %xmm7,%xmm15
por %xmm10,%xmm8
por %xmm11,%xmm9
por %xmm12,%xmm8
por %xmm13,%xmm9
por %xmm14,%xmm8
por %xmm15,%xmm9
por %xmm9,%xmm8
pshufd $0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
.byte 102,76,15,126,194
.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00
adcxq %rax,%rbx
adoxq %r9,%r8
mulxq 8(%rsi),%rax,%r9
adcxq %rax,%r8
adoxq %r10,%r9
mulxq 16(%rsi),%rax,%r10
adcxq %rax,%r9
adoxq %r11,%r10
.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00
adcxq %rax,%r10
adoxq %r12,%r11
mulxq 32(%rsi),%rax,%r12
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 40(%rsi),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
adcxq %rax,%r13
.byte 0x67
adoxq %r15,%r14
mulxq 56(%rsi),%rax,%r15
movq %rbx,64(%rsp,%rcx,8)
adcxq %rax,%r14
adoxq %rdi,%r15
movq %r8,%rbx
adcxq %rdi,%r15
incq %rcx
jnz .Loop_mulx_gather
movq %r8,64(%rsp)
movq %r9,64+8(%rsp)
movq %r10,64+16(%rsp)
movq %r11,64+24(%rsp)
movq %r12,64+32(%rsp)
movq %r13,64+40(%rsp)
movq %r14,64+48(%rsp)
movq %r15,64+56(%rsp)
movq 128(%rsp),%rdx
movq 128+8(%rsp),%rdi
movq 128+16(%rsp),%rbp
movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15
call __rsaz_512_reducex
.Lmul_gather_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@ -833,6 +1280,10 @@ rsaz_512_mul_scatter4:
movq %rcx,128(%rsp)
movq %rdi,%rbp
movl $0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl $0x80100,%r11d
je .Lmulx_scatter
movq (%rdi),%rbx
call __rsaz_512_mul
@ -849,6 +1300,29 @@ rsaz_512_mul_scatter4:
movq 56(%rsp),%r15
call __rsaz_512_reduce
jmp .Lmul_scatter_tail
.align 32
.Lmulx_scatter:
movq (%rdi),%rdx
call __rsaz_512_mulx
.byte 102,72,15,126,199
.byte 102,72,15,126,205
movq 128(%rsp),%rdx
movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15
call __rsaz_512_reducex
.Lmul_scatter_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@ -918,6 +1392,7 @@ rsaz_512_mul_by_one:
subq $128+24,%rsp
.cfi_adjust_cfa_offset 128+24
.Lmul_by_one_body:
movl OPENSSL_ia32cap_P+8(%rip),%eax
movq %rdx,%rbp
movq %rcx,128(%rsp)
@ -938,7 +1413,16 @@ rsaz_512_mul_by_one:
movdqa %xmm0,64(%rsp)
movdqa %xmm0,80(%rsp)
movdqa %xmm0,96(%rsp)
andl $0x80100,%eax
cmpl $0x80100,%eax
je .Lby_one_callx
call __rsaz_512_reduce
jmp .Lby_one_tail
.align 32
.Lby_one_callx:
movq 128(%rsp),%rdx
call __rsaz_512_reducex
.Lby_one_tail:
movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
@ -1053,6 +1537,64 @@ __rsaz_512_reduce:
.byte 0xf3,0xc3
.cfi_endproc
.size __rsaz_512_reduce,.-__rsaz_512_reduce
.type __rsaz_512_reducex,@function
.align 32
#-----------------------------------------------------------------------
# __rsaz_512_reducex — 512-bit Montgomery reduction using BMI2 MULX with
# the dual carry chains ADCX/ADOX (CF and OF independently).
# Internal helper, custom register contract (not a C ABI function):
# In:    r8-r15 = 8-limb value to reduce, rbp -> modulus,
#        rdx = n0 (Montgomery constant; multiplied by r8 on entry),
#        128+8(%rsp) = caller-saved copy of n0 for the per-round refresh.
# Out:   r8-r15 = reduced value.
# Clobb: rax, rbx, rcx, rdx, rsi, flags.
#-----------------------------------------------------------------------
__rsaz_512_reducex:
.cfi_startproc
imulq %r8,%rdx                          # rdx = m = lowest limb * n0
xorq %rsi,%rsi                          # rsi = 0: carry absorber, breaks CF/OF
movl $8,%ecx                            # 8 reduction rounds, one per limb
jmp .Lreduction_loopx
.align 32
.Lreduction_loopx:
movq %r8,%rbx
mulxq 0(%rbp),%rax,%r8
adcxq %rbx,%rax                         # low product cancels the bottom limb
adoxq %r9,%r8
mulxq 8(%rbp),%rax,%r9
adcxq %rax,%r8
adoxq %r10,%r9
mulxq 16(%rbp),%rbx,%r10
adcxq %rbx,%r9
adoxq %r11,%r10
mulxq 24(%rbp),%rbx,%r11
adcxq %rbx,%r10
adoxq %r12,%r11
.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00      # mulxq 32(%rbp),%rbx,%r12 (hand-encoded)
movq %rdx,%rax                          # stash current m; rdx needed for next m
movq %r8,%rdx
adcxq %rbx,%r11
adoxq %r13,%r12
mulxq 128+8(%rsp),%rbx,%rdx             # rbx = next round's m = new low limb * n0
movq %rax,%rdx                          # restore current m
mulxq 40(%rbp),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00      # mulxq 48(%rbp),%rax,%r14 (hand-encoded)
adcxq %rax,%r13
adoxq %r15,%r14
mulxq 56(%rbp),%rax,%r15
movq %rbx,%rdx                          # rdx = m for the next round
adcxq %rax,%r14
adoxq %rsi,%r15                         # flush OF chain into top limb
adcxq %rsi,%r15                         # flush CF chain into top limb
decl %ecx
jne .Lreduction_loopx
.byte 0xf3,0xc3                         # rep; ret
.cfi_endproc
.size __rsaz_512_reducex,.-__rsaz_512_reducex
.type __rsaz_512_subtract,@function
.align 32
__rsaz_512_subtract:
@ -1256,6 +1798,128 @@ __rsaz_512_mul:
.byte 0xf3,0xc3
.cfi_endproc
.size __rsaz_512_mul,.-__rsaz_512_mul
.type __rsaz_512_mulx,@function
.align 32
#-----------------------------------------------------------------------
# __rsaz_512_mulx — 512x512-bit schoolbook multiply (8x8 limbs) using BMI2
# MULX with ADCX/ADOX dual carry chains.
# Internal helper, custom register contract (not a C ABI function):
# In:    rsi -> multiplicand a[8], rbp -> multiplier b[8],
#        rdx = b[0] preloaded by the caller.
# Out:   the 16-limb product stored at 8(%rsp)..8+120(%rsp).
# Clobb: rax, rbx, rcx, rdi, rdx, r8-r15, flags.
#-----------------------------------------------------------------------
__rsaz_512_mulx:
.cfi_startproc
# Round 0: a[0..7] * b[0] — plain ADC chain, no accumulator to fold in yet.
mulxq (%rsi),%rbx,%r8
movq $-6,%rcx                           # loop counter: rounds for b[2]..b[7]
mulxq 8(%rsi),%rax,%r9
movq %rbx,8(%rsp)                       # product limb 0 -> stack
mulxq 16(%rsi),%rbx,%r10
adcq %rax,%r8
mulxq 24(%rsi),%rax,%r11
adcq %rbx,%r9
mulxq 32(%rsi),%rbx,%r12
adcq %rax,%r10
mulxq 40(%rsi),%rax,%r13
adcq %rbx,%r11
mulxq 48(%rsi),%rbx,%r14
adcq %rax,%r12
mulxq 56(%rsi),%rax,%r15
movq 8(%rbp),%rdx                       # next multiplier limb b[1]
adcq %rbx,%r13
adcq %rax,%r14
adcq $0,%r15
xorq %rdi,%rdi                          # rdi = 0: carry absorber, clears CF/OF
jmp .Loop_mulx
.align 32
# Rounds 1..6: multiply-accumulate a[0..7]*b[i] into r8-r15 using the two
# independent carry chains (ADCX on CF for the new products, ADOX on OF for
# the shifted accumulator), spilling the completed low limb each round.
.Loop_mulx:
movq %r8,%rbx
mulxq (%rsi),%rax,%r8
adcxq %rax,%rbx
adoxq %r9,%r8
mulxq 8(%rsi),%rax,%r9
adcxq %rax,%r8
adoxq %r10,%r9
mulxq 16(%rsi),%rax,%r10
adcxq %rax,%r9
adoxq %r11,%r10
mulxq 24(%rsi),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulxq 32(%rsi),%rax,%r12 (hand-encoded, DS prefix for alignment)
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 40(%rsi),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
mulxq 48(%rsi),%rax,%r14
adcxq %rax,%r13
adoxq %r15,%r14
mulxq 56(%rsi),%rax,%r15
movq 64(%rbp,%rcx,8),%rdx               # load next multiplier limb b[i+1]
movq %rbx,8+64-8(%rsp,%rcx,8)           # spill completed low product limb
adcxq %rax,%r14
adoxq %rdi,%r15
adcxq %rdi,%r15                         # flush both carry chains into the top
incq %rcx
jnz .Loop_mulx
# Final round: a[0..7] * b[7]; same pattern, then store the upper 8 limbs.
movq %r8,%rbx
mulxq (%rsi),%rax,%r8
adcxq %rax,%rbx
adoxq %r9,%r8
.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00      # mulxq 8(%rsi),%rax,%r9 (hand-encoded)
adcxq %rax,%r8
adoxq %r10,%r9
.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00      # mulxq 16(%rsi),%rax,%r10 (hand-encoded)
adcxq %rax,%r9
adoxq %r11,%r10
mulxq 24(%rsi),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
mulxq 32(%rsi),%rax,%r12
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 40(%rsi),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00      # mulxq 48(%rsi),%rax,%r14 (hand-encoded)
adcxq %rax,%r13
adoxq %r15,%r14
.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00      # mulxq 56(%rsi),%rax,%r15 (hand-encoded)
adcxq %rax,%r14
adoxq %rdi,%r15
adcxq %rdi,%r15
# Store product limbs 7..15.
movq %rbx,8+64-8(%rsp)
movq %r8,8+64(%rsp)
movq %r9,8+64+8(%rsp)
movq %r10,8+64+16(%rsp)
movq %r11,8+64+24(%rsp)
movq %r12,8+64+32(%rsp)
movq %r13,8+64+40(%rsp)
movq %r14,8+64+48(%rsp)
movq %r15,8+64+56(%rsp)
.byte 0xf3,0xc3                         # rep; ret
.cfi_endproc
.size __rsaz_512_mulx,.-__rsaz_512_mulx
.globl rsaz_512_scatter4
.type rsaz_512_scatter4,@function
.align 16

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -397,32 +397,408 @@ x25519_fe51_mul121666:
.Lfe51_mul121666_epilogue:
.cfi_endproc
.size x25519_fe51_mul121666,.-x25519_fe51_mul121666
# int x25519_fe64_eligible(void)
# ABI: SysV AMD64. No arguments; no callee-saved registers touched.
# Returns non-zero iff the CPU supports the fast 64-bit-radix (fe64) X25519
# path. Tests two feature bits (mask 0x80100) in the dword at
# OPENSSL_ia32cap_P+8 -- presumably the cached CPUID leaf-7 word where
# 0x100 = BMI2 (MULX) and 0x80000 = ADX (ADCX/ADOX); TODO confirm against
# the OpenSSL cpuid setup code.
.globl x25519_fe64_eligible
.type x25519_fe64_eligible,@function
.align 32
x25519_fe64_eligible:
.cfi_startproc
movl OPENSSL_ia32cap_P+8(%rip),%ecx # load cached capability word
xorl %eax,%eax # default return: 0 (not eligible)
andl $0x80100,%ecx # isolate the two required feature bits
cmpl $0x80100,%ecx # both present?
cmovel %ecx,%eax # yes -> return the non-zero mask itself
.byte 0xf3,0xc3 # rep ret
.cfi_endproc
.size x25519_fe64_eligible,.-x25519_fe64_eligible
# void x25519_fe64_mul(uint64_t out[4], const uint64_t a[4], const uint64_t b[4])
# ABI: SysV AMD64 (rdi = out, rsi = a, rdx = b).
# 4x4-limb schoolbook multiplication using MULX with dual ADCX/ADOX carry
# chains (requires BMI2+ADX), producing an 8-limb product in r8..r15, then
# jumping to the shared .Lreduce64 tail (defined under x25519_fe64_sqr
# below) which folds the high half back in via *38 (mod 2^255-19) and
# restores registers.
#
# NOTE(review): the lines from the extra .globl declarations down to the
# ud2/ret pair look like a diff-merge of the "no-ADX" stub variant of this
# file (where all fe64 entry points alias one abort stub) with the real ADX
# implementation; with the ud2 stub in place the code after it is
# unreachable from this entry. Verify against the intended build variant.
.globl x25519_fe64_mul
.type x25519_fe64_mul,@function
.globl x25519_fe64_sqr
.globl x25519_fe64_mul121666
.globl x25519_fe64_add
.globl x25519_fe64_sub
.globl x25519_fe64_tobytes
.align 32
x25519_fe64_mul:
x25519_fe64_sqr:
x25519_fe64_mul121666:
x25519_fe64_add:
x25519_fe64_sub:
x25519_fe64_tobytes:
.cfi_startproc
.byte 0x0f,0x0b # ud2 -- no-ADX stub: abort if reached
.byte 0xf3,0xc3 # rep ret (unreachable after ud2)
# ---- real ADX implementation follows ----
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
pushq %rdi # save out pointer; .Lreduce64 reloads it from 16(%rsp)
.cfi_adjust_cfa_offset 8
.cfi_offset %rdi,-64
leaq -16(%rsp),%rsp # 16 bytes of scratch (one spilled limb at (%rsp))
.cfi_adjust_cfa_offset 16
.Lfe64_mul_body:
# Load b's limbs: rbp=b[0], rcx=b[1], r14=b[2], r15=b[3]; rdx iterates a[i].
movq %rdx,%rax
movq 0(%rdx),%rbp
movq 0(%rsi),%rdx # rdx = a[0]
movq 8(%rax),%rcx
movq 16(%rax),%r14
movq 24(%rax),%r15
# Row 0: a[0] * b -> r8..r12 (adcx chain only; rdi=0 is the carry sink).
mulxq %rbp,%r8,%rax
xorl %edi,%edi # rdi = 0; also clears CF/OF for adcx/adox
mulxq %rcx,%r9,%rbx
adcxq %rax,%r9
mulxq %r14,%r10,%rax
adcxq %rbx,%r10
mulxq %r15,%r11,%r12
movq 8(%rsi),%rdx # rdx = a[1]
adcxq %rax,%r11
movq %r14,(%rsp) # spill b[2]; r14 becomes an accumulator below
adcxq %rdi,%r12
# Row 1: accumulate a[1]*b into r9..r13 with interleaved adox/adcx chains.
mulxq %rbp,%rax,%rbx
adoxq %rax,%r9
adcxq %rbx,%r10
mulxq %rcx,%rax,%rbx
adoxq %rax,%r10
adcxq %rbx,%r11
mulxq %r14,%rax,%rbx
adoxq %rax,%r11
adcxq %rbx,%r12
mulxq %r15,%rax,%r13
movq 16(%rsi),%rdx # rdx = a[2]
adoxq %rax,%r12
adcxq %rdi,%r13
adoxq %rdi,%r13
# Row 2: accumulate a[2]*b into r10..r14.
mulxq %rbp,%rax,%rbx
adcxq %rax,%r10
adoxq %rbx,%r11
mulxq %rcx,%rax,%rbx
adcxq %rax,%r11
adoxq %rbx,%r12
mulxq %r14,%rax,%rbx
adcxq %rax,%r12
adoxq %rbx,%r13
mulxq %r15,%rax,%r14
movq 24(%rsi),%rdx # rdx = a[3]
adcxq %rax,%r13
adoxq %rdi,%r14
adcxq %rdi,%r14
# Row 3: accumulate a[3]*b into r11..r15 (b[2] comes from the spill slot).
mulxq %rbp,%rax,%rbx
adoxq %rax,%r11
adcxq %rbx,%r12
mulxq %rcx,%rax,%rbx
adoxq %rax,%r12
adcxq %rbx,%r13
mulxq (%rsp),%rax,%rbx # * b[2] (spilled earlier)
adoxq %rax,%r13
adcxq %rbx,%r14
mulxq %r15,%rax,%r15
movl $38,%edx # 2^256 mod (2^255-19) = 38, for .Lreduce64
adoxq %rax,%r14
adcxq %rdi,%r15
adoxq %rdi,%r15
jmp .Lreduce64 # shared reduce/store/epilogue tail
.Lfe64_mul_epilogue:
.cfi_endproc
.size x25519_fe64_mul,.-x25519_fe64_mul
# void x25519_fe64_sqr(uint64_t out[4], const uint64_t a[4])
# ABI: SysV AMD64 (rdi = out, rsi = a).
# 4-limb squaring: computes the off-diagonal products a[i]*a[j] (i<j),
# doubles them, adds the diagonal squares a[i]^2, then falls through to
# .Lreduce64, which reduces the 8-limb result mod 2^255-19 (fold high half
# via *38) and stores out[0..3]. .Lreduce64 is also the tail of
# x25519_fe64_mul above (it enters with edx=38, product in r8..r15, and the
# out pointer saved at 16(%rsp)).
.globl x25519_fe64_sqr
.type x25519_fe64_sqr,@function
.align 32
x25519_fe64_sqr:
.cfi_startproc
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
pushq %rdi # save out pointer; reloaded at 16(%rsp) in .Lreduce64
.cfi_adjust_cfa_offset 8
.cfi_offset %rdi,-64
leaq -16(%rsp),%rsp # scratch slot(s), matches fe64_mul's frame layout
.cfi_adjust_cfa_offset 16
.Lfe64_sqr_body:
movq 0(%rsi),%rdx # rdx = a[0]
movq 8(%rsi),%rcx # rcx = a[1]
movq 16(%rsi),%rbp # rbp = a[2]
movq 24(%rsi),%rsi # rsi = a[3] (input pointer no longer needed)
# Off-diagonal products, accumulated with adcx/adox (rdi = 0 carry sink).
mulxq %rdx,%r8,%r15 # a[0]^2 -> r8 (low), r15 (high, merged later)
mulxq %rcx,%r9,%rax # a[0]*a[1]
xorl %edi,%edi # rdi = 0; clears CF/OF
mulxq %rbp,%r10,%rbx # a[0]*a[2]
adcxq %rax,%r10
mulxq %rsi,%r11,%r12 # a[0]*a[3]
movq %rcx,%rdx # rdx = a[1]
adcxq %rbx,%r11
adcxq %rdi,%r12
mulxq %rbp,%rax,%rbx # a[1]*a[2]
adoxq %rax,%r11
adcxq %rbx,%r12
mulxq %rsi,%rax,%r13 # a[1]*a[3]
movq %rbp,%rdx # rdx = a[2]
adoxq %rax,%r12
adcxq %rdi,%r13
mulxq %rsi,%rax,%r14 # a[2]*a[3]
movq %rcx,%rdx # rdx = a[1] (for its square below)
adoxq %rax,%r13
adcxq %rdi,%r14
adoxq %rdi,%r14
# Double the cross terms and interleave the diagonal squares.
adcxq %r9,%r9
adoxq %r15,%r9 # += high half of a[0]^2
adcxq %r10,%r10
mulxq %rdx,%rax,%rbx # a[1]^2
movq %rbp,%rdx # rdx = a[2]
adcxq %r11,%r11
adoxq %rax,%r10
adcxq %r12,%r12
adoxq %rbx,%r11
mulxq %rdx,%rax,%rbx # a[2]^2
movq %rsi,%rdx # rdx = a[3]
adcxq %r13,%r13
adoxq %rax,%r12
adcxq %r14,%r14
adoxq %rbx,%r13
mulxq %rdx,%rax,%r15 # a[3]^2
movl $38,%edx # reduction multiplier for .Lreduce64
adoxq %rax,%r14
adcxq %rdi,%r15
adoxq %rdi,%r15
jmp .Lreduce64
.align 32
# .Lreduce64: fold the high limbs r12..r15 into r8..r11 via *38
# (since 2^256 == 38 mod 2^255-19), do a final carry fold, store the
# result, and restore callee-saved registers. Entered with edx = 38,
# rdi = 0, out pointer at 16(%rsp).
.Lreduce64:
mulxq %r12,%rax,%rbx
adcxq %rax,%r8
adoxq %rbx,%r9
mulxq %r13,%rax,%rbx
adcxq %rax,%r9
adoxq %rbx,%r10
mulxq %r14,%rax,%rbx
adcxq %rax,%r10
adoxq %rbx,%r11
mulxq %r15,%rax,%r12
adcxq %rax,%r11
adoxq %rdi,%r12
adcxq %rdi,%r12
movq 16(%rsp),%rdi # reload out pointer (pushed in the prologue)
imulq %rdx,%r12 # top carry * 38
addq %r12,%r8
adcq $0,%r9
adcq $0,%r10
adcq $0,%r11
sbbq %rax,%rax # rax = -CF (all-ones iff a final carry occurred)
andq $38,%rax # fold the last possible carry back in
addq %rax,%r8
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r8,0(%rdi)
# Epilogue: restore callee-saved registers from the 72-byte frame
# (16 scratch + 7 pushes; %rdi's slot is simply discarded).
movq 24(%rsp),%r15
.cfi_restore %r15
movq 32(%rsp),%r14
.cfi_restore %r14
movq 40(%rsp),%r13
.cfi_restore %r13
movq 48(%rsp),%r12
.cfi_restore %r12
movq 56(%rsp),%rbx
.cfi_restore %rbx
movq 64(%rsp),%rbp
.cfi_restore %rbp
leaq 72(%rsp),%rsp # pop the whole frame
.cfi_adjust_cfa_offset 88 # NOTE(review): stack shrinks by 72 here; confirm this CFA adjustment (sign/size) against the perlasm source
.Lfe64_sqr_epilogue:
.byte 0xf3,0xc3 # rep ret
.cfi_endproc
.size x25519_fe64_sqr,.-x25519_fe64_sqr
# void x25519_fe64_mul121666(uint64_t out[4], const uint64_t a[4])
# ABI: SysV AMD64 (rdi = out, rsi = a). Leaf function, no stack frame.
# out = a * 121666 mod 2^255-19 (121666 is the (A+2)/4 constant of the
# curve25519 Montgomery ladder). The 5th limb produced by the multiply is
# folded back in with *38, followed by one more conditional +38 fold.
.globl x25519_fe64_mul121666
.type x25519_fe64_mul121666,@function
.align 32
x25519_fe64_mul121666:
.Lfe64_mul121666_body:
.cfi_startproc
movl $121666,%edx # multiplier for all four mulx below
mulxq 0(%rsi),%r8,%rcx
mulxq 8(%rsi),%r9,%rax
addq %rcx,%r9
mulxq 16(%rsi),%r10,%rcx
adcq %rax,%r10
mulxq 24(%rsi),%r11,%rax
adcq %rcx,%r11
adcq $0,%rax # rax = overflow limb (bits >= 2^256)
imulq $38,%rax,%rax # fold: 2^256 == 38 mod 2^255-19
addq %rax,%r8
adcq $0,%r9
adcq $0,%r10
adcq $0,%r11
sbbq %rax,%rax # all-ones iff the fold itself carried out
andq $38,%rax
addq %rax,%r8 # final carry fold (cannot carry again)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r8,0(%rdi)
.Lfe64_mul121666_epilogue:
.byte 0xf3,0xc3 # rep ret
.cfi_endproc
.size x25519_fe64_mul121666,.-x25519_fe64_mul121666
# void x25519_fe64_add(uint64_t out[4], const uint64_t a[4], const uint64_t b[4])
# ABI: SysV AMD64 (rdi = out, rsi = a, rdx = b). Leaf function.
# out = a + b with the 256-bit carry-out folded back in as +38
# (2^256 == 38 mod 2^255-19); a second conditional fold absorbs the
# carry that first fold may itself produce.
.globl x25519_fe64_add
.type x25519_fe64_add,@function
.align 32
x25519_fe64_add:
.Lfe64_add_body:
.cfi_startproc
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
addq 0(%rdx),%r8
adcq 8(%rdx),%r9
adcq 16(%rdx),%r10
adcq 24(%rdx),%r11
sbbq %rax,%rax # rax = all-ones iff the 256-bit add carried out
andq $38,%rax
addq %rax,%r8 # first fold: +38 on carry-out
adcq $0,%r9
adcq $0,%r10
movq %r9,8(%rdi)
adcq $0,%r11
movq %r10,16(%rdi)
sbbq %rax,%rax # second (rare) carry from the fold itself
movq %r11,24(%rdi)
andq $38,%rax
addq %rax,%r8 # final fold; cannot carry again
movq %r8,0(%rdi)
.Lfe64_add_epilogue:
.byte 0xf3,0xc3 # rep ret
.cfi_endproc
.size x25519_fe64_add,.-x25519_fe64_add
# void x25519_fe64_sub(uint64_t out[4], const uint64_t a[4], const uint64_t b[4])
# ABI: SysV AMD64 (rdi = out, rsi = a, rdx = b). Leaf function.
# out = a - b mod 2^255-19: a 256-bit borrow-out is folded back by
# subtracting 38 (the mirror of the +38 fold in x25519_fe64_add), with a
# second conditional fold for the borrow that may itself produce.
.globl x25519_fe64_sub
.type x25519_fe64_sub,@function
.align 32
x25519_fe64_sub:
.Lfe64_sub_body:
.cfi_startproc
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
subq 0(%rdx),%r8
sbbq 8(%rdx),%r9
sbbq 16(%rdx),%r10
sbbq 24(%rdx),%r11
sbbq %rax,%rax # rax = all-ones iff the 256-bit subtract borrowed
andq $38,%rax
subq %rax,%r8 # first fold: -38 on borrow-out
sbbq $0,%r9
sbbq $0,%r10
movq %r9,8(%rdi)
sbbq $0,%r11
movq %r10,16(%rdi)
sbbq %rax,%rax # second (rare) borrow from the fold itself
movq %r11,24(%rdi)
andq $38,%rax
subq %rax,%r8 # final fold; cannot borrow again
movq %r8,0(%rdi)
.Lfe64_sub_epilogue:
.byte 0xf3,0xc3 # rep ret
.cfi_endproc
.size x25519_fe64_sub,.-x25519_fe64_sub
# void x25519_fe64_tobytes(uint8_t out[32], const uint64_t a[4])
# ABI: SysV AMD64 (rdi = out, rsi = a). Leaf function.
# Freezes a field element to its unique fully-reduced representative
# mod 2^255-19 and stores it as four little-endian limbs (32 bytes).
# Uses the classic branch-free trick: add 19 (plus 19 more if bit 255 is
# set), clear bit 255, then conditionally subtract 19 back depending on
# the resulting top bit.
.globl x25519_fe64_tobytes
.type x25519_fe64_tobytes,@function
.align 32
x25519_fe64_tobytes:
.Lfe64_to_body:
.cfi_startproc
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
# First pass: add 19 (or 38 if bit 255 set) and strip bit 255.
leaq (%r11,%r11,1),%rax # rax = a[3] << 1 (drops bit 255)
sarq $63,%r11 # r11 = 0 or -1, from original bit 255
shrq $1,%rax # rax = a[3] with bit 255 cleared
andq $19,%r11 # 19 iff bit 255 was set
addq $19,%r11 # total: 19 or 38
addq %r11,%r8
adcq $0,%r9
adcq $0,%r10
adcq $0,%rax
# Second pass: subtract 19 back unless the new top bit is set.
leaq (%rax,%rax,1),%r11
sarq $63,%rax # rax = 0 or -1, from new bit 255
shrq $1,%r11 # r11 = top limb with bit 255 cleared
notq %rax
andq $19,%rax # 19 iff new bit 255 was clear
subq %rax,%r8
sbbq $0,%r9
sbbq $0,%r10
sbbq $0,%r11
movq %r8,0(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
.Lfe64_to_epilogue:
.byte 0xf3,0xc3 # rep ret
.cfi_endproc
.size x25519_fe64_tobytes,.-x25519_fe64_tobytes
.byte 88,50,53,53,49,57,32,112,114,105,109,105,116,105,118,101,115,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0

View File

@ -16,6 +16,7 @@ bn_mul_mont:
jnz .Lmul_enter
cmpl $8,%r9d
jb .Lmul_enter
movl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpq %rsi,%rdx
jne .Lmul4x_enter
testl $7,%r9d
@ -264,6 +265,9 @@ bn_mul4x_mont:
movq %rsp,%rax
.cfi_def_cfa_register %rax
.Lmul4x_enter:
andl $0x80100,%r11d
cmpl $0x80100,%r11d
je .Lmulx4x_enter
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
@ -689,6 +693,7 @@ bn_mul4x_mont:
.size bn_mul4x_mont,.-bn_mul4x_mont
.type bn_sqr8x_mont,@function
.align 32
bn_sqr8x_mont:
@ -770,6 +775,25 @@ bn_sqr8x_mont:
pxor %xmm0,%xmm0
.byte 102,72,15,110,207
.byte 102,73,15,110,218
movl OPENSSL_ia32cap_P+8(%rip),%eax
andl $0x80100,%eax
cmpl $0x80100,%eax
jne .Lsqr8x_nox
call bn_sqrx8x_internal
leaq (%r8,%rcx,1),%rbx
movq %rcx,%r9
movq %rcx,%rdx
.byte 102,72,15,126,207
sarq $3+2,%rcx
jmp .Lsqr8x_sub
.align 32
.Lsqr8x_nox:
call bn_sqr8x_internal
@ -857,5 +881,361 @@ bn_sqr8x_mont:
.byte 0xf3,0xc3
.cfi_endproc
.size bn_sqr8x_mont,.-bn_sqr8x_mont
# bn_mulx4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                const BN_ULONG *np, const BN_ULONG *n0, int num)
# ABI: SysV AMD64 (rdi,rsi,rdx,rcx,r8,r9). Internal MULX/ADCX/ADOX variant
# of Montgomery multiplication, reached from bn_mul_mont via .Lmulx4x_enter
# when BMI2+ADX are available. Processes 4 limbs of b per outer iteration,
# interleaving the multiply with the Montgomery reduction; presumably
# requires num to be a multiple of 8 (the dispatch in bn_mul_mont gates on
# that -- TODO confirm against the caller).
# Stack frame (after the page-walk probe):
#   0(%rsp)=num*8  8(%rsp)=current bp ptr  16(%rsp)=end of bp
#   24(%rsp)=n0    32(%rsp)=rp             40(%rsp)=saved entry rsp
#   48(%rsp)=num/32-1 (inner-loop count)   64(%rsp)...=temporary product
.type bn_mulx4x_mont,@function
.align 32
bn_mulx4x_mont:
.cfi_startproc
movq %rsp,%rax
.cfi_def_cfa_register %rax
.Lmulx4x_enter:
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
.Lmulx4x_prologue:
# Allocate a num*8 + 72-byte scratch area, 128-byte aligned, probing each
# 4K page on the way down so the guard page is touched in order.
shll $3,%r9d # num *= 8 (limbs -> bytes)
xorq %r10,%r10
subq %r9,%r10 # r10 = -num*8
movq (%r8),%r8 # load the n0 value itself
leaq -72(%rsp,%r10,1),%rbp # prospective bottom of frame
andq $-128,%rbp # 128-byte align
movq %rsp,%r11
subq %rbp,%r11
andq $-4096,%r11
leaq (%r11,%rbp,1),%rsp
movq (%rsp),%r10 # probe current page
cmpq %rbp,%rsp
ja .Lmulx4x_page_walk
jmp .Lmulx4x_page_walk_done
.align 16
.Lmulx4x_page_walk: # touch one page at a time down to rbp
leaq -4096(%rsp),%rsp
movq (%rsp),%r10
cmpq %rbp,%rsp
ja .Lmulx4x_page_walk
.Lmulx4x_page_walk_done:
leaq (%rdx,%r9,1),%r10 # r10 = &bp[num] (end sentinel)
movq %r9,0(%rsp)
shrq $5,%r9 # r9 = num/32 = inner-loop iterations
movq %r10,16(%rsp)
subq $1,%r9
movq %r8,24(%rsp) # save n0
movq %rdi,32(%rsp) # save rp
movq %rax,40(%rsp) # save entry rsp for the epilogue
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
movq %r9,48(%rsp)
jmp .Lmulx4x_body
.align 32
.Lmulx4x_body:
# First outer iteration: t = a * b[0], with reduction by m = t[0]*n0
# interleaved (t + m*n accumulated into 64+32(%rsp)...).
leaq 8(%rdx),%rdi # rdi = &bp[1]
movq (%rdx),%rdx # rdx = b[0] (mulx multiplicand)
leaq 64+32(%rsp),%rbx # rbx = product tail pointer
movq %rdx,%r9 # keep b[0] for re-loading rdx
mulxq 0(%rsi),%r8,%rax
mulxq 8(%rsi),%r11,%r14
addq %rax,%r11
movq %rdi,8(%rsp) # save bp cursor
mulxq 16(%rsi),%r12,%r13
adcq %r14,%r12
adcq $0,%r13
movq %r8,%rdi
imulq 24(%rsp),%r8 # r8 = m = t[0]*n0 mod 2^64
xorq %rbp,%rbp # rbp = 0 carry sink; clears CF/OF
mulxq 24(%rsi),%rax,%r14
movq %r8,%rdx # rdx = m for the n-limbs pass
leaq 32(%rsi),%rsi
adcxq %rax,%r13
adcxq %rbp,%r14
mulxq 0(%rcx),%rax,%r10
adcxq %rax,%rdi # kills t[0] (becomes zero by construction)
adoxq %r11,%r10
mulxq 8(%rcx),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 16(%rcx),%rax,%r12 (hand-encoded)
movq 48(%rsp),%rdi # rdi = inner-loop counter
movq %r10,-32(%rbx)
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 24(%rcx),%rax,%r15
movq %r9,%rdx # back to b[0]
movq %r11,-24(%rbx)
adcxq %rax,%r12
adoxq %rbp,%r15
leaq 32(%rcx),%rcx
movq %r12,-16(%rbx)
jmp .Lmulx4x_1st
.align 32
.Lmulx4x_1st: # remaining limbs of the first outer iteration
adcxq %rbp,%r15
mulxq 0(%rsi),%r10,%rax
adcxq %r14,%r10
mulxq 8(%rsi),%r11,%r14
adcxq %rax,%r11
mulxq 16(%rsi),%r12,%rax
adcxq %r14,%r12
mulxq 24(%rsi),%r13,%r14
.byte 0x67,0x67 # addr-size padding (alignment hint for the decoder)
movq %r8,%rdx # rdx = m
adcxq %rax,%r13
adcxq %rbp,%r14
leaq 32(%rsi),%rsi
leaq 32(%rbx),%rbx
adoxq %r15,%r10
mulxq 0(%rcx),%rax,%r15
adcxq %rax,%r10
adoxq %r15,%r11
mulxq 8(%rcx),%rax,%r15
adcxq %rax,%r11
adoxq %r15,%r12
mulxq 16(%rcx),%rax,%r15
movq %r10,-40(%rbx)
adcxq %rax,%r12
movq %r11,-32(%rbx)
adoxq %r15,%r13
mulxq 24(%rcx),%rax,%r15
movq %r9,%rdx
movq %r12,-24(%rbx)
adcxq %rax,%r13
adoxq %rbp,%r15
leaq 32(%rcx),%rcx
movq %r13,-16(%rbx)
decq %rdi
jnz .Lmulx4x_1st
movq 0(%rsp),%rax # rax = num*8
movq 8(%rsp),%rdi # rdi = bp cursor
adcq %rbp,%r15
addq %r15,%r14
sbbq %r15,%r15 # r15 = -(top carry)
movq %r14,-8(%rbx)
jmp .Lmulx4x_outer
.align 32
.Lmulx4x_outer: # subsequent b[i]: t = t + a*b[i] + m*n, shifted one limb
movq (%rdi),%rdx # rdx = b[i]
leaq 8(%rdi),%rdi
subq %rax,%rsi # rewind ap
movq %r15,(%rbx) # store previous top carry
leaq 64+32(%rsp),%rbx
subq %rax,%rcx # rewind np
mulxq 0(%rsi),%r8,%r11
xorl %ebp,%ebp # clears CF/OF, rbp = 0
movq %rdx,%r9
mulxq 8(%rsi),%r14,%r12
adoxq -32(%rbx),%r8 # accumulate previous t
adcxq %r14,%r11
mulxq 16(%rsi),%r15,%r13
adoxq -24(%rbx),%r11
adcxq %r15,%r12
adoxq -16(%rbx),%r12
adcxq %rbp,%r13
adoxq %rbp,%r13
movq %rdi,8(%rsp)
movq %r8,%r15
imulq 24(%rsp),%r8 # m = t[0]*n0
xorl %ebp,%ebp
mulxq 24(%rsi),%rax,%r14
movq %r8,%rdx
adcxq %rax,%r13
adoxq -8(%rbx),%r13
adcxq %rbp,%r14
leaq 32(%rsi),%rsi
adoxq %rbp,%r14
mulxq 0(%rcx),%rax,%r10
adcxq %rax,%r15 # zeroes t[0]
adoxq %r11,%r10
mulxq 8(%rcx),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
mulxq 16(%rcx),%rax,%r12
movq %r10,-32(%rbx)
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 24(%rcx),%rax,%r15
movq %r9,%rdx
movq %r11,-24(%rbx)
leaq 32(%rcx),%rcx
adcxq %rax,%r12
adoxq %rbp,%r15
movq 48(%rsp),%rdi # inner-loop counter
movq %r12,-16(%rbx)
jmp .Lmulx4x_inner
.align 32
.Lmulx4x_inner:
mulxq 0(%rsi),%r10,%rax
adcxq %rbp,%r15
adoxq %r14,%r10
mulxq 8(%rsi),%r11,%r14
adcxq 0(%rbx),%r10 # + previous t limbs
adoxq %rax,%r11
mulxq 16(%rsi),%r12,%rax
adcxq 8(%rbx),%r11
adoxq %r14,%r12
mulxq 24(%rsi),%r13,%r14
movq %r8,%rdx # rdx = m
adcxq 16(%rbx),%r12
adoxq %rax,%r13
adcxq 24(%rbx),%r13
adoxq %rbp,%r14
leaq 32(%rsi),%rsi
leaq 32(%rbx),%rbx
adcxq %rbp,%r14
adoxq %r15,%r10
mulxq 0(%rcx),%rax,%r15
adcxq %rax,%r10
adoxq %r15,%r11
mulxq 8(%rcx),%rax,%r15
adcxq %rax,%r11
adoxq %r15,%r12
mulxq 16(%rcx),%rax,%r15
movq %r10,-40(%rbx)
adcxq %rax,%r12
adoxq %r15,%r13
mulxq 24(%rcx),%rax,%r15
movq %r9,%rdx
movq %r11,-32(%rbx)
movq %r12,-24(%rbx)
adcxq %rax,%r13
adoxq %rbp,%r15
leaq 32(%rcx),%rcx
movq %r13,-16(%rbx)
decq %rdi
jnz .Lmulx4x_inner
movq 0(%rsp),%rax # num*8
movq 8(%rsp),%rdi # bp cursor
adcq %rbp,%r15
subq 0(%rbx),%rbp # incorporate stored top carry (sets CF)
adcq %r15,%r14
sbbq %r15,%r15 # propagate new top carry
movq %r14,-8(%rbx)
cmpq 16(%rsp),%rdi # done with all of bp?
jne .Lmulx4x_outer
# Final conditional subtraction of n, computed unconditionally into rp...
leaq 64(%rsp),%rbx
subq %rax,%rcx # rewind np
negq %r15 # CF = top carry
movq %rax,%rdx
shrq $3+2,%rax # rax = num/32 = loop count
movq 32(%rsp),%rdi # rdi = rp
jmp .Lmulx4x_sub
.align 32
.Lmulx4x_sub: # rp[] = t[] - n[] (with borrow chain)
movq 0(%rbx),%r11
movq 8(%rbx),%r12
movq 16(%rbx),%r13
movq 24(%rbx),%r14
leaq 32(%rbx),%rbx
sbbq 0(%rcx),%r11
sbbq 8(%rcx),%r12
sbbq 16(%rcx),%r13
sbbq 24(%rcx),%r14
leaq 32(%rcx),%rcx
movq %r11,0(%rdi)
movq %r12,8(%rdi)
movq %r13,16(%rdi)
movq %r14,24(%rdi)
leaq 32(%rdi),%rdi
decq %rax
jnz .Lmulx4x_sub
sbbq $0,%r15 # r15 = mask: 0 if t>=n (keep t-n), -1 otherwise
leaq 64(%rsp),%rbx
subq %rdx,%rdi # rewind rp
.byte 102,73,15,110,207 # movq %r15,%xmm1
pxor %xmm0,%xmm0
pshufd $0,%xmm1,%xmm1 # broadcast the select mask
movq 40(%rsp),%rsi # saved entry rsp
.cfi_def_cfa %rsi,8
jmp .Lmulx4x_cond_copy
.align 32
.Lmulx4x_cond_copy: # constant-time select of t vs t-n into rp; wipe t
movdqa 0(%rbx),%xmm2
movdqa 16(%rbx),%xmm3
leaq 32(%rbx),%rbx
movdqu 0(%rdi),%xmm4
movdqu 16(%rdi),%xmm5
leaq 32(%rdi),%rdi
movdqa %xmm0,-32(%rbx) # zero the scratch copy of t
movdqa %xmm0,-16(%rbx)
pcmpeqd %xmm1,%xmm0 # xmm0 = ~mask
pand %xmm1,%xmm2
pand %xmm1,%xmm3
pand %xmm0,%xmm4
pand %xmm0,%xmm5
pxor %xmm0,%xmm0
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqu %xmm4,-32(%rdi)
movdqu %xmm5,-16(%rdi)
subq $32,%rdx
jnz .Lmulx4x_cond_copy
movq %rdx,(%rbx)
movq $1,%rax # return 1 (success)
movq -48(%rsi),%r15
.cfi_restore %r15
movq -40(%rsi),%r14
.cfi_restore %r14
movq -32(%rsi),%r13
.cfi_restore %r13
movq -24(%rsi),%r12
.cfi_restore %r12
movq -16(%rsi),%rbp
.cfi_restore %rbp
movq -8(%rsi),%rbx
.cfi_restore %rbx
leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lmulx4x_epilogue:
.byte 0xf3,0xc3 # rep ret
.cfi_endproc
.size bn_mulx4x_mont,.-bn_mulx4x_mont
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 16

File diff suppressed because it is too large Load Diff

View File

@ -385,6 +385,8 @@ ChaCha20_ssse3:
pushl %esi
pushl %edi
.Lssse3_shortcut:
testl $2048,4(%ebp)
jnz .Lxop_shortcut
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
@ -528,6 +530,484 @@ ChaCha20_ssse3:
.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte 114,103,62,0
# ChaCha20_xop(out, inp, len, key, counter) -- i386, cdecl
# Args at 20..36(%esp) after the four pushes: edi=out, esi=inp, ecx=len,
# edx=key (32 bytes), ebx=counter block (16 bytes).
# XOP code path for 32-bit x86: processes 4 ChaCha20 blocks in parallel
# (SIMD lanes transposed across registers) while len >= 256, then a
# single-block path (.L0141x) for the remainder and a byte-by-byte tail.
# The .byte 143,232,120,194,... sequences are hand-encoded XOP vprotd
# (rotate) instructions; the trailing immediate is the rotate count
# (16/12/8/7, the ChaCha quarter-round rotations).
# NOTE(review): the lea below computes a PIC pointer to .Lssse3_data from
# %eax, which is not set in this function -- it appears to rely on being
# entered via the .Lxop_shortcut jump from ChaCha20_ssse3 (where %eax holds
# the PIC base); verify before calling ChaCha20_xop directly.
.globl ChaCha20_xop
.type ChaCha20_xop,@function
.align 16
ChaCha20_xop:
.L_ChaCha20_xop_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
.Lxop_shortcut:
movl 20(%esp),%edi # out
movl 24(%esp),%esi # inp
movl 28(%esp),%ecx # len
movl 32(%esp),%edx # key
movl 36(%esp),%ebx # counter
vzeroupper
movl %esp,%ebp # save sp; restored from 512(%esp)
subl $524,%esp
andl $-64,%esp # 64-byte align the frame
movl %ebp,512(%esp)
leal .Lssse3_data-.Lpic_point(%eax),%eax # eax -> constant table (PIC)
vmovdqu (%ebx),%xmm3 # counter/nonce row
cmpl $256,%ecx
jb .L0141x # < 4 blocks: single-block path
# ---- 4-way setup: splat state into 16 x 4-lane columns at %ebp frame ----
movl %edx,516(%esp)
movl %ebx,520(%esp)
subl $256,%ecx
leal 384(%esp),%ebp
vmovdqu (%edx),%xmm7 # key[0..3]
vpshufd $0,%xmm3,%xmm0
vpshufd $85,%xmm3,%xmm1
vpshufd $170,%xmm3,%xmm2
vpshufd $255,%xmm3,%xmm3
vpaddd 48(%eax),%xmm0,%xmm0 # per-lane counter increments 0..3
vpshufd $0,%xmm7,%xmm4
vpshufd $85,%xmm7,%xmm5
vpsubd 64(%eax),%xmm0,%xmm0
vpshufd $170,%xmm7,%xmm6
vpshufd $255,%xmm7,%xmm7
vmovdqa %xmm0,64(%ebp)
vmovdqa %xmm1,80(%ebp)
vmovdqa %xmm2,96(%ebp)
vmovdqa %xmm3,112(%ebp)
vmovdqu 16(%edx),%xmm3 # key[4..7]
vmovdqa %xmm4,-64(%ebp)
vmovdqa %xmm5,-48(%ebp)
vmovdqa %xmm6,-32(%ebp)
vmovdqa %xmm7,-16(%ebp)
vmovdqa 32(%eax),%xmm7 # "expand 32-byte k" sigma constants
leal 128(%esp),%ebx # ebx -> working copy of the 16 columns
vpshufd $0,%xmm3,%xmm0
vpshufd $85,%xmm3,%xmm1
vpshufd $170,%xmm3,%xmm2
vpshufd $255,%xmm3,%xmm3
vpshufd $0,%xmm7,%xmm4
vpshufd $85,%xmm7,%xmm5
vpshufd $170,%xmm7,%xmm6
vpshufd $255,%xmm7,%xmm7
vmovdqa %xmm0,(%ebp)
vmovdqa %xmm1,16(%ebp)
vmovdqa %xmm2,32(%ebp)
vmovdqa %xmm3,48(%ebp)
vmovdqa %xmm4,-128(%ebp)
vmovdqa %xmm5,-112(%ebp)
vmovdqa %xmm6,-96(%ebp)
vmovdqa %xmm7,-80(%ebp)
leal 128(%esi),%esi
leal 128(%edi),%edi
jmp .L015outer_loop
.align 32
.L015outer_loop: # refresh working state; bump the 4 lane counters
vmovdqa -112(%ebp),%xmm1
vmovdqa -96(%ebp),%xmm2
vmovdqa -80(%ebp),%xmm3
vmovdqa -48(%ebp),%xmm5
vmovdqa -32(%ebp),%xmm6
vmovdqa -16(%ebp),%xmm7
vmovdqa %xmm1,-112(%ebx)
vmovdqa %xmm2,-96(%ebx)
vmovdqa %xmm3,-80(%ebx)
vmovdqa %xmm5,-48(%ebx)
vmovdqa %xmm6,-32(%ebx)
vmovdqa %xmm7,-16(%ebx)
vmovdqa 32(%ebp),%xmm2
vmovdqa 48(%ebp),%xmm3
vmovdqa 64(%ebp),%xmm4
vmovdqa 80(%ebp),%xmm5
vmovdqa 96(%ebp),%xmm6
vmovdqa 112(%ebp),%xmm7
vpaddd 64(%eax),%xmm4,%xmm4 # counters += 4
vmovdqa %xmm2,32(%ebx)
vmovdqa %xmm3,48(%ebx)
vmovdqa %xmm4,64(%ebx)
vmovdqa %xmm5,80(%ebx)
vmovdqa %xmm6,96(%ebx)
vmovdqa %xmm7,112(%ebx)
vmovdqa %xmm4,64(%ebp)
vmovdqa -128(%ebp),%xmm0
vmovdqa %xmm4,%xmm6
vmovdqa -64(%ebp),%xmm3
vmovdqa (%ebp),%xmm4
vmovdqa 16(%ebp),%xmm5
movl $10,%edx # 10 double-rounds = 20 rounds
nop
.align 32
.L016loop: # one double-round over all 16 columns (registers + spills)
vpaddd %xmm3,%xmm0,%xmm0
vpxor %xmm0,%xmm6,%xmm6
.byte 143,232,120,194,246,16 # vprotd $16,%xmm6,%xmm6
vpaddd %xmm6,%xmm4,%xmm4
vpxor %xmm4,%xmm3,%xmm2
vmovdqa -112(%ebx),%xmm1
.byte 143,232,120,194,210,12 # vprotd $12,%xmm2,%xmm2
vmovdqa -48(%ebx),%xmm3
vpaddd %xmm2,%xmm0,%xmm0
vmovdqa 80(%ebx),%xmm7
vpxor %xmm0,%xmm6,%xmm6
vpaddd %xmm3,%xmm1,%xmm1
.byte 143,232,120,194,246,8 # vprotd $8,%xmm6,%xmm6
vmovdqa %xmm0,-128(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa %xmm6,64(%ebx)
vpxor %xmm4,%xmm2,%xmm2
vpxor %xmm1,%xmm7,%xmm7
.byte 143,232,120,194,210,7 # vprotd $7,%xmm2,%xmm2
vmovdqa %xmm4,(%ebx)
.byte 143,232,120,194,255,16 # vprotd $16,%xmm7,%xmm7
vmovdqa %xmm2,-64(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vmovdqa 32(%ebx),%xmm4
vpxor %xmm5,%xmm3,%xmm3
vmovdqa -96(%ebx),%xmm0
.byte 143,232,120,194,219,12 # vprotd $12,%xmm3,%xmm3
vmovdqa -32(%ebx),%xmm2
vpaddd %xmm3,%xmm1,%xmm1
vmovdqa 96(%ebx),%xmm6
vpxor %xmm1,%xmm7,%xmm7
vpaddd %xmm2,%xmm0,%xmm0
.byte 143,232,120,194,255,8 # vprotd $8,%xmm7,%xmm7
vmovdqa %xmm1,-112(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vmovdqa %xmm7,80(%ebx)
vpxor %xmm5,%xmm3,%xmm3
vpxor %xmm0,%xmm6,%xmm6
.byte 143,232,120,194,219,7 # vprotd $7,%xmm3,%xmm3
vmovdqa %xmm5,16(%ebx)
.byte 143,232,120,194,246,16 # vprotd $16,%xmm6,%xmm6
vmovdqa %xmm3,-48(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa 48(%ebx),%xmm5
vpxor %xmm4,%xmm2,%xmm2
vmovdqa -80(%ebx),%xmm1
.byte 143,232,120,194,210,12 # vprotd $12,%xmm2,%xmm2
vmovdqa -16(%ebx),%xmm3
vpaddd %xmm2,%xmm0,%xmm0
vmovdqa 112(%ebx),%xmm7
vpxor %xmm0,%xmm6,%xmm6
vpaddd %xmm3,%xmm1,%xmm1
.byte 143,232,120,194,246,8 # vprotd $8,%xmm6,%xmm6
vmovdqa %xmm0,-96(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa %xmm6,96(%ebx)
vpxor %xmm4,%xmm2,%xmm2
vpxor %xmm1,%xmm7,%xmm7
.byte 143,232,120,194,210,7 # vprotd $7,%xmm2,%xmm2
.byte 143,232,120,194,255,16 # vprotd $16,%xmm7,%xmm7
vmovdqa %xmm2,-32(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vpxor %xmm5,%xmm3,%xmm3
vmovdqa -128(%ebx),%xmm0
.byte 143,232,120,194,219,12 # vprotd $12,%xmm3,%xmm3
vmovdqa -48(%ebx),%xmm2
vpaddd %xmm3,%xmm1,%xmm1
vpxor %xmm1,%xmm7,%xmm7
vpaddd %xmm2,%xmm0,%xmm0
.byte 143,232,120,194,255,8 # vprotd $8,%xmm7,%xmm7
vmovdqa %xmm1,-80(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vpxor %xmm5,%xmm3,%xmm3
vpxor %xmm0,%xmm7,%xmm6
.byte 143,232,120,194,219,7 # vprotd $7,%xmm3,%xmm3
.byte 143,232,120,194,246,16 # vprotd $16,%xmm6,%xmm6
vmovdqa %xmm3,-16(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vpxor %xmm4,%xmm2,%xmm2
vmovdqa -112(%ebx),%xmm1
.byte 143,232,120,194,210,12 # vprotd $12,%xmm2,%xmm2
vmovdqa -32(%ebx),%xmm3
vpaddd %xmm2,%xmm0,%xmm0
vmovdqa 64(%ebx),%xmm7
vpxor %xmm0,%xmm6,%xmm6
vpaddd %xmm3,%xmm1,%xmm1
.byte 143,232,120,194,246,8 # vprotd $8,%xmm6,%xmm6
vmovdqa %xmm0,-128(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa %xmm6,112(%ebx)
vpxor %xmm4,%xmm2,%xmm2
vpxor %xmm1,%xmm7,%xmm7
.byte 143,232,120,194,210,7 # vprotd $7,%xmm2,%xmm2
vmovdqa %xmm4,32(%ebx)
.byte 143,232,120,194,255,16 # vprotd $16,%xmm7,%xmm7
vmovdqa %xmm2,-48(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vmovdqa (%ebx),%xmm4
vpxor %xmm5,%xmm3,%xmm3
vmovdqa -96(%ebx),%xmm0
.byte 143,232,120,194,219,12 # vprotd $12,%xmm3,%xmm3
vmovdqa -16(%ebx),%xmm2
vpaddd %xmm3,%xmm1,%xmm1
vmovdqa 80(%ebx),%xmm6
vpxor %xmm1,%xmm7,%xmm7
vpaddd %xmm2,%xmm0,%xmm0
.byte 143,232,120,194,255,8 # vprotd $8,%xmm7,%xmm7
vmovdqa %xmm1,-112(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vmovdqa %xmm7,64(%ebx)
vpxor %xmm5,%xmm3,%xmm3
vpxor %xmm0,%xmm6,%xmm6
.byte 143,232,120,194,219,7 # vprotd $7,%xmm3,%xmm3
vmovdqa %xmm5,48(%ebx)
.byte 143,232,120,194,246,16 # vprotd $16,%xmm6,%xmm6
vmovdqa %xmm3,-32(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa 16(%ebx),%xmm5
vpxor %xmm4,%xmm2,%xmm2
vmovdqa -80(%ebx),%xmm1
.byte 143,232,120,194,210,12 # vprotd $12,%xmm2,%xmm2
vmovdqa -64(%ebx),%xmm3
vpaddd %xmm2,%xmm0,%xmm0
vmovdqa 96(%ebx),%xmm7
vpxor %xmm0,%xmm6,%xmm6
vpaddd %xmm3,%xmm1,%xmm1
.byte 143,232,120,194,246,8 # vprotd $8,%xmm6,%xmm6
vmovdqa %xmm0,-96(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa %xmm6,80(%ebx)
vpxor %xmm4,%xmm2,%xmm2
vpxor %xmm1,%xmm7,%xmm7
.byte 143,232,120,194,210,7 # vprotd $7,%xmm2,%xmm2
.byte 143,232,120,194,255,16 # vprotd $16,%xmm7,%xmm7
vmovdqa %xmm2,-16(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vpxor %xmm5,%xmm3,%xmm3
vmovdqa -128(%ebx),%xmm0
.byte 143,232,120,194,219,12 # vprotd $12,%xmm3,%xmm3
vpaddd %xmm3,%xmm1,%xmm1
vmovdqa 64(%ebx),%xmm6
vpxor %xmm1,%xmm7,%xmm7
.byte 143,232,120,194,255,8 # vprotd $8,%xmm7,%xmm7
vmovdqa %xmm1,-80(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vmovdqa %xmm7,96(%ebx)
vpxor %xmm5,%xmm3,%xmm3
.byte 143,232,120,194,219,7 # vprotd $7,%xmm3,%xmm3
decl %edx
jnz .L016loop
# ---- add original state, transpose 4x4 lane groups, XOR with input ----
vmovdqa %xmm3,-64(%ebx)
vmovdqa %xmm4,(%ebx)
vmovdqa %xmm5,16(%ebx)
vmovdqa %xmm6,64(%ebx)
vmovdqa %xmm7,96(%ebx)
vmovdqa -112(%ebx),%xmm1
vmovdqa -96(%ebx),%xmm2
vmovdqa -80(%ebx),%xmm3
vpaddd -128(%ebp),%xmm0,%xmm0
vpaddd -112(%ebp),%xmm1,%xmm1
vpaddd -96(%ebp),%xmm2,%xmm2
vpaddd -80(%ebp),%xmm3,%xmm3
vpunpckldq %xmm1,%xmm0,%xmm6
vpunpckldq %xmm3,%xmm2,%xmm7
vpunpckhdq %xmm1,%xmm0,%xmm0
vpunpckhdq %xmm3,%xmm2,%xmm2
vpunpcklqdq %xmm7,%xmm6,%xmm1
vpunpckhqdq %xmm7,%xmm6,%xmm6
vpunpcklqdq %xmm2,%xmm0,%xmm7
vpunpckhqdq %xmm2,%xmm0,%xmm3
vpxor -128(%esi),%xmm1,%xmm4
vpxor -64(%esi),%xmm6,%xmm5
vpxor (%esi),%xmm7,%xmm6
vpxor 64(%esi),%xmm3,%xmm7
leal 16(%esi),%esi
vmovdqa -64(%ebx),%xmm0
vmovdqa -48(%ebx),%xmm1
vmovdqa -32(%ebx),%xmm2
vmovdqa -16(%ebx),%xmm3
vmovdqu %xmm4,-128(%edi)
vmovdqu %xmm5,-64(%edi)
vmovdqu %xmm6,(%edi)
vmovdqu %xmm7,64(%edi)
leal 16(%edi),%edi
vpaddd -64(%ebp),%xmm0,%xmm0
vpaddd -48(%ebp),%xmm1,%xmm1
vpaddd -32(%ebp),%xmm2,%xmm2
vpaddd -16(%ebp),%xmm3,%xmm3
vpunpckldq %xmm1,%xmm0,%xmm6
vpunpckldq %xmm3,%xmm2,%xmm7
vpunpckhdq %xmm1,%xmm0,%xmm0
vpunpckhdq %xmm3,%xmm2,%xmm2
vpunpcklqdq %xmm7,%xmm6,%xmm1
vpunpckhqdq %xmm7,%xmm6,%xmm6
vpunpcklqdq %xmm2,%xmm0,%xmm7
vpunpckhqdq %xmm2,%xmm0,%xmm3
vpxor -128(%esi),%xmm1,%xmm4
vpxor -64(%esi),%xmm6,%xmm5
vpxor (%esi),%xmm7,%xmm6
vpxor 64(%esi),%xmm3,%xmm7
leal 16(%esi),%esi
vmovdqa (%ebx),%xmm0
vmovdqa 16(%ebx),%xmm1
vmovdqa 32(%ebx),%xmm2
vmovdqa 48(%ebx),%xmm3
vmovdqu %xmm4,-128(%edi)
vmovdqu %xmm5,-64(%edi)
vmovdqu %xmm6,(%edi)
vmovdqu %xmm7,64(%edi)
leal 16(%edi),%edi
vpaddd (%ebp),%xmm0,%xmm0
vpaddd 16(%ebp),%xmm1,%xmm1
vpaddd 32(%ebp),%xmm2,%xmm2
vpaddd 48(%ebp),%xmm3,%xmm3
vpunpckldq %xmm1,%xmm0,%xmm6
vpunpckldq %xmm3,%xmm2,%xmm7
vpunpckhdq %xmm1,%xmm0,%xmm0
vpunpckhdq %xmm3,%xmm2,%xmm2
vpunpcklqdq %xmm7,%xmm6,%xmm1
vpunpckhqdq %xmm7,%xmm6,%xmm6
vpunpcklqdq %xmm2,%xmm0,%xmm7
vpunpckhqdq %xmm2,%xmm0,%xmm3
vpxor -128(%esi),%xmm1,%xmm4
vpxor -64(%esi),%xmm6,%xmm5
vpxor (%esi),%xmm7,%xmm6
vpxor 64(%esi),%xmm3,%xmm7
leal 16(%esi),%esi
vmovdqa 64(%ebx),%xmm0
vmovdqa 80(%ebx),%xmm1
vmovdqa 96(%ebx),%xmm2
vmovdqa 112(%ebx),%xmm3
vmovdqu %xmm4,-128(%edi)
vmovdqu %xmm5,-64(%edi)
vmovdqu %xmm6,(%edi)
vmovdqu %xmm7,64(%edi)
leal 16(%edi),%edi
vpaddd 64(%ebp),%xmm0,%xmm0
vpaddd 80(%ebp),%xmm1,%xmm1
vpaddd 96(%ebp),%xmm2,%xmm2
vpaddd 112(%ebp),%xmm3,%xmm3
vpunpckldq %xmm1,%xmm0,%xmm6
vpunpckldq %xmm3,%xmm2,%xmm7
vpunpckhdq %xmm1,%xmm0,%xmm0
vpunpckhdq %xmm3,%xmm2,%xmm2
vpunpcklqdq %xmm7,%xmm6,%xmm1
vpunpckhqdq %xmm7,%xmm6,%xmm6
vpunpcklqdq %xmm2,%xmm0,%xmm7
vpunpckhqdq %xmm2,%xmm0,%xmm3
vpxor -128(%esi),%xmm1,%xmm4
vpxor -64(%esi),%xmm6,%xmm5
vpxor (%esi),%xmm7,%xmm6
vpxor 64(%esi),%xmm3,%xmm7
leal 208(%esi),%esi # advance past the 4 consumed blocks
vmovdqu %xmm4,-128(%edi)
vmovdqu %xmm5,-64(%edi)
vmovdqu %xmm6,(%edi)
vmovdqu %xmm7,64(%edi)
leal 208(%edi),%edi
subl $256,%ecx
jnc .L015outer_loop
addl $256,%ecx
jz .L017done
# Re-derive the counter block for the single-block remainder path.
movl 520(%esp),%ebx
leal -128(%esi),%esi
movl 516(%esp),%edx
leal -128(%edi),%edi
vmovd 64(%ebp),%xmm2 # current lane-0 counter
vmovdqu (%ebx),%xmm3
vpaddd 96(%eax),%xmm2,%xmm2
vpand 112(%eax),%xmm3,%xmm3
vpor %xmm2,%xmm3,%xmm3 # splice counter into nonce row
.L0141x: # single-block path: state in xmm0..xmm3 = sigma,key0,key1,counter
vmovdqa 32(%eax),%xmm0
vmovdqu (%edx),%xmm1
vmovdqu 16(%edx),%xmm2
vmovdqa (%eax),%xmm6 # rotate-by-16 / by-8 pshufb masks (unused by XOP rounds)
vmovdqa 16(%eax),%xmm7
movl %ebp,48(%esp)
vmovdqa %xmm0,(%esp)
vmovdqa %xmm1,16(%esp)
vmovdqa %xmm2,32(%esp)
vmovdqa %xmm3,48(%esp)
movl $10,%edx
jmp .L018loop1x
.align 16
.L019outer1x: # subsequent single blocks: bump counter, reload state
vmovdqa 80(%eax),%xmm3
vmovdqa (%esp),%xmm0
vmovdqa 16(%esp),%xmm1
vmovdqa 32(%esp),%xmm2
vpaddd 48(%esp),%xmm3,%xmm3
movl $10,%edx
vmovdqa %xmm3,48(%esp)
jmp .L018loop1x
.align 16
.L018loop1x: # one double-round on a single block (vprotd rotates)
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
.byte 143,232,120,194,219,16 # vprotd $16,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
.byte 143,232,120,194,201,12 # vprotd $12,%xmm1,%xmm1
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
.byte 143,232,120,194,219,8 # vprotd $8,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
.byte 143,232,120,194,201,7 # vprotd $7,%xmm1,%xmm1
vpshufd $78,%xmm2,%xmm2 # diagonalize
vpshufd $57,%xmm1,%xmm1
vpshufd $147,%xmm3,%xmm3
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
.byte 143,232,120,194,219,16 # vprotd $16,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
.byte 143,232,120,194,201,12 # vprotd $12,%xmm1,%xmm1
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
.byte 143,232,120,194,219,8 # vprotd $8,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
.byte 143,232,120,194,201,7 # vprotd $7,%xmm1,%xmm1
vpshufd $78,%xmm2,%xmm2 # un-diagonalize
vpshufd $147,%xmm1,%xmm1
vpshufd $57,%xmm3,%xmm3
decl %edx
jnz .L018loop1x
vpaddd (%esp),%xmm0,%xmm0 # add original state
vpaddd 16(%esp),%xmm1,%xmm1
vpaddd 32(%esp),%xmm2,%xmm2
vpaddd 48(%esp),%xmm3,%xmm3
cmpl $64,%ecx
jb .L020tail # partial final block
vpxor (%esi),%xmm0,%xmm0 # XOR keystream with input
vpxor 16(%esi),%xmm1,%xmm1
vpxor 32(%esi),%xmm2,%xmm2
vpxor 48(%esi),%xmm3,%xmm3
leal 64(%esi),%esi
vmovdqu %xmm0,(%edi)
vmovdqu %xmm1,16(%edi)
vmovdqu %xmm2,32(%edi)
vmovdqu %xmm3,48(%edi)
leal 64(%edi),%edi
subl $64,%ecx
jnz .L019outer1x
jmp .L017done
.L020tail: # < 64 bytes left: spill keystream, XOR byte by byte
vmovdqa %xmm0,(%esp)
vmovdqa %xmm1,16(%esp)
vmovdqa %xmm2,32(%esp)
vmovdqa %xmm3,48(%esp)
xorl %eax,%eax
xorl %edx,%edx
xorl %ebp,%ebp
.L021tail_loop:
movb (%esp,%ebp,1),%al # keystream byte
movb (%esi,%ebp,1),%dl # input byte
leal 1(%ebp),%ebp
xorb %dl,%al
movb %al,-1(%edi,%ebp,1)
decl %ecx
jnz .L021tail_loop
.L017done:
vzeroupper
movl 512(%esp),%esp # restore original stack pointer
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size ChaCha20_xop,.-.L_ChaCha20_xop_begin
.comm OPENSSL_ia32cap_P,16,4
#else
.text
@ -914,6 +1394,8 @@ ChaCha20_ssse3:
pushl %esi
pushl %edi
.Lssse3_shortcut:
testl $2048,4(%ebp)
jnz .Lxop_shortcut
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
@ -1057,5 +1539,483 @@ ChaCha20_ssse3:
.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte 114,103,62,0
.globl ChaCha20_xop
.type ChaCha20_xop,@function
.align 16
ChaCha20_xop:
.L_ChaCha20_xop_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
.Lxop_shortcut:
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
movl 32(%esp),%edx
movl 36(%esp),%ebx
vzeroupper
movl %esp,%ebp
subl $524,%esp
andl $-64,%esp
movl %ebp,512(%esp)
leal .Lssse3_data-.Lpic_point(%eax),%eax
vmovdqu (%ebx),%xmm3
cmpl $256,%ecx
jb .L0141x
movl %edx,516(%esp)
movl %ebx,520(%esp)
subl $256,%ecx
leal 384(%esp),%ebp
vmovdqu (%edx),%xmm7
vpshufd $0,%xmm3,%xmm0
vpshufd $85,%xmm3,%xmm1
vpshufd $170,%xmm3,%xmm2
vpshufd $255,%xmm3,%xmm3
vpaddd 48(%eax),%xmm0,%xmm0
vpshufd $0,%xmm7,%xmm4
vpshufd $85,%xmm7,%xmm5
vpsubd 64(%eax),%xmm0,%xmm0
vpshufd $170,%xmm7,%xmm6
vpshufd $255,%xmm7,%xmm7
vmovdqa %xmm0,64(%ebp)
vmovdqa %xmm1,80(%ebp)
vmovdqa %xmm2,96(%ebp)
vmovdqa %xmm3,112(%ebp)
vmovdqu 16(%edx),%xmm3
vmovdqa %xmm4,-64(%ebp)
vmovdqa %xmm5,-48(%ebp)
vmovdqa %xmm6,-32(%ebp)
vmovdqa %xmm7,-16(%ebp)
vmovdqa 32(%eax),%xmm7
leal 128(%esp),%ebx
vpshufd $0,%xmm3,%xmm0
vpshufd $85,%xmm3,%xmm1
vpshufd $170,%xmm3,%xmm2
vpshufd $255,%xmm3,%xmm3
vpshufd $0,%xmm7,%xmm4
vpshufd $85,%xmm7,%xmm5
vpshufd $170,%xmm7,%xmm6
vpshufd $255,%xmm7,%xmm7
vmovdqa %xmm0,(%ebp)
vmovdqa %xmm1,16(%ebp)
vmovdqa %xmm2,32(%ebp)
vmovdqa %xmm3,48(%ebp)
vmovdqa %xmm4,-128(%ebp)
vmovdqa %xmm5,-112(%ebp)
vmovdqa %xmm6,-96(%ebp)
vmovdqa %xmm7,-80(%ebp)
leal 128(%esi),%esi
leal 128(%edi),%edi
jmp .L015outer_loop
.align 32
.L015outer_loop:
vmovdqa -112(%ebp),%xmm1
vmovdqa -96(%ebp),%xmm2
vmovdqa -80(%ebp),%xmm3
vmovdqa -48(%ebp),%xmm5
vmovdqa -32(%ebp),%xmm6
vmovdqa -16(%ebp),%xmm7
vmovdqa %xmm1,-112(%ebx)
vmovdqa %xmm2,-96(%ebx)
vmovdqa %xmm3,-80(%ebx)
vmovdqa %xmm5,-48(%ebx)
vmovdqa %xmm6,-32(%ebx)
vmovdqa %xmm7,-16(%ebx)
vmovdqa 32(%ebp),%xmm2
vmovdqa 48(%ebp),%xmm3
vmovdqa 64(%ebp),%xmm4
vmovdqa 80(%ebp),%xmm5
vmovdqa 96(%ebp),%xmm6
vmovdqa 112(%ebp),%xmm7
vpaddd 64(%eax),%xmm4,%xmm4
vmovdqa %xmm2,32(%ebx)
vmovdqa %xmm3,48(%ebx)
vmovdqa %xmm4,64(%ebx)
vmovdqa %xmm5,80(%ebx)
vmovdqa %xmm6,96(%ebx)
vmovdqa %xmm7,112(%ebx)
vmovdqa %xmm4,64(%ebp)
vmovdqa -128(%ebp),%xmm0
vmovdqa %xmm4,%xmm6
vmovdqa -64(%ebp),%xmm3
vmovdqa (%ebp),%xmm4
vmovdqa 16(%ebp),%xmm5
movl $10,%edx
nop
.align 32
.L016loop:
vpaddd %xmm3,%xmm0,%xmm0
vpxor %xmm0,%xmm6,%xmm6
.byte 143,232,120,194,246,16
vpaddd %xmm6,%xmm4,%xmm4
vpxor %xmm4,%xmm3,%xmm2
vmovdqa -112(%ebx),%xmm1
.byte 143,232,120,194,210,12
vmovdqa -48(%ebx),%xmm3
vpaddd %xmm2,%xmm0,%xmm0
vmovdqa 80(%ebx),%xmm7
vpxor %xmm0,%xmm6,%xmm6
vpaddd %xmm3,%xmm1,%xmm1
.byte 143,232,120,194,246,8
vmovdqa %xmm0,-128(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa %xmm6,64(%ebx)
vpxor %xmm4,%xmm2,%xmm2
vpxor %xmm1,%xmm7,%xmm7
.byte 143,232,120,194,210,7
vmovdqa %xmm4,(%ebx)
.byte 143,232,120,194,255,16
vmovdqa %xmm2,-64(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vmovdqa 32(%ebx),%xmm4
vpxor %xmm5,%xmm3,%xmm3
vmovdqa -96(%ebx),%xmm0
.byte 143,232,120,194,219,12
vmovdqa -32(%ebx),%xmm2
vpaddd %xmm3,%xmm1,%xmm1
vmovdqa 96(%ebx),%xmm6
vpxor %xmm1,%xmm7,%xmm7
vpaddd %xmm2,%xmm0,%xmm0
.byte 143,232,120,194,255,8
vmovdqa %xmm1,-112(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vmovdqa %xmm7,80(%ebx)
vpxor %xmm5,%xmm3,%xmm3
vpxor %xmm0,%xmm6,%xmm6
.byte 143,232,120,194,219,7
vmovdqa %xmm5,16(%ebx)
.byte 143,232,120,194,246,16
vmovdqa %xmm3,-48(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa 48(%ebx),%xmm5
vpxor %xmm4,%xmm2,%xmm2
vmovdqa -80(%ebx),%xmm1
.byte 143,232,120,194,210,12
vmovdqa -16(%ebx),%xmm3
vpaddd %xmm2,%xmm0,%xmm0
vmovdqa 112(%ebx),%xmm7
vpxor %xmm0,%xmm6,%xmm6
vpaddd %xmm3,%xmm1,%xmm1
.byte 143,232,120,194,246,8
vmovdqa %xmm0,-96(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa %xmm6,96(%ebx)
vpxor %xmm4,%xmm2,%xmm2
vpxor %xmm1,%xmm7,%xmm7
.byte 143,232,120,194,210,7
.byte 143,232,120,194,255,16
vmovdqa %xmm2,-32(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vpxor %xmm5,%xmm3,%xmm3
vmovdqa -128(%ebx),%xmm0
.byte 143,232,120,194,219,12
vmovdqa -48(%ebx),%xmm2
vpaddd %xmm3,%xmm1,%xmm1
vpxor %xmm1,%xmm7,%xmm7
vpaddd %xmm2,%xmm0,%xmm0
.byte 143,232,120,194,255,8
vmovdqa %xmm1,-80(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vpxor %xmm5,%xmm3,%xmm3
vpxor %xmm0,%xmm7,%xmm6
.byte 143,232,120,194,219,7
.byte 143,232,120,194,246,16
vmovdqa %xmm3,-16(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vpxor %xmm4,%xmm2,%xmm2
vmovdqa -112(%ebx),%xmm1
.byte 143,232,120,194,210,12
vmovdqa -32(%ebx),%xmm3
vpaddd %xmm2,%xmm0,%xmm0
vmovdqa 64(%ebx),%xmm7
vpxor %xmm0,%xmm6,%xmm6
vpaddd %xmm3,%xmm1,%xmm1
.byte 143,232,120,194,246,8
vmovdqa %xmm0,-128(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa %xmm6,112(%ebx)
vpxor %xmm4,%xmm2,%xmm2
vpxor %xmm1,%xmm7,%xmm7
.byte 143,232,120,194,210,7
vmovdqa %xmm4,32(%ebx)
.byte 143,232,120,194,255,16
vmovdqa %xmm2,-48(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vmovdqa (%ebx),%xmm4
vpxor %xmm5,%xmm3,%xmm3
vmovdqa -96(%ebx),%xmm0
.byte 143,232,120,194,219,12
vmovdqa -16(%ebx),%xmm2
vpaddd %xmm3,%xmm1,%xmm1
vmovdqa 80(%ebx),%xmm6
vpxor %xmm1,%xmm7,%xmm7
vpaddd %xmm2,%xmm0,%xmm0
.byte 143,232,120,194,255,8
vmovdqa %xmm1,-112(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vmovdqa %xmm7,64(%ebx)
vpxor %xmm5,%xmm3,%xmm3
vpxor %xmm0,%xmm6,%xmm6
.byte 143,232,120,194,219,7
vmovdqa %xmm5,48(%ebx)
.byte 143,232,120,194,246,16
vmovdqa %xmm3,-32(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa 16(%ebx),%xmm5
vpxor %xmm4,%xmm2,%xmm2
vmovdqa -80(%ebx),%xmm1
.byte 143,232,120,194,210,12
vmovdqa -64(%ebx),%xmm3
vpaddd %xmm2,%xmm0,%xmm0
vmovdqa 96(%ebx),%xmm7
vpxor %xmm0,%xmm6,%xmm6
vpaddd %xmm3,%xmm1,%xmm1
.byte 143,232,120,194,246,8
vmovdqa %xmm0,-96(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa %xmm6,80(%ebx)
vpxor %xmm4,%xmm2,%xmm2
vpxor %xmm1,%xmm7,%xmm7
.byte 143,232,120,194,210,7
.byte 143,232,120,194,255,16
vmovdqa %xmm2,-16(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vpxor %xmm5,%xmm3,%xmm3
vmovdqa -128(%ebx),%xmm0
.byte 143,232,120,194,219,12
vpaddd %xmm3,%xmm1,%xmm1
vmovdqa 64(%ebx),%xmm6
vpxor %xmm1,%xmm7,%xmm7
.byte 143,232,120,194,255,8
vmovdqa %xmm1,-80(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vmovdqa %xmm7,96(%ebx)
vpxor %xmm5,%xmm3,%xmm3
.byte 143,232,120,194,219,7
decl %edx
jnz .L016loop
vmovdqa %xmm3,-64(%ebx)
vmovdqa %xmm4,(%ebx)
vmovdqa %xmm5,16(%ebx)
vmovdqa %xmm6,64(%ebx)
vmovdqa %xmm7,96(%ebx)
vmovdqa -112(%ebx),%xmm1
vmovdqa -96(%ebx),%xmm2
vmovdqa -80(%ebx),%xmm3
vpaddd -128(%ebp),%xmm0,%xmm0
vpaddd -112(%ebp),%xmm1,%xmm1
vpaddd -96(%ebp),%xmm2,%xmm2
vpaddd -80(%ebp),%xmm3,%xmm3
vpunpckldq %xmm1,%xmm0,%xmm6
vpunpckldq %xmm3,%xmm2,%xmm7
vpunpckhdq %xmm1,%xmm0,%xmm0
vpunpckhdq %xmm3,%xmm2,%xmm2
vpunpcklqdq %xmm7,%xmm6,%xmm1
vpunpckhqdq %xmm7,%xmm6,%xmm6
vpunpcklqdq %xmm2,%xmm0,%xmm7
vpunpckhqdq %xmm2,%xmm0,%xmm3
vpxor -128(%esi),%xmm1,%xmm4
vpxor -64(%esi),%xmm6,%xmm5
vpxor (%esi),%xmm7,%xmm6
vpxor 64(%esi),%xmm3,%xmm7
leal 16(%esi),%esi
vmovdqa -64(%ebx),%xmm0
vmovdqa -48(%ebx),%xmm1
vmovdqa -32(%ebx),%xmm2
vmovdqa -16(%ebx),%xmm3
vmovdqu %xmm4,-128(%edi)
vmovdqu %xmm5,-64(%edi)
vmovdqu %xmm6,(%edi)
vmovdqu %xmm7,64(%edi)
leal 16(%edi),%edi
vpaddd -64(%ebp),%xmm0,%xmm0
vpaddd -48(%ebp),%xmm1,%xmm1
vpaddd -32(%ebp),%xmm2,%xmm2
vpaddd -16(%ebp),%xmm3,%xmm3
vpunpckldq %xmm1,%xmm0,%xmm6
vpunpckldq %xmm3,%xmm2,%xmm7
vpunpckhdq %xmm1,%xmm0,%xmm0
vpunpckhdq %xmm3,%xmm2,%xmm2
vpunpcklqdq %xmm7,%xmm6,%xmm1
vpunpckhqdq %xmm7,%xmm6,%xmm6
vpunpcklqdq %xmm2,%xmm0,%xmm7
vpunpckhqdq %xmm2,%xmm0,%xmm3
vpxor -128(%esi),%xmm1,%xmm4
vpxor -64(%esi),%xmm6,%xmm5
vpxor (%esi),%xmm7,%xmm6
vpxor 64(%esi),%xmm3,%xmm7
leal 16(%esi),%esi
vmovdqa (%ebx),%xmm0
vmovdqa 16(%ebx),%xmm1
vmovdqa 32(%ebx),%xmm2
vmovdqa 48(%ebx),%xmm3
vmovdqu %xmm4,-128(%edi)
vmovdqu %xmm5,-64(%edi)
vmovdqu %xmm6,(%edi)
vmovdqu %xmm7,64(%edi)
leal 16(%edi),%edi
vpaddd (%ebp),%xmm0,%xmm0
vpaddd 16(%ebp),%xmm1,%xmm1
vpaddd 32(%ebp),%xmm2,%xmm2
vpaddd 48(%ebp),%xmm3,%xmm3
vpunpckldq %xmm1,%xmm0,%xmm6
vpunpckldq %xmm3,%xmm2,%xmm7
vpunpckhdq %xmm1,%xmm0,%xmm0
vpunpckhdq %xmm3,%xmm2,%xmm2
vpunpcklqdq %xmm7,%xmm6,%xmm1
vpunpckhqdq %xmm7,%xmm6,%xmm6
vpunpcklqdq %xmm2,%xmm0,%xmm7
vpunpckhqdq %xmm2,%xmm0,%xmm3
vpxor -128(%esi),%xmm1,%xmm4
vpxor -64(%esi),%xmm6,%xmm5
vpxor (%esi),%xmm7,%xmm6
vpxor 64(%esi),%xmm3,%xmm7
leal 16(%esi),%esi
vmovdqa 64(%ebx),%xmm0
vmovdqa 80(%ebx),%xmm1
vmovdqa 96(%ebx),%xmm2
vmovdqa 112(%ebx),%xmm3
vmovdqu %xmm4,-128(%edi)
vmovdqu %xmm5,-64(%edi)
vmovdqu %xmm6,(%edi)
vmovdqu %xmm7,64(%edi)
leal 16(%edi),%edi
vpaddd 64(%ebp),%xmm0,%xmm0
vpaddd 80(%ebp),%xmm1,%xmm1
vpaddd 96(%ebp),%xmm2,%xmm2
vpaddd 112(%ebp),%xmm3,%xmm3
vpunpckldq %xmm1,%xmm0,%xmm6
vpunpckldq %xmm3,%xmm2,%xmm7
vpunpckhdq %xmm1,%xmm0,%xmm0
vpunpckhdq %xmm3,%xmm2,%xmm2
vpunpcklqdq %xmm7,%xmm6,%xmm1
vpunpckhqdq %xmm7,%xmm6,%xmm6
vpunpcklqdq %xmm2,%xmm0,%xmm7
vpunpckhqdq %xmm2,%xmm0,%xmm3
vpxor -128(%esi),%xmm1,%xmm4
vpxor -64(%esi),%xmm6,%xmm5
vpxor (%esi),%xmm7,%xmm6
vpxor 64(%esi),%xmm3,%xmm7
leal 208(%esi),%esi
vmovdqu %xmm4,-128(%edi)
vmovdqu %xmm5,-64(%edi)
vmovdqu %xmm6,(%edi)
vmovdqu %xmm7,64(%edi)
leal 208(%edi),%edi
subl $256,%ecx
jnc .L015outer_loop
addl $256,%ecx
jz .L017done
movl 520(%esp),%ebx
leal -128(%esi),%esi
movl 516(%esp),%edx
leal -128(%edi),%edi
vmovd 64(%ebp),%xmm2
vmovdqu (%ebx),%xmm3
vpaddd 96(%eax),%xmm2,%xmm2
vpand 112(%eax),%xmm3,%xmm3
vpor %xmm2,%xmm3,%xmm3
.L0141x:
vmovdqa 32(%eax),%xmm0
vmovdqu (%edx),%xmm1
vmovdqu 16(%edx),%xmm2
vmovdqa (%eax),%xmm6
vmovdqa 16(%eax),%xmm7
movl %ebp,48(%esp)
vmovdqa %xmm0,(%esp)
vmovdqa %xmm1,16(%esp)
vmovdqa %xmm2,32(%esp)
vmovdqa %xmm3,48(%esp)
movl $10,%edx
jmp .L018loop1x
.align 16
.L019outer1x:
vmovdqa 80(%eax),%xmm3
vmovdqa (%esp),%xmm0
vmovdqa 16(%esp),%xmm1
vmovdqa 32(%esp),%xmm2
vpaddd 48(%esp),%xmm3,%xmm3
movl $10,%edx
vmovdqa %xmm3,48(%esp)
jmp .L018loop1x
.align 16
.L018loop1x:
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
.byte 143,232,120,194,219,16
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
.byte 143,232,120,194,201,12
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
.byte 143,232,120,194,219,8
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
.byte 143,232,120,194,201,7
vpshufd $78,%xmm2,%xmm2
vpshufd $57,%xmm1,%xmm1
vpshufd $147,%xmm3,%xmm3
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
.byte 143,232,120,194,219,16
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
.byte 143,232,120,194,201,12
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
.byte 143,232,120,194,219,8
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
.byte 143,232,120,194,201,7
vpshufd $78,%xmm2,%xmm2
vpshufd $147,%xmm1,%xmm1
vpshufd $57,%xmm3,%xmm3
decl %edx
jnz .L018loop1x
vpaddd (%esp),%xmm0,%xmm0
vpaddd 16(%esp),%xmm1,%xmm1
vpaddd 32(%esp),%xmm2,%xmm2
vpaddd 48(%esp),%xmm3,%xmm3
cmpl $64,%ecx
jb .L020tail
vpxor (%esi),%xmm0,%xmm0
vpxor 16(%esi),%xmm1,%xmm1
vpxor 32(%esi),%xmm2,%xmm2
vpxor 48(%esi),%xmm3,%xmm3
leal 64(%esi),%esi
vmovdqu %xmm0,(%edi)
vmovdqu %xmm1,16(%edi)
vmovdqu %xmm2,32(%edi)
vmovdqu %xmm3,48(%edi)
leal 64(%edi),%edi
subl $64,%ecx
jnz .L019outer1x
jmp .L017done
.L020tail:
vmovdqa %xmm0,(%esp)
vmovdqa %xmm1,16(%esp)
vmovdqa %xmm2,32(%esp)
vmovdqa %xmm3,48(%esp)
xorl %eax,%eax
xorl %edx,%edx
xorl %ebp,%ebp
.L021tail_loop:
movb (%esp,%ebp,1),%al
movb (%esi,%ebp,1),%dl
leal 1(%ebp),%ebp
xorb %dl,%al
movb %al,-1(%edi,%ebp,1)
decl %ecx
jnz .L021tail_loop
.L017done:
vzeroupper
movl 512(%esp),%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size ChaCha20_xop,.-.L_ChaCha20_xop_begin
.comm OPENSSL_ia32cap_P,16,4
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff