Regen X86 assembly files after r364822.
parent 63c1bb5162
commit 3971092e11
@@ -2,20 +2,790 @@
/* Do not modify. This file is auto-generated from aesni-gcm-x86_64.pl. */
.text
.globl aesni_gcm_encrypt
.type aesni_gcm_encrypt,@function
aesni_gcm_encrypt:
.type _aesni_ctr32_ghash_6x,@function
.align 32
_aesni_ctr32_ghash_6x:
.cfi_startproc
xorl %eax,%eax
vmovdqu 32(%r11),%xmm2
subq $6,%rdx
vpxor %xmm4,%xmm4,%xmm4
vmovdqu 0-128(%rcx),%xmm15
vpaddb %xmm2,%xmm1,%xmm10
vpaddb %xmm2,%xmm10,%xmm11
vpaddb %xmm2,%xmm11,%xmm12
vpaddb %xmm2,%xmm12,%xmm13
vpaddb %xmm2,%xmm13,%xmm14
vpxor %xmm15,%xmm1,%xmm9
vmovdqu %xmm4,16+8(%rsp)
jmp .Loop6x
.align 32
.Loop6x:
addl $100663296,%ebx
jc .Lhandle_ctr32
vmovdqu 0-32(%r9),%xmm3
vpaddb %xmm2,%xmm14,%xmm1
vpxor %xmm15,%xmm10,%xmm10
vpxor %xmm15,%xmm11,%xmm11
.Lresume_ctr32:
vmovdqu %xmm1,(%r8)
vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
vpxor %xmm15,%xmm12,%xmm12
vmovups 16-128(%rcx),%xmm2
vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
xorq %r12,%r12
cmpq %r14,%r15
vaesenc %xmm2,%xmm9,%xmm9
vmovdqu 48+8(%rsp),%xmm0
vpxor %xmm15,%xmm13,%xmm13
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
vaesenc %xmm2,%xmm10,%xmm10
vpxor %xmm15,%xmm14,%xmm14
setnc %r12b
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
vaesenc %xmm2,%xmm11,%xmm11
vmovdqu 16-32(%r9),%xmm3
negq %r12
vaesenc %xmm2,%xmm12,%xmm12
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
vpxor %xmm4,%xmm8,%xmm8
vaesenc %xmm2,%xmm13,%xmm13
vpxor %xmm5,%xmm1,%xmm4
andq $0x60,%r12
vmovups 32-128(%rcx),%xmm15
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
vaesenc %xmm2,%xmm14,%xmm14
vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
leaq (%r14,%r12,1),%r14
vaesenc %xmm15,%xmm9,%xmm9
vpxor 16+8(%rsp),%xmm8,%xmm8
vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
vmovdqu 64+8(%rsp),%xmm0
vaesenc %xmm15,%xmm10,%xmm10
movbeq 88(%r14),%r13
vaesenc %xmm15,%xmm11,%xmm11
movbeq 80(%r14),%r12
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,32+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,40+8(%rsp)
vmovdqu 48-32(%r9),%xmm5
vaesenc %xmm15,%xmm14,%xmm14
vmovups 48-128(%rcx),%xmm15
|
||||
vpxor %xmm1,%xmm6,%xmm6
|
||||
vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
|
||||
vaesenc %xmm15,%xmm9,%xmm9
|
||||
vpxor %xmm2,%xmm6,%xmm6
|
||||
vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
|
||||
vaesenc %xmm15,%xmm10,%xmm10
|
||||
vpxor %xmm3,%xmm7,%xmm7
|
||||
vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
|
||||
vaesenc %xmm15,%xmm11,%xmm11
|
||||
vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
|
||||
vmovdqu 80+8(%rsp),%xmm0
|
||||
vaesenc %xmm15,%xmm12,%xmm12
|
||||
vaesenc %xmm15,%xmm13,%xmm13
|
||||
vpxor %xmm1,%xmm4,%xmm4
|
||||
vmovdqu 64-32(%r9),%xmm1
|
||||
vaesenc %xmm15,%xmm14,%xmm14
|
||||
|
||||
vmovups 64-128(%rcx),%xmm15
|
||||
vpxor %xmm2,%xmm6,%xmm6
|
||||
vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
|
||||
vaesenc %xmm15,%xmm9,%xmm9
|
||||
vpxor %xmm3,%xmm6,%xmm6
|
||||
vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
|
||||
vaesenc %xmm15,%xmm10,%xmm10
|
||||
movbeq 72(%r14),%r13
|
||||
vpxor %xmm5,%xmm7,%xmm7
|
||||
vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
|
||||
vaesenc %xmm15,%xmm11,%xmm11
|
||||
movbeq 64(%r14),%r12
|
||||
vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
|
||||
vmovdqu 96+8(%rsp),%xmm0
|
||||
vaesenc %xmm15,%xmm12,%xmm12
|
||||
movq %r13,48+8(%rsp)
|
||||
vaesenc %xmm15,%xmm13,%xmm13
|
||||
movq %r12,56+8(%rsp)
|
||||
vpxor %xmm2,%xmm4,%xmm4
|
||||
vmovdqu 96-32(%r9),%xmm2
|
||||
vaesenc %xmm15,%xmm14,%xmm14
|
||||
|
||||
vmovups 80-128(%rcx),%xmm15
|
||||
vpxor %xmm3,%xmm6,%xmm6
|
||||
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
|
||||
vaesenc %xmm15,%xmm9,%xmm9
|
||||
vpxor %xmm5,%xmm6,%xmm6
|
||||
vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
|
||||
vaesenc %xmm15,%xmm10,%xmm10
|
||||
movbeq 56(%r14),%r13
|
||||
vpxor %xmm1,%xmm7,%xmm7
|
||||
vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
|
||||
vpxor 112+8(%rsp),%xmm8,%xmm8
|
||||
vaesenc %xmm15,%xmm11,%xmm11
|
||||
movbeq 48(%r14),%r12
|
||||
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
|
||||
vaesenc %xmm15,%xmm12,%xmm12
|
||||
movq %r13,64+8(%rsp)
|
||||
vaesenc %xmm15,%xmm13,%xmm13
|
||||
movq %r12,72+8(%rsp)
|
||||
vpxor %xmm3,%xmm4,%xmm4
|
||||
vmovdqu 112-32(%r9),%xmm3
|
||||
vaesenc %xmm15,%xmm14,%xmm14
|
||||
|
||||
vmovups 96-128(%rcx),%xmm15
|
||||
vpxor %xmm5,%xmm6,%xmm6
|
||||
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
|
||||
vaesenc %xmm15,%xmm9,%xmm9
|
||||
vpxor %xmm1,%xmm6,%xmm6
|
||||
vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
|
||||
vaesenc %xmm15,%xmm10,%xmm10
|
||||
movbeq 40(%r14),%r13
|
||||
vpxor %xmm2,%xmm7,%xmm7
|
||||
vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
|
||||
vaesenc %xmm15,%xmm11,%xmm11
|
||||
movbeq 32(%r14),%r12
|
||||
vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
|
||||
vaesenc %xmm15,%xmm12,%xmm12
|
||||
movq %r13,80+8(%rsp)
|
||||
vaesenc %xmm15,%xmm13,%xmm13
|
||||
movq %r12,88+8(%rsp)
|
||||
vpxor %xmm5,%xmm6,%xmm6
|
||||
vaesenc %xmm15,%xmm14,%xmm14
|
||||
vpxor %xmm1,%xmm6,%xmm6
|
||||
|
||||
vmovups 112-128(%rcx),%xmm15
|
||||
vpslldq $8,%xmm6,%xmm5
|
||||
vpxor %xmm2,%xmm4,%xmm4
|
||||
vmovdqu 16(%r11),%xmm3
|
||||
|
||||
vaesenc %xmm15,%xmm9,%xmm9
|
||||
vpxor %xmm8,%xmm7,%xmm7
|
||||
vaesenc %xmm15,%xmm10,%xmm10
|
||||
vpxor %xmm5,%xmm4,%xmm4
|
||||
movbeq 24(%r14),%r13
|
||||
vaesenc %xmm15,%xmm11,%xmm11
|
||||
movbeq 16(%r14),%r12
|
||||
vpalignr $8,%xmm4,%xmm4,%xmm0
|
||||
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
|
||||
movq %r13,96+8(%rsp)
|
||||
vaesenc %xmm15,%xmm12,%xmm12
|
||||
movq %r12,104+8(%rsp)
|
||||
vaesenc %xmm15,%xmm13,%xmm13
|
||||
vmovups 128-128(%rcx),%xmm1
|
||||
vaesenc %xmm15,%xmm14,%xmm14
|
||||
|
||||
vaesenc %xmm1,%xmm9,%xmm9
|
||||
vmovups 144-128(%rcx),%xmm15
|
||||
vaesenc %xmm1,%xmm10,%xmm10
|
||||
vpsrldq $8,%xmm6,%xmm6
|
||||
vaesenc %xmm1,%xmm11,%xmm11
|
||||
vpxor %xmm6,%xmm7,%xmm7
|
||||
vaesenc %xmm1,%xmm12,%xmm12
|
||||
vpxor %xmm0,%xmm4,%xmm4
|
||||
movbeq 8(%r14),%r13
|
||||
vaesenc %xmm1,%xmm13,%xmm13
|
||||
movbeq 0(%r14),%r12
|
||||
vaesenc %xmm1,%xmm14,%xmm14
|
||||
vmovups 160-128(%rcx),%xmm1
|
||||
cmpl $11,%ebp
|
||||
jb .Lenc_tail
|
||||
|
||||
vaesenc %xmm15,%xmm9,%xmm9
|
||||
vaesenc %xmm15,%xmm10,%xmm10
|
||||
vaesenc %xmm15,%xmm11,%xmm11
|
||||
vaesenc %xmm15,%xmm12,%xmm12
|
||||
vaesenc %xmm15,%xmm13,%xmm13
|
||||
vaesenc %xmm15,%xmm14,%xmm14
|
||||
|
||||
vaesenc %xmm1,%xmm9,%xmm9
|
||||
vaesenc %xmm1,%xmm10,%xmm10
|
||||
vaesenc %xmm1,%xmm11,%xmm11
|
||||
vaesenc %xmm1,%xmm12,%xmm12
|
||||
vaesenc %xmm1,%xmm13,%xmm13
|
||||
vmovups 176-128(%rcx),%xmm15
|
||||
vaesenc %xmm1,%xmm14,%xmm14
|
||||
vmovups 192-128(%rcx),%xmm1
|
||||
je .Lenc_tail
|
||||
|
||||
vaesenc %xmm15,%xmm9,%xmm9
|
||||
vaesenc %xmm15,%xmm10,%xmm10
|
||||
vaesenc %xmm15,%xmm11,%xmm11
|
||||
vaesenc %xmm15,%xmm12,%xmm12
|
||||
vaesenc %xmm15,%xmm13,%xmm13
|
||||
vaesenc %xmm15,%xmm14,%xmm14
|
||||
|
||||
vaesenc %xmm1,%xmm9,%xmm9
|
||||
vaesenc %xmm1,%xmm10,%xmm10
|
||||
vaesenc %xmm1,%xmm11,%xmm11
|
||||
vaesenc %xmm1,%xmm12,%xmm12
|
||||
vaesenc %xmm1,%xmm13,%xmm13
|
||||
vmovups 208-128(%rcx),%xmm15
|
||||
vaesenc %xmm1,%xmm14,%xmm14
|
||||
vmovups 224-128(%rcx),%xmm1
|
||||
jmp .Lenc_tail
|
||||
|
||||
.align 32
|
||||
.Lhandle_ctr32:
|
||||
vmovdqu (%r11),%xmm0
|
||||
vpshufb %xmm0,%xmm1,%xmm6
|
||||
vmovdqu 48(%r11),%xmm5
|
||||
vpaddd 64(%r11),%xmm6,%xmm10
|
||||
vpaddd %xmm5,%xmm6,%xmm11
|
||||
vmovdqu 0-32(%r9),%xmm3
|
||||
vpaddd %xmm5,%xmm10,%xmm12
|
||||
vpshufb %xmm0,%xmm10,%xmm10
|
||||
vpaddd %xmm5,%xmm11,%xmm13
|
||||
vpshufb %xmm0,%xmm11,%xmm11
|
||||
vpxor %xmm15,%xmm10,%xmm10
|
||||
vpaddd %xmm5,%xmm12,%xmm14
|
||||
vpshufb %xmm0,%xmm12,%xmm12
|
||||
vpxor %xmm15,%xmm11,%xmm11
|
||||
vpaddd %xmm5,%xmm13,%xmm1
|
||||
vpshufb %xmm0,%xmm13,%xmm13
|
||||
vpshufb %xmm0,%xmm14,%xmm14
|
||||
vpshufb %xmm0,%xmm1,%xmm1
|
||||
jmp .Lresume_ctr32
|
||||
|
||||
.align 32
|
||||
.Lenc_tail:
|
||||
vaesenc %xmm15,%xmm9,%xmm9
|
||||
vmovdqu %xmm7,16+8(%rsp)
|
||||
vpalignr $8,%xmm4,%xmm4,%xmm8
|
||||
vaesenc %xmm15,%xmm10,%xmm10
|
||||
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
|
||||
vpxor 0(%rdi),%xmm1,%xmm2
|
||||
vaesenc %xmm15,%xmm11,%xmm11
|
||||
vpxor 16(%rdi),%xmm1,%xmm0
|
||||
vaesenc %xmm15,%xmm12,%xmm12
|
||||
vpxor 32(%rdi),%xmm1,%xmm5
|
||||
vaesenc %xmm15,%xmm13,%xmm13
|
||||
vpxor 48(%rdi),%xmm1,%xmm6
|
||||
vaesenc %xmm15,%xmm14,%xmm14
|
||||
vpxor 64(%rdi),%xmm1,%xmm7
|
||||
vpxor 80(%rdi),%xmm1,%xmm3
|
||||
vmovdqu (%r8),%xmm1
|
||||
|
||||
vaesenclast %xmm2,%xmm9,%xmm9
|
||||
vmovdqu 32(%r11),%xmm2
|
||||
vaesenclast %xmm0,%xmm10,%xmm10
|
||||
vpaddb %xmm2,%xmm1,%xmm0
|
||||
movq %r13,112+8(%rsp)
|
||||
leaq 96(%rdi),%rdi
|
||||
vaesenclast %xmm5,%xmm11,%xmm11
|
||||
vpaddb %xmm2,%xmm0,%xmm5
|
||||
movq %r12,120+8(%rsp)
|
||||
leaq 96(%rsi),%rsi
|
||||
vmovdqu 0-128(%rcx),%xmm15
|
||||
vaesenclast %xmm6,%xmm12,%xmm12
|
||||
vpaddb %xmm2,%xmm5,%xmm6
|
||||
vaesenclast %xmm7,%xmm13,%xmm13
|
||||
vpaddb %xmm2,%xmm6,%xmm7
|
||||
vaesenclast %xmm3,%xmm14,%xmm14
|
||||
vpaddb %xmm2,%xmm7,%xmm3
|
||||
|
||||
addq $0x60,%r10
|
||||
subq $0x6,%rdx
|
||||
jc .L6x_done
|
||||
|
||||
vmovups %xmm9,-96(%rsi)
|
||||
vpxor %xmm15,%xmm1,%xmm9
|
||||
vmovups %xmm10,-80(%rsi)
|
||||
vmovdqa %xmm0,%xmm10
|
||||
vmovups %xmm11,-64(%rsi)
|
||||
vmovdqa %xmm5,%xmm11
|
||||
vmovups %xmm12,-48(%rsi)
|
||||
vmovdqa %xmm6,%xmm12
|
||||
vmovups %xmm13,-32(%rsi)
|
||||
vmovdqa %xmm7,%xmm13
|
||||
vmovups %xmm14,-16(%rsi)
|
||||
vmovdqa %xmm3,%xmm14
|
||||
vmovdqu 32+8(%rsp),%xmm7
|
||||
jmp .Loop6x
|
||||
|
||||
.L6x_done:
|
||||
vpxor 16+8(%rsp),%xmm8,%xmm8
|
||||
vpxor %xmm4,%xmm8,%xmm8
|
||||
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
|
||||
|
||||
.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
|
||||
.globl aesni_gcm_decrypt
|
||||
.type aesni_gcm_decrypt,@function
|
||||
.align 32
|
||||
aesni_gcm_decrypt:
|
||||
.cfi_startproc
|
||||
xorl %eax,%eax
|
||||
xorq %r10,%r10
|
||||
cmpq $0x60,%rdx
|
||||
jb .Lgcm_dec_abort
|
||||
|
||||
leaq (%rsp),%rax
|
||||
.cfi_def_cfa_register %rax
|
||||
pushq %rbx
|
||||
.cfi_offset %rbx,-16
|
||||
pushq %rbp
|
||||
.cfi_offset %rbp,-24
|
||||
pushq %r12
|
||||
.cfi_offset %r12,-32
|
||||
pushq %r13
|
||||
.cfi_offset %r13,-40
|
||||
pushq %r14
|
||||
.cfi_offset %r14,-48
|
||||
pushq %r15
|
||||
.cfi_offset %r15,-56
|
||||
vzeroupper
|
||||
|
||||
vmovdqu (%r8),%xmm1
|
||||
addq $-128,%rsp
|
||||
movl 12(%r8),%ebx
|
||||
leaq .Lbswap_mask(%rip),%r11
|
||||
leaq -128(%rcx),%r14
|
||||
movq $0xf80,%r15
|
||||
vmovdqu (%r9),%xmm8
|
||||
andq $-128,%rsp
|
||||
vmovdqu (%r11),%xmm0
|
||||
leaq 128(%rcx),%rcx
|
||||
leaq 32+32(%r9),%r9
|
||||
movl 240-128(%rcx),%ebp
|
||||
vpshufb %xmm0,%xmm8,%xmm8
|
||||
|
||||
andq %r15,%r14
|
||||
andq %rsp,%r15
|
||||
subq %r14,%r15
|
||||
jc .Ldec_no_key_aliasing
|
||||
cmpq $768,%r15
|
||||
jnc .Ldec_no_key_aliasing
|
||||
subq %r15,%rsp
|
||||
.Ldec_no_key_aliasing:
|
||||
|
||||
vmovdqu 80(%rdi),%xmm7
|
||||
leaq (%rdi),%r14
|
||||
vmovdqu 64(%rdi),%xmm4
|
||||
leaq -192(%rdi,%rdx,1),%r15
|
||||
vmovdqu 48(%rdi),%xmm5
|
||||
shrq $4,%rdx
|
||||
xorq %r10,%r10
|
||||
vmovdqu 32(%rdi),%xmm6
|
||||
vpshufb %xmm0,%xmm7,%xmm7
|
||||
vmovdqu 16(%rdi),%xmm2
|
||||
vpshufb %xmm0,%xmm4,%xmm4
|
||||
vmovdqu (%rdi),%xmm3
|
||||
vpshufb %xmm0,%xmm5,%xmm5
|
||||
vmovdqu %xmm4,48(%rsp)
|
||||
vpshufb %xmm0,%xmm6,%xmm6
|
||||
vmovdqu %xmm5,64(%rsp)
|
||||
vpshufb %xmm0,%xmm2,%xmm2
|
||||
vmovdqu %xmm6,80(%rsp)
|
||||
vpshufb %xmm0,%xmm3,%xmm3
|
||||
vmovdqu %xmm2,96(%rsp)
|
||||
vmovdqu %xmm3,112(%rsp)
|
||||
|
||||
call _aesni_ctr32_ghash_6x
|
||||
|
||||
vmovups %xmm9,-96(%rsi)
|
||||
vmovups %xmm10,-80(%rsi)
|
||||
vmovups %xmm11,-64(%rsi)
|
||||
vmovups %xmm12,-48(%rsi)
|
||||
vmovups %xmm13,-32(%rsi)
|
||||
vmovups %xmm14,-16(%rsi)
|
||||
|
||||
vpshufb (%r11),%xmm8,%xmm8
|
||||
vmovdqu %xmm8,-64(%r9)
|
||||
|
||||
vzeroupper
|
||||
movq -48(%rax),%r15
|
||||
.cfi_restore %r15
|
||||
movq -40(%rax),%r14
|
||||
.cfi_restore %r14
|
||||
movq -32(%rax),%r13
|
||||
.cfi_restore %r13
|
||||
movq -24(%rax),%r12
|
||||
.cfi_restore %r12
|
||||
movq -16(%rax),%rbp
|
||||
.cfi_restore %rbp
|
||||
movq -8(%rax),%rbx
|
||||
.cfi_restore %rbx
|
||||
leaq (%rax),%rsp
|
||||
.cfi_def_cfa_register %rsp
|
||||
.Lgcm_dec_abort:
|
||||
movq %r10,%rax
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
|
||||
.type _aesni_ctr32_6x,@function
|
||||
.align 32
|
||||
_aesni_ctr32_6x:
|
||||
.cfi_startproc
|
||||
vmovdqu 0-128(%rcx),%xmm4
|
||||
vmovdqu 32(%r11),%xmm2
|
||||
leaq -1(%rbp),%r13
|
||||
vmovups 16-128(%rcx),%xmm15
|
||||
leaq 32-128(%rcx),%r12
|
||||
vpxor %xmm4,%xmm1,%xmm9
|
||||
addl $100663296,%ebx
|
||||
jc .Lhandle_ctr32_2
|
||||
vpaddb %xmm2,%xmm1,%xmm10
|
||||
vpaddb %xmm2,%xmm10,%xmm11
|
||||
vpxor %xmm4,%xmm10,%xmm10
|
||||
vpaddb %xmm2,%xmm11,%xmm12
|
||||
vpxor %xmm4,%xmm11,%xmm11
|
||||
vpaddb %xmm2,%xmm12,%xmm13
|
||||
vpxor %xmm4,%xmm12,%xmm12
|
||||
vpaddb %xmm2,%xmm13,%xmm14
|
||||
vpxor %xmm4,%xmm13,%xmm13
|
||||
vpaddb %xmm2,%xmm14,%xmm1
|
||||
vpxor %xmm4,%xmm14,%xmm14
|
||||
jmp .Loop_ctr32
|
||||
|
||||
.align 16
|
||||
.Loop_ctr32:
|
||||
vaesenc %xmm15,%xmm9,%xmm9
|
||||
vaesenc %xmm15,%xmm10,%xmm10
|
||||
vaesenc %xmm15,%xmm11,%xmm11
|
||||
vaesenc %xmm15,%xmm12,%xmm12
|
||||
vaesenc %xmm15,%xmm13,%xmm13
|
||||
vaesenc %xmm15,%xmm14,%xmm14
|
||||
vmovups (%r12),%xmm15
|
||||
leaq 16(%r12),%r12
|
||||
decl %r13d
|
||||
jnz .Loop_ctr32
|
||||
|
||||
vmovdqu (%r12),%xmm3
|
||||
vaesenc %xmm15,%xmm9,%xmm9
|
||||
vpxor 0(%rdi),%xmm3,%xmm4
|
||||
vaesenc %xmm15,%xmm10,%xmm10
|
||||
vpxor 16(%rdi),%xmm3,%xmm5
|
||||
vaesenc %xmm15,%xmm11,%xmm11
|
||||
vpxor 32(%rdi),%xmm3,%xmm6
|
||||
vaesenc %xmm15,%xmm12,%xmm12
|
||||
vpxor 48(%rdi),%xmm3,%xmm8
|
||||
vaesenc %xmm15,%xmm13,%xmm13
|
||||
vpxor 64(%rdi),%xmm3,%xmm2
|
||||
vaesenc %xmm15,%xmm14,%xmm14
|
||||
vpxor 80(%rdi),%xmm3,%xmm3
|
||||
leaq 96(%rdi),%rdi
|
||||
|
||||
vaesenclast %xmm4,%xmm9,%xmm9
|
||||
vaesenclast %xmm5,%xmm10,%xmm10
|
||||
vaesenclast %xmm6,%xmm11,%xmm11
|
||||
vaesenclast %xmm8,%xmm12,%xmm12
|
||||
vaesenclast %xmm2,%xmm13,%xmm13
|
||||
vaesenclast %xmm3,%xmm14,%xmm14
|
||||
vmovups %xmm9,0(%rsi)
|
||||
vmovups %xmm10,16(%rsi)
|
||||
vmovups %xmm11,32(%rsi)
|
||||
vmovups %xmm12,48(%rsi)
|
||||
vmovups %xmm13,64(%rsi)
|
||||
vmovups %xmm14,80(%rsi)
|
||||
leaq 96(%rsi),%rsi
|
||||
|
||||
.byte 0xf3,0xc3
|
||||
.align 32
|
||||
.Lhandle_ctr32_2:
|
||||
vpshufb %xmm0,%xmm1,%xmm6
|
||||
vmovdqu 48(%r11),%xmm5
|
||||
vpaddd 64(%r11),%xmm6,%xmm10
|
||||
vpaddd %xmm5,%xmm6,%xmm11
|
||||
vpaddd %xmm5,%xmm10,%xmm12
|
||||
vpshufb %xmm0,%xmm10,%xmm10
|
||||
vpaddd %xmm5,%xmm11,%xmm13
|
||||
vpshufb %xmm0,%xmm11,%xmm11
|
||||
vpxor %xmm4,%xmm10,%xmm10
|
||||
vpaddd %xmm5,%xmm12,%xmm14
|
||||
vpshufb %xmm0,%xmm12,%xmm12
|
||||
vpxor %xmm4,%xmm11,%xmm11
|
||||
vpaddd %xmm5,%xmm13,%xmm1
|
||||
vpshufb %xmm0,%xmm13,%xmm13
|
||||
vpxor %xmm4,%xmm12,%xmm12
|
||||
vpshufb %xmm0,%xmm14,%xmm14
|
||||
vpxor %xmm4,%xmm13,%xmm13
|
||||
vpshufb %xmm0,%xmm1,%xmm1
|
||||
vpxor %xmm4,%xmm14,%xmm14
|
||||
jmp .Loop_ctr32
|
||||
.cfi_endproc
|
||||
.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
|
||||
|
||||
.globl aesni_gcm_encrypt
|
||||
.type aesni_gcm_encrypt,@function
|
||||
.align 32
|
||||
aesni_gcm_encrypt:
|
||||
.cfi_startproc
|
||||
xorq %r10,%r10
|
||||
cmpq $288,%rdx
|
||||
jb .Lgcm_enc_abort
|
||||
|
||||
leaq (%rsp),%rax
|
||||
.cfi_def_cfa_register %rax
|
||||
pushq %rbx
|
||||
.cfi_offset %rbx,-16
|
||||
pushq %rbp
|
||||
.cfi_offset %rbp,-24
|
||||
pushq %r12
|
||||
.cfi_offset %r12,-32
|
||||
pushq %r13
|
||||
.cfi_offset %r13,-40
|
||||
pushq %r14
|
||||
.cfi_offset %r14,-48
|
||||
pushq %r15
|
||||
.cfi_offset %r15,-56
|
||||
vzeroupper
|
||||
|
||||
vmovdqu (%r8),%xmm1
|
||||
addq $-128,%rsp
|
||||
movl 12(%r8),%ebx
|
||||
leaq .Lbswap_mask(%rip),%r11
|
||||
leaq -128(%rcx),%r14
|
||||
movq $0xf80,%r15
|
||||
leaq 128(%rcx),%rcx
|
||||
vmovdqu (%r11),%xmm0
|
||||
andq $-128,%rsp
|
||||
movl 240-128(%rcx),%ebp
|
||||
|
||||
andq %r15,%r14
|
||||
andq %rsp,%r15
|
||||
subq %r14,%r15
|
||||
jc .Lenc_no_key_aliasing
|
||||
cmpq $768,%r15
|
||||
jnc .Lenc_no_key_aliasing
|
||||
subq %r15,%rsp
|
||||
.Lenc_no_key_aliasing:
|
||||
|
||||
leaq (%rsi),%r14
|
||||
leaq -192(%rsi,%rdx,1),%r15
|
||||
shrq $4,%rdx
|
||||
|
||||
call _aesni_ctr32_6x
|
||||
vpshufb %xmm0,%xmm9,%xmm8
|
||||
vpshufb %xmm0,%xmm10,%xmm2
|
||||
vmovdqu %xmm8,112(%rsp)
|
||||
vpshufb %xmm0,%xmm11,%xmm4
|
||||
vmovdqu %xmm2,96(%rsp)
|
||||
vpshufb %xmm0,%xmm12,%xmm5
|
||||
vmovdqu %xmm4,80(%rsp)
|
||||
vpshufb %xmm0,%xmm13,%xmm6
|
||||
vmovdqu %xmm5,64(%rsp)
|
||||
vpshufb %xmm0,%xmm14,%xmm7
|
||||
vmovdqu %xmm6,48(%rsp)
|
||||
|
||||
call _aesni_ctr32_6x
|
||||
|
||||
vmovdqu (%r9),%xmm8
|
||||
leaq 32+32(%r9),%r9
|
||||
subq $12,%rdx
|
||||
movq $192,%r10
|
||||
vpshufb %xmm0,%xmm8,%xmm8
|
||||
|
||||
call _aesni_ctr32_ghash_6x
|
||||
vmovdqu 32(%rsp),%xmm7
|
||||
vmovdqu (%r11),%xmm0
|
||||
vmovdqu 0-32(%r9),%xmm3
|
||||
vpunpckhqdq %xmm7,%xmm7,%xmm1
|
||||
vmovdqu 32-32(%r9),%xmm15
|
||||
vmovups %xmm9,-96(%rsi)
|
||||
vpshufb %xmm0,%xmm9,%xmm9
|
||||
vpxor %xmm7,%xmm1,%xmm1
|
||||
vmovups %xmm10,-80(%rsi)
|
||||
vpshufb %xmm0,%xmm10,%xmm10
|
||||
vmovups %xmm11,-64(%rsi)
|
||||
vpshufb %xmm0,%xmm11,%xmm11
|
||||
vmovups %xmm12,-48(%rsi)
|
||||
vpshufb %xmm0,%xmm12,%xmm12
|
||||
vmovups %xmm13,-32(%rsi)
|
||||
vpshufb %xmm0,%xmm13,%xmm13
|
||||
vmovups %xmm14,-16(%rsi)
|
||||
vpshufb %xmm0,%xmm14,%xmm14
|
||||
vmovdqu %xmm9,16(%rsp)
|
||||
vmovdqu 48(%rsp),%xmm6
|
||||
vmovdqu 16-32(%r9),%xmm0
|
||||
vpunpckhqdq %xmm6,%xmm6,%xmm2
|
||||
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
|
||||
vpxor %xmm6,%xmm2,%xmm2
|
||||
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
|
||||
vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
|
||||
|
||||
vmovdqu 64(%rsp),%xmm9
|
||||
vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
|
||||
vmovdqu 48-32(%r9),%xmm3
|
||||
vpxor %xmm5,%xmm4,%xmm4
|
||||
vpunpckhqdq %xmm9,%xmm9,%xmm5
|
||||
vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
|
||||
vpxor %xmm9,%xmm5,%xmm5
|
||||
vpxor %xmm7,%xmm6,%xmm6
|
||||
vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
|
||||
vmovdqu 80-32(%r9),%xmm15
|
||||
vpxor %xmm1,%xmm2,%xmm2
|
||||
|
||||
vmovdqu 80(%rsp),%xmm1
|
||||
vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
|
||||
vmovdqu 64-32(%r9),%xmm0
|
||||
vpxor %xmm4,%xmm7,%xmm7
|
||||
vpunpckhqdq %xmm1,%xmm1,%xmm4
|
||||
vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
|
||||
vpxor %xmm1,%xmm4,%xmm4
|
||||
vpxor %xmm6,%xmm9,%xmm9
|
||||
vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
|
||||
vpxor %xmm2,%xmm5,%xmm5
|
||||
|
||||
vmovdqu 96(%rsp),%xmm2
|
||||
vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
|
||||
vmovdqu 96-32(%r9),%xmm3
|
||||
vpxor %xmm7,%xmm6,%xmm6
|
||||
vpunpckhqdq %xmm2,%xmm2,%xmm7
|
||||
vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
|
||||
vpxor %xmm2,%xmm7,%xmm7
|
||||
vpxor %xmm9,%xmm1,%xmm1
|
||||
vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
|
||||
vmovdqu 128-32(%r9),%xmm15
|
||||
vpxor %xmm5,%xmm4,%xmm4
|
||||
|
||||
vpxor 112(%rsp),%xmm8,%xmm8
|
||||
vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
|
||||
vmovdqu 112-32(%r9),%xmm0
|
||||
vpunpckhqdq %xmm8,%xmm8,%xmm9
|
||||
vpxor %xmm6,%xmm5,%xmm5
|
||||
vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
|
||||
vpxor %xmm8,%xmm9,%xmm9
|
||||
vpxor %xmm1,%xmm2,%xmm2
|
||||
vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
|
||||
vpxor %xmm4,%xmm7,%xmm4
|
||||
|
||||
vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
|
||||
vmovdqu 0-32(%r9),%xmm3
|
||||
vpunpckhqdq %xmm14,%xmm14,%xmm1
|
||||
vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
|
||||
vpxor %xmm14,%xmm1,%xmm1
|
||||
vpxor %xmm5,%xmm6,%xmm5
|
||||
vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
|
||||
vmovdqu 32-32(%r9),%xmm15
|
||||
vpxor %xmm2,%xmm8,%xmm7
|
||||
vpxor %xmm4,%xmm9,%xmm6
|
||||
|
||||
vmovdqu 16-32(%r9),%xmm0
|
||||
vpxor %xmm5,%xmm7,%xmm9
|
||||
vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
|
||||
vpxor %xmm9,%xmm6,%xmm6
|
||||
vpunpckhqdq %xmm13,%xmm13,%xmm2
|
||||
vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
|
||||
vpxor %xmm13,%xmm2,%xmm2
|
||||
vpslldq $8,%xmm6,%xmm9
|
||||
vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
|
||||
vpxor %xmm9,%xmm5,%xmm8
|
||||
vpsrldq $8,%xmm6,%xmm6
|
||||
vpxor %xmm6,%xmm7,%xmm7
|
||||
|
||||
vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
|
||||
vmovdqu 48-32(%r9),%xmm3
|
||||
vpxor %xmm4,%xmm5,%xmm5
|
||||
vpunpckhqdq %xmm12,%xmm12,%xmm9
|
||||
vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
|
||||
vpxor %xmm12,%xmm9,%xmm9
|
||||
vpxor %xmm14,%xmm13,%xmm13
|
||||
vpalignr $8,%xmm8,%xmm8,%xmm14
|
||||
vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
|
||||
vmovdqu 80-32(%r9),%xmm15
|
||||
vpxor %xmm1,%xmm2,%xmm2
|
||||
|
||||
vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
|
||||
vmovdqu 64-32(%r9),%xmm0
|
||||
vpxor %xmm5,%xmm4,%xmm4
|
||||
vpunpckhqdq %xmm11,%xmm11,%xmm1
|
||||
vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
|
||||
vpxor %xmm11,%xmm1,%xmm1
|
||||
vpxor %xmm13,%xmm12,%xmm12
|
||||
vxorps 16(%rsp),%xmm7,%xmm7
|
||||
vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
|
||||
vpxor %xmm2,%xmm9,%xmm9
|
||||
|
||||
vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
|
||||
vxorps %xmm14,%xmm8,%xmm8
|
||||
|
||||
vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
|
||||
vmovdqu 96-32(%r9),%xmm3
|
||||
vpxor %xmm4,%xmm5,%xmm5
|
||||
vpunpckhqdq %xmm10,%xmm10,%xmm2
|
||||
vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
|
||||
vpxor %xmm10,%xmm2,%xmm2
|
||||
vpalignr $8,%xmm8,%xmm8,%xmm14
|
||||
vpxor %xmm12,%xmm11,%xmm11
|
||||
vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
|
||||
vmovdqu 128-32(%r9),%xmm15
|
||||
vpxor %xmm9,%xmm1,%xmm1
|
||||
|
||||
vxorps %xmm7,%xmm14,%xmm14
|
||||
vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
|
||||
vxorps %xmm14,%xmm8,%xmm8
|
||||
|
||||
vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
|
||||
vmovdqu 112-32(%r9),%xmm0
|
||||
vpxor %xmm5,%xmm4,%xmm4
|
||||
vpunpckhqdq %xmm8,%xmm8,%xmm9
|
||||
vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
|
||||
vpxor %xmm8,%xmm9,%xmm9
|
||||
vpxor %xmm11,%xmm10,%xmm10
|
||||
vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
|
||||
vpxor %xmm1,%xmm2,%xmm2
|
||||
|
||||
vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
|
||||
vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
|
||||
vpxor %xmm4,%xmm5,%xmm5
|
||||
vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
|
||||
vpxor %xmm10,%xmm7,%xmm7
|
||||
vpxor %xmm2,%xmm6,%xmm6
|
||||
|
||||
vpxor %xmm5,%xmm7,%xmm4
|
||||
vpxor %xmm4,%xmm6,%xmm6
|
||||
vpslldq $8,%xmm6,%xmm1
|
||||
vmovdqu 16(%r11),%xmm3
|
||||
vpsrldq $8,%xmm6,%xmm6
|
||||
vpxor %xmm1,%xmm5,%xmm8
|
||||
vpxor %xmm6,%xmm7,%xmm7
|
||||
|
||||
vpalignr $8,%xmm8,%xmm8,%xmm2
|
||||
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
|
||||
vpxor %xmm2,%xmm8,%xmm8
|
||||
|
||||
vpalignr $8,%xmm8,%xmm8,%xmm2
|
||||
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
|
||||
vpxor %xmm7,%xmm2,%xmm2
|
||||
vpxor %xmm2,%xmm8,%xmm8
|
||||
vpshufb (%r11),%xmm8,%xmm8
|
||||
vmovdqu %xmm8,-64(%r9)
|
||||
|
||||
vzeroupper
|
||||
movq -48(%rax),%r15
|
||||
.cfi_restore %r15
|
||||
movq -40(%rax),%r14
|
||||
.cfi_restore %r14
|
||||
movq -32(%rax),%r13
|
||||
.cfi_restore %r13
|
||||
movq -24(%rax),%r12
|
||||
.cfi_restore %r12
|
||||
movq -16(%rax),%rbp
|
||||
.cfi_restore %rbp
|
||||
movq -8(%rax),%rbx
|
||||
.cfi_restore %rbx
|
||||
leaq (%rax),%rsp
|
||||
.cfi_def_cfa_register %rsp
|
||||
.Lgcm_enc_abort:
|
||||
movq %r10,%rax
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
|
||||
.align 64
|
||||
.Lbswap_mask:
|
||||
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
|
||||
.Lpoly:
|
||||
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
|
||||
.Lone_msb:
|
||||
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
|
||||
.Ltwo_lsb:
|
||||
.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
||||
.Lone_lsb:
|
||||
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
||||
.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
|
||||
.align 64
|
||||
|
@@ -9,6 +9,14 @@
|
||||
.align 32
|
||||
aesni_multi_cbc_encrypt:
|
||||
.cfi_startproc
|
||||
cmpl $2,%edx
|
||||
jb .Lenc_non_avx
|
||||
movl OPENSSL_ia32cap_P+4(%rip),%ecx
|
||||
testl $268435456,%ecx
|
||||
jnz _avx_cbc_enc_shortcut
|
||||
jmp .Lenc_non_avx
|
||||
.align 16
|
||||
.Lenc_non_avx:
|
||||
movq %rsp,%rax
|
||||
.cfi_def_cfa_register %rax
|
||||
pushq %rbx
|
||||
@@ -283,6 +291,14 @@ aesni_multi_cbc_encrypt:
|
||||
.align 32
|
||||
aesni_multi_cbc_decrypt:
|
||||
.cfi_startproc
|
||||
cmpl $2,%edx
|
||||
jb .Ldec_non_avx
|
||||
movl OPENSSL_ia32cap_P+4(%rip),%ecx
|
||||
testl $268435456,%ecx
|
||||
jnz _avx_cbc_dec_shortcut
|
||||
jmp .Ldec_non_avx
|
||||
.align 16
|
||||
.Ldec_non_avx:
|
||||
movq %rsp,%rax
|
||||
.cfi_def_cfa_register %rax
|
||||
pushq %rbx
|
||||
@@ -542,3 +558,952 @@ aesni_multi_cbc_decrypt:
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
|
||||
.type aesni_multi_cbc_encrypt_avx,@function
|
||||
.align 32
|
||||
aesni_multi_cbc_encrypt_avx:
|
||||
.cfi_startproc
|
||||
_avx_cbc_enc_shortcut:
|
||||
movq %rsp,%rax
|
||||
.cfi_def_cfa_register %rax
|
||||
pushq %rbx
|
||||
.cfi_offset %rbx,-16
|
||||
pushq %rbp
|
||||
.cfi_offset %rbp,-24
|
||||
pushq %r12
|
||||
.cfi_offset %r12,-32
|
||||
pushq %r13
|
||||
.cfi_offset %r13,-40
|
||||
pushq %r14
|
||||
.cfi_offset %r14,-48
|
||||
pushq %r15
|
||||
.cfi_offset %r15,-56
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
subq $192,%rsp
|
||||
andq $-128,%rsp
|
||||
movq %rax,16(%rsp)
|
||||
.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
|
||||
|
||||
.Lenc8x_body:
|
||||
vzeroupper
|
||||
vmovdqu (%rsi),%xmm15
|
||||
leaq 120(%rsi),%rsi
|
||||
leaq 160(%rdi),%rdi
|
||||
shrl $1,%edx
|
||||
|
||||
.Lenc8x_loop_grande:
|
||||
|
||||
xorl %edx,%edx
|
||||
movl -144(%rdi),%ecx
|
||||
movq -160(%rdi),%r8
|
||||
cmpl %edx,%ecx
|
||||
movq -152(%rdi),%rbx
|
||||
cmovgl %ecx,%edx
|
||||
testl %ecx,%ecx
|
||||
vmovdqu -136(%rdi),%xmm2
|
||||
movl %ecx,32(%rsp)
|
||||
cmovleq %rsp,%r8
|
||||
subq %r8,%rbx
|
||||
movq %rbx,64(%rsp)
|
||||
movl -104(%rdi),%ecx
|
||||
movq -120(%rdi),%r9
|
||||
cmpl %edx,%ecx
|
||||
movq -112(%rdi),%rbp
|
||||
cmovgl %ecx,%edx
|
||||
testl %ecx,%ecx
|
||||
vmovdqu -96(%rdi),%xmm3
|
||||
movl %ecx,36(%rsp)
|
||||
cmovleq %rsp,%r9
|
||||
subq %r9,%rbp
|
||||
movq %rbp,72(%rsp)
|
||||
movl -64(%rdi),%ecx
|
||||
movq -80(%rdi),%r10
|
||||
cmpl %edx,%ecx
|
||||
movq -72(%rdi),%rbp
|
||||
cmovgl %ecx,%edx
|
||||
testl %ecx,%ecx
|
||||
vmovdqu -56(%rdi),%xmm4
|
||||
movl %ecx,40(%rsp)
|
||||
cmovleq %rsp,%r10
|
||||
subq %r10,%rbp
|
||||
movq %rbp,80(%rsp)
|
||||
movl -24(%rdi),%ecx
|
||||
movq -40(%rdi),%r11
|
||||
cmpl %edx,%ecx
|
||||
movq -32(%rdi),%rbp
|
||||
cmovgl %ecx,%edx
|
||||
testl %ecx,%ecx
|
||||
vmovdqu -16(%rdi),%xmm5
|
||||
movl %ecx,44(%rsp)
|
||||
cmovleq %rsp,%r11
|
||||
subq %r11,%rbp
|
||||
movq %rbp,88(%rsp)
|
||||
movl 16(%rdi),%ecx
|
||||
movq 0(%rdi),%r12
|
||||
cmpl %edx,%ecx
|
||||
movq 8(%rdi),%rbp
|
||||
cmovgl %ecx,%edx
|
||||
testl %ecx,%ecx
|
||||
vmovdqu 24(%rdi),%xmm6
|
||||
movl %ecx,48(%rsp)
|
||||
cmovleq %rsp,%r12
|
||||
subq %r12,%rbp
|
||||
movq %rbp,96(%rsp)
|
||||
movl 56(%rdi),%ecx
|
||||
movq 40(%rdi),%r13
|
||||
cmpl %edx,%ecx
|
||||
movq 48(%rdi),%rbp
|
||||
cmovgl %ecx,%edx
|
||||
testl %ecx,%ecx
|
||||
vmovdqu 64(%rdi),%xmm7
|
||||
movl %ecx,52(%rsp)
|
||||
cmovleq %rsp,%r13
|
||||
subq %r13,%rbp
|
||||
movq %rbp,104(%rsp)
|
||||
movl 96(%rdi),%ecx
|
||||
movq 80(%rdi),%r14
|
||||
cmpl %edx,%ecx
|
||||
movq 88(%rdi),%rbp
|
||||
cmovgl %ecx,%edx
|
||||
testl %ecx,%ecx
|
||||
vmovdqu 104(%rdi),%xmm8
|
||||
movl %ecx,56(%rsp)
|
||||
cmovleq %rsp,%r14
|
||||
subq %r14,%rbp
|
||||
movq %rbp,112(%rsp)
|
||||
movl 136(%rdi),%ecx
|
||||
movq 120(%rdi),%r15
|
||||
cmpl %edx,%ecx
|
||||
movq 128(%rdi),%rbp
|
||||
cmovgl %ecx,%edx
|
||||
testl %ecx,%ecx
|
||||
vmovdqu 144(%rdi),%xmm9
|
||||
movl %ecx,60(%rsp)
|
||||
cmovleq %rsp,%r15
|
||||
subq %r15,%rbp
|
||||
movq %rbp,120(%rsp)
|
||||
testl %edx,%edx
|
||||
jz .Lenc8x_done
|
||||
|
||||
vmovups 16-120(%rsi),%xmm1
|
||||
vmovups 32-120(%rsi),%xmm0
|
||||
movl 240-120(%rsi),%eax
|
||||
|
||||
vpxor (%r8),%xmm15,%xmm10
|
||||
leaq 128(%rsp),%rbp
|
||||
vpxor (%r9),%xmm15,%xmm11
|
||||
vpxor (%r10),%xmm15,%xmm12
|
||||
vpxor (%r11),%xmm15,%xmm13
|
||||
vpxor %xmm10,%xmm2,%xmm2
|
||||
vpxor (%r12),%xmm15,%xmm10
|
||||
vpxor %xmm11,%xmm3,%xmm3
|
||||
vpxor (%r13),%xmm15,%xmm11
|
||||
vpxor %xmm12,%xmm4,%xmm4
|
||||
vpxor (%r14),%xmm15,%xmm12
|
||||
vpxor %xmm13,%xmm5,%xmm5
|
||||
vpxor (%r15),%xmm15,%xmm13
|
||||
vpxor %xmm10,%xmm6,%xmm6
|
||||
movl $1,%ecx
|
||||
vpxor %xmm11,%xmm7,%xmm7
|
||||
vpxor %xmm12,%xmm8,%xmm8
|
||||
vpxor %xmm13,%xmm9,%xmm9
|
||||
jmp .Loop_enc8x
|
||||
|
||||
.align 32
|
||||
.Loop_enc8x:
|
||||
vaesenc %xmm1,%xmm2,%xmm2
|
||||
cmpl 32+0(%rsp),%ecx
|
||||
vaesenc %xmm1,%xmm3,%xmm3
|
||||
prefetcht0 31(%r8)
|
||||
vaesenc %xmm1,%xmm4,%xmm4
|
||||
vaesenc %xmm1,%xmm5,%xmm5
|
||||
leaq (%r8,%rbx,1),%rbx
|
||||
cmovgeq %rsp,%r8
|
||||
vaesenc %xmm1,%xmm6,%xmm6
|
||||
cmovgq %rsp,%rbx
|
||||
vaesenc %xmm1,%xmm7,%xmm7
|
||||
subq %r8,%rbx
|
||||
vaesenc %xmm1,%xmm8,%xmm8
|
||||
vpxor 16(%r8),%xmm15,%xmm10
|
||||
movq %rbx,64+0(%rsp)
|
||||
vaesenc %xmm1,%xmm9,%xmm9
|
||||
vmovups -72(%rsi),%xmm1
|
||||
leaq 16(%r8,%rbx,1),%r8
|
||||
vmovdqu %xmm10,0(%rbp)
|
||||
vaesenc %xmm0,%xmm2,%xmm2
|
||||
cmpl 32+4(%rsp),%ecx
|
||||
movq 64+8(%rsp),%rbx
|
||||
vaesenc %xmm0,%xmm3,%xmm3
|
||||
prefetcht0 31(%r9)
|
||||
vaesenc %xmm0,%xmm4,%xmm4
|
||||
vaesenc %xmm0,%xmm5,%xmm5
|
||||
leaq (%r9,%rbx,1),%rbx
|
||||
cmovgeq %rsp,%r9
|
||||
vaesenc %xmm0,%xmm6,%xmm6
|
||||
cmovgq %rsp,%rbx
|
||||
vaesenc %xmm0,%xmm7,%xmm7
|
||||
subq %r9,%rbx
|
||||
vaesenc %xmm0,%xmm8,%xmm8
|
||||
vpxor 16(%r9),%xmm15,%xmm11
|
||||
movq %rbx,64+8(%rsp)
|
||||
vaesenc %xmm0,%xmm9,%xmm9
|
||||
vmovups -56(%rsi),%xmm0
|
||||
leaq 16(%r9,%rbx,1),%r9
|
||||
vmovdqu %xmm11,16(%rbp)
|
||||
vaesenc %xmm1,%xmm2,%xmm2
|
||||
cmpl 32+8(%rsp),%ecx
|
||||
movq 64+16(%rsp),%rbx
|
||||
vaesenc %xmm1,%xmm3,%xmm3
|
||||
prefetcht0 31(%r10)
|
||||
vaesenc %xmm1,%xmm4,%xmm4
|
||||
prefetcht0 15(%r8)
|
||||
vaesenc %xmm1,%xmm5,%xmm5
|
||||
leaq (%r10,%rbx,1),%rbx
|
||||
cmovgeq %rsp,%r10
|
||||
vaesenc %xmm1,%xmm6,%xmm6
|
||||
cmovgq %rsp,%rbx
|
||||
vaesenc %xmm1,%xmm7,%xmm7
|
||||
subq %r10,%rbx
|
||||
vaesenc %xmm1,%xmm8,%xmm8
|
||||
vpxor 16(%r10),%xmm15,%xmm12
|
||||
movq %rbx,64+16(%rsp)
|
||||
vaesenc %xmm1,%xmm9,%xmm9
|
||||
vmovups -40(%rsi),%xmm1
|
||||
leaq 16(%r10,%rbx,1),%r10
|
||||
vmovdqu %xmm12,32(%rbp)
|
||||
vaesenc %xmm0,%xmm2,%xmm2
|
||||
cmpl 32+12(%rsp),%ecx
|
||||
movq 64+24(%rsp),%rbx
|
||||
vaesenc %xmm0,%xmm3,%xmm3
|
||||
prefetcht0 31(%r11)
|
||||
vaesenc %xmm0,%xmm4,%xmm4
|
||||
prefetcht0 15(%r9)
|
||||
vaesenc %xmm0,%xmm5,%xmm5
|
||||
leaq (%r11,%rbx,1),%rbx
|
||||
cmovgeq %rsp,%r11
|
||||
vaesenc %xmm0,%xmm6,%xmm6
|
||||
cmovgq %rsp,%rbx
|
||||
vaesenc %xmm0,%xmm7,%xmm7
|
||||
subq %r11,%rbx
|
||||
vaesenc %xmm0,%xmm8,%xmm8
|
||||
vpxor 16(%r11),%xmm15,%xmm13
|
||||
movq %rbx,64+24(%rsp)
|
||||
vaesenc %xmm0,%xmm9,%xmm9
|
||||
vmovups -24(%rsi),%xmm0
|
||||
leaq 16(%r11,%rbx,1),%r11
|
||||
vmovdqu %xmm13,48(%rbp)
|
||||
vaesenc %xmm1,%xmm2,%xmm2
|
||||
cmpl 32+16(%rsp),%ecx
|
||||
movq 64+32(%rsp),%rbx
|
||||
vaesenc %xmm1,%xmm3,%xmm3
|
||||
prefetcht0 31(%r12)
|
||||
vaesenc %xmm1,%xmm4,%xmm4
|
||||
prefetcht0 15(%r10)
|
||||
vaesenc %xmm1,%xmm5,%xmm5
|
||||
leaq (%r12,%rbx,1),%rbx
|
||||
cmovgeq %rsp,%r12
|
||||
vaesenc %xmm1,%xmm6,%xmm6
|
||||
cmovgq %rsp,%rbx
|
||||
vaesenc %xmm1,%xmm7,%xmm7
|
||||
subq %r12,%rbx
|
||||
vaesenc %xmm1,%xmm8,%xmm8
|
||||
vpxor 16(%r12),%xmm15,%xmm10
|
||||
movq %rbx,64+32(%rsp)
|
||||
vaesenc %xmm1,%xmm9,%xmm9
|
||||
vmovups -8(%rsi),%xmm1
|
||||
leaq 16(%r12,%rbx,1),%r12
|
||||
vaesenc %xmm0,%xmm2,%xmm2
|
||||
cmpl 32+20(%rsp),%ecx
|
||||
movq 64+40(%rsp),%rbx
|
||||
vaesenc %xmm0,%xmm3,%xmm3
|
||||
prefetcht0 31(%r13)
|
||||
vaesenc %xmm0,%xmm4,%xmm4
|
||||
prefetcht0 15(%r11)
|
||||
vaesenc %xmm0,%xmm5,%xmm5
|
||||
leaq (%rbx,%r13,1),%rbx
|
||||
cmovgeq %rsp,%r13
|
||||
vaesenc %xmm0,%xmm6,%xmm6
|
||||
cmovgq %rsp,%rbx
|
||||
vaesenc %xmm0,%xmm7,%xmm7
|
||||
subq %r13,%rbx
|
||||
vaesenc %xmm0,%xmm8,%xmm8
|
||||
vpxor 16(%r13),%xmm15,%xmm11
|
||||
movq %rbx,64+40(%rsp)
|
||||
vaesenc %xmm0,%xmm9,%xmm9
|
||||
vmovups 8(%rsi),%xmm0
|
||||
leaq 16(%r13,%rbx,1),%r13
|
||||
vaesenc %xmm1,%xmm2,%xmm2
|
||||
cmpl 32+24(%rsp),%ecx
|
||||
movq 64+48(%rsp),%rbx
|
||||
vaesenc %xmm1,%xmm3,%xmm3
|
||||
prefetcht0 31(%r14)
|
||||
vaesenc %xmm1,%xmm4,%xmm4
|
||||
prefetcht0 15(%r12)
|
||||
vaesenc %xmm1,%xmm5,%xmm5
|
||||
leaq (%r14,%rbx,1),%rbx
|
||||
cmovgeq %rsp,%r14
|
||||
vaesenc %xmm1,%xmm6,%xmm6
|
||||
cmovgq %rsp,%rbx
|
||||
vaesenc %xmm1,%xmm7,%xmm7
|
||||
subq %r14,%rbx
|
||||
vaesenc %xmm1,%xmm8,%xmm8
|
||||
vpxor 16(%r14),%xmm15,%xmm12
|
||||
movq %rbx,64+48(%rsp)
|
||||
vaesenc %xmm1,%xmm9,%xmm9
|
||||
vmovups 24(%rsi),%xmm1
|
||||
leaq 16(%r14,%rbx,1),%r14
|
||||
vaesenc %xmm0,%xmm2,%xmm2
|
||||
cmpl 32+28(%rsp),%ecx
|
||||
movq 64+56(%rsp),%rbx
|
||||
vaesenc %xmm0,%xmm3,%xmm3
|
||||
prefetcht0 31(%r15)
|
||||
vaesenc %xmm0,%xmm4,%xmm4
|
||||
prefetcht0 15(%r13)
|
||||
vaesenc %xmm0,%xmm5,%xmm5
|
||||
leaq (%r15,%rbx,1),%rbx
|
||||
cmovgeq %rsp,%r15
|
||||
vaesenc %xmm0,%xmm6,%xmm6
|
||||
cmovgq %rsp,%rbx
|
||||
vaesenc %xmm0,%xmm7,%xmm7
|
||||
subq %r15,%rbx
|
||||
vaesenc %xmm0,%xmm8,%xmm8
|
||||
vpxor 16(%r15),%xmm15,%xmm13
|
||||
movq %rbx,64+56(%rsp)
|
||||
vaesenc %xmm0,%xmm9,%xmm9
|
||||
vmovups 40(%rsi),%xmm0
|
||||
leaq 16(%r15,%rbx,1),%r15
|
||||
vmovdqu 32(%rsp),%xmm14
|
||||
prefetcht0 15(%r14)
|
||||
prefetcht0 15(%r15)
|
||||
cmpl $11,%eax
|
||||
jb .Lenc8x_tail
|
||||
|
||||
vaesenc %xmm1,%xmm2,%xmm2
|
||||
vaesenc %xmm1,%xmm3,%xmm3
|
||||
vaesenc %xmm1,%xmm4,%xmm4
|
||||
vaesenc %xmm1,%xmm5,%xmm5
|
||||
vaesenc %xmm1,%xmm6,%xmm6
|
||||
vaesenc %xmm1,%xmm7,%xmm7
|
||||
vaesenc %xmm1,%xmm8,%xmm8
|
||||
vaesenc %xmm1,%xmm9,%xmm9
|
||||
vmovups 176-120(%rsi),%xmm1
|
||||
|
||||
vaesenc %xmm0,%xmm2,%xmm2
|
||||
vaesenc %xmm0,%xmm3,%xmm3
|
||||
vaesenc %xmm0,%xmm4,%xmm4
|
||||
vaesenc %xmm0,%xmm5,%xmm5
|
||||
vaesenc %xmm0,%xmm6,%xmm6
|
||||
vaesenc %xmm0,%xmm7,%xmm7
|
||||
vaesenc %xmm0,%xmm8,%xmm8
|
||||
vaesenc %xmm0,%xmm9,%xmm9
|
||||
vmovups 192-120(%rsi),%xmm0
|
||||
je .Lenc8x_tail
|
||||
|
||||
vaesenc %xmm1,%xmm2,%xmm2
|
||||
vaesenc %xmm1,%xmm3,%xmm3
|
||||
vaesenc %xmm1,%xmm4,%xmm4
|
||||
vaesenc %xmm1,%xmm5,%xmm5
|
||||
vaesenc %xmm1,%xmm6,%xmm6
|
||||
vaesenc %xmm1,%xmm7,%xmm7
|
||||
vaesenc %xmm1,%xmm8,%xmm8
|
||||
vaesenc %xmm1,%xmm9,%xmm9
|
||||
vmovups 208-120(%rsi),%xmm1
|
||||
|
||||
vaesenc %xmm0,%xmm2,%xmm2
|
||||
vaesenc %xmm0,%xmm3,%xmm3
|
||||
vaesenc %xmm0,%xmm4,%xmm4
|
||||
vaesenc %xmm0,%xmm5,%xmm5
|
||||
vaesenc %xmm0,%xmm6,%xmm6
|
||||
vaesenc %xmm0,%xmm7,%xmm7
|
||||
vaesenc %xmm0,%xmm8,%xmm8
|
||||
vaesenc %xmm0,%xmm9,%xmm9
|
||||
vmovups 224-120(%rsi),%xmm0
|
||||
|
||||
.Lenc8x_tail:
|
||||
vaesenc %xmm1,%xmm2,%xmm2
|
||||
vpxor %xmm15,%xmm15,%xmm15
|
||||
vaesenc %xmm1,%xmm3,%xmm3
|
||||
vaesenc %xmm1,%xmm4,%xmm4
|
||||
vpcmpgtd %xmm15,%xmm14,%xmm15
|
||||
vaesenc %xmm1,%xmm5,%xmm5
|
||||
vaesenc %xmm1,%xmm6,%xmm6
|
||||
vpaddd %xmm14,%xmm15,%xmm15
|
||||
vmovdqu 48(%rsp),%xmm14
|
||||
vaesenc %xmm1,%xmm7,%xmm7
|
||||
movq 64(%rsp),%rbx
|
||||
vaesenc %xmm1,%xmm8,%xmm8
|
||||
vaesenc %xmm1,%xmm9,%xmm9
|
||||
vmovups 16-120(%rsi),%xmm1
|
||||
|
||||
vaesenclast %xmm0,%xmm2,%xmm2
|
||||
vmovdqa %xmm15,32(%rsp)
|
||||
vpxor %xmm15,%xmm15,%xmm15
|
||||
vaesenclast %xmm0,%xmm3,%xmm3
|
||||
vaesenclast %xmm0,%xmm4,%xmm4
|
||||
vpcmpgtd %xmm15,%xmm14,%xmm15
|
||||
vaesenclast %xmm0,%xmm5,%xmm5
|
||||
vaesenclast %xmm0,%xmm6,%xmm6
|
||||
vpaddd %xmm15,%xmm14,%xmm14
|
||||
vmovdqu -120(%rsi),%xmm15
|
||||
vaesenclast %xmm0,%xmm7,%xmm7
|
||||
vaesenclast %xmm0,%xmm8,%xmm8
|
||||
vmovdqa %xmm14,48(%rsp)
|
||||
vaesenclast %xmm0,%xmm9,%xmm9
|
||||
vmovups 32-120(%rsi),%xmm0
|
||||
|
||||
vmovups %xmm2,-16(%r8)
|
||||
subq %rbx,%r8
|
||||
vpxor 0(%rbp),%xmm2,%xmm2
|
||||
vmovups %xmm3,-16(%r9)
|
||||
subq 72(%rsp),%r9
|
||||
vpxor 16(%rbp),%xmm3,%xmm3
|
||||
vmovups %xmm4,-16(%r10)
|
||||
subq 80(%rsp),%r10
|
||||
vpxor 32(%rbp),%xmm4,%xmm4
|
||||
vmovups %xmm5,-16(%r11)
|
||||
subq 88(%rsp),%r11
|
||||
vpxor 48(%rbp),%xmm5,%xmm5
|
||||
vmovups %xmm6,-16(%r12)
|
||||
subq 96(%rsp),%r12
|
||||
vpxor %xmm10,%xmm6,%xmm6
|
||||
vmovups %xmm7,-16(%r13)
|
||||
subq 104(%rsp),%r13
|
||||
vpxor %xmm11,%xmm7,%xmm7
|
||||
vmovups %xmm8,-16(%r14)
|
||||
subq 112(%rsp),%r14
|
||||
vpxor %xmm12,%xmm8,%xmm8
|
||||
vmovups %xmm9,-16(%r15)
|
||||
subq 120(%rsp),%r15
|
||||
vpxor %xmm13,%xmm9,%xmm9
|
||||
|
||||
decl %edx
|
||||
jnz .Loop_enc8x
|
||||
|
||||
movq 16(%rsp),%rax
|
||||
.cfi_def_cfa %rax,8
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.Lenc8x_done:
|
||||
vzeroupper
|
||||
movq -48(%rax),%r15
|
||||
.cfi_restore %r15
|
||||
movq -40(%rax),%r14
|
||||
.cfi_restore %r14
|
||||
movq -32(%rax),%r13
|
||||
.cfi_restore %r13
|
||||
movq -24(%rax),%r12
|
||||
.cfi_restore %r12
|
||||
movq -16(%rax),%rbp
|
||||
.cfi_restore %rbp
|
||||
movq -8(%rax),%rbx
|
||||
.cfi_restore %rbx
|
||||
leaq (%rax),%rsp
|
||||
.cfi_def_cfa_register %rsp
|
||||
.Lenc8x_epilogue:
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
|
||||
|
||||
.type aesni_multi_cbc_decrypt_avx,@function
|
||||
.align 32
|
||||
aesni_multi_cbc_decrypt_avx:
|
||||
.cfi_startproc
|
||||
_avx_cbc_dec_shortcut:
|
||||
movq %rsp,%rax
|
||||
.cfi_def_cfa_register %rax
|
||||
pushq %rbx
|
||||
.cfi_offset %rbx,-16
|
||||
pushq %rbp
|
||||
.cfi_offset %rbp,-24
|
||||
pushq %r12
|
||||
.cfi_offset %r12,-32
|
||||
pushq %r13
|
||||
.cfi_offset %r13,-40
|
||||
pushq %r14
|
||||
.cfi_offset %r14,-48
|
||||
pushq %r15
|
||||
.cfi_offset %r15,-56
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
subq $256,%rsp
|
||||
andq $-256,%rsp
|
||||
subq $192,%rsp
|
||||
movq %rax,16(%rsp)
|
||||
.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
|
||||
|
||||
.Ldec8x_body:
|
||||
vzeroupper
|
||||
vmovdqu (%rsi),%xmm15
|
||||
leaq 120(%rsi),%rsi
|
||||
leaq 160(%rdi),%rdi
|
||||
shrl $1,%edx
|
||||
|
||||
.Ldec8x_loop_grande:
|
||||
|
||||
xorl %edx,%edx
|
||||
movl -144(%rdi),%ecx
|
||||
movq -160(%rdi),%r8
|
||||
cmpl %edx,%ecx
|
||||
movq -152(%rdi),%rbx
|
||||
cmovgl %ecx,%edx
|
||||
testl %ecx,%ecx
|
||||
vmovdqu -136(%rdi),%xmm2
|
||||
movl %ecx,32(%rsp)
|
||||
cmovleq %rsp,%r8
|
||||
subq %r8,%rbx
|
||||
movq %rbx,64(%rsp)
|
||||
vmovdqu %xmm2,192(%rsp)
|
||||
movl -104(%rdi),%ecx
|
||||
movq -120(%rdi),%r9
|
||||
cmpl %edx,%ecx
|
||||
movq -112(%rdi),%rbp
|
||||
cmovgl %ecx,%edx
|
||||
testl %ecx,%ecx
|
||||
vmovdqu -96(%rdi),%xmm3
|
||||
movl %ecx,36(%rsp)
|
||||
cmovleq %rsp,%r9
|
||||
subq %r9,%rbp
|
||||
movq %rbp,72(%rsp)
|
||||
vmovdqu %xmm3,208(%rsp)
|
||||
movl -64(%rdi),%ecx
|
||||
movq -80(%rdi),%r10
|
||||
cmpl %edx,%ecx
|
||||
movq -72(%rdi),%rbp
|
||||
cmovgl %ecx,%edx
|
||||
testl %ecx,%ecx
|
||||
vmovdqu -56(%rdi),%xmm4
|
||||
movl %ecx,40(%rsp)
|
||||
cmovleq %rsp,%r10
|
||||
subq %r10,%rbp
|
||||
movq %rbp,80(%rsp)
|
||||
vmovdqu %xmm4,224(%rsp)
|
||||
movl -24(%rdi),%ecx
|
||||
movq -40(%rdi),%r11
|
||||
cmpl %edx,%ecx
|
||||
movq -32(%rdi),%rbp
|
||||
cmovgl %ecx,%edx
|
||||
testl %ecx,%ecx
|
||||
vmovdqu -16(%rdi),%xmm5
|
||||
movl %ecx,44(%rsp)
|
||||
cmovleq %rsp,%r11
|
||||
subq %r11,%rbp
|
||||
movq %rbp,88(%rsp)
|
||||
vmovdqu %xmm5,240(%rsp)
|
||||
movl 16(%rdi),%ecx
|
||||
movq 0(%rdi),%r12
|
||||
cmpl %edx,%ecx
|
||||
movq 8(%rdi),%rbp
|
||||
cmovgl %ecx,%edx
|
||||
testl %ecx,%ecx
|
||||
vmovdqu 24(%rdi),%xmm6
|
||||
movl %ecx,48(%rsp)
|
||||
cmovleq %rsp,%r12
|
||||
subq %r12,%rbp
|
||||
movq %rbp,96(%rsp)
|
||||
vmovdqu %xmm6,256(%rsp)
|
||||
movl 56(%rdi),%ecx
|
||||
movq 40(%rdi),%r13
|
||||
cmpl %edx,%ecx
|
||||
movq 48(%rdi),%rbp
|
||||
cmovgl %ecx,%edx
|
||||
testl %ecx,%ecx
|
||||
vmovdqu 64(%rdi),%xmm7
|
||||
movl %ecx,52(%rsp)
|
||||
cmovleq %rsp,%r13
|
||||
subq %r13,%rbp
|
||||
movq %rbp,104(%rsp)
|
||||
vmovdqu %xmm7,272(%rsp)
|
||||
movl 96(%rdi),%ecx
|
||||
movq 80(%rdi),%r14
|
||||
cmpl %edx,%ecx
|
||||
movq 88(%rdi),%rbp
|
||||
cmovgl %ecx,%edx
|
||||
testl %ecx,%ecx
|
||||
vmovdqu 104(%rdi),%xmm8
|
||||
movl %ecx,56(%rsp)
|
||||
cmovleq %rsp,%r14
|
||||
subq %r14,%rbp
|
||||
movq %rbp,112(%rsp)
|
||||
vmovdqu %xmm8,288(%rsp)
|
||||
movl 136(%rdi),%ecx
|
||||
movq 120(%rdi),%r15
|
||||
cmpl %edx,%ecx
|
||||
movq 128(%rdi),%rbp
|
||||
cmovgl %ecx,%edx
|
||||
testl %ecx,%ecx
|
||||
vmovdqu 144(%rdi),%xmm9
|
||||
movl %ecx,60(%rsp)
|
||||
cmovleq %rsp,%r15
|
||||
subq %r15,%rbp
|
||||
movq %rbp,120(%rsp)
|
||||
vmovdqu %xmm9,304(%rsp)
|
||||
testl %edx,%edx
|
||||
jz .Ldec8x_done
|
||||
|
||||
vmovups 16-120(%rsi),%xmm1
|
||||
vmovups 32-120(%rsi),%xmm0
|
||||
movl 240-120(%rsi),%eax
|
||||
leaq 192+128(%rsp),%rbp
|
||||
|
||||
vmovdqu (%r8),%xmm2
|
||||
vmovdqu (%r9),%xmm3
|
||||
vmovdqu (%r10),%xmm4
|
||||
vmovdqu (%r11),%xmm5
|
||||
vmovdqu (%r12),%xmm6
|
||||
vmovdqu (%r13),%xmm7
|
||||
vmovdqu (%r14),%xmm8
|
||||
vmovdqu (%r15),%xmm9
|
||||
vmovdqu %xmm2,0(%rbp)
|
||||
vpxor %xmm15,%xmm2,%xmm2
|
||||
vmovdqu %xmm3,16(%rbp)
|
||||
vpxor %xmm15,%xmm3,%xmm3
|
||||
vmovdqu %xmm4,32(%rbp)
|
||||
vpxor %xmm15,%xmm4,%xmm4
|
||||
vmovdqu %xmm5,48(%rbp)
|
||||
vpxor %xmm15,%xmm5,%xmm5
|
||||
vmovdqu %xmm6,64(%rbp)
|
||||
vpxor %xmm15,%xmm6,%xmm6
|
||||
vmovdqu %xmm7,80(%rbp)
|
||||
vpxor %xmm15,%xmm7,%xmm7
|
||||
vmovdqu %xmm8,96(%rbp)
|
||||
vpxor %xmm15,%xmm8,%xmm8
|
||||
vmovdqu %xmm9,112(%rbp)
|
||||
vpxor %xmm15,%xmm9,%xmm9
|
||||
xorq $0x80,%rbp
|
||||
movl $1,%ecx
|
||||
jmp .Loop_dec8x
|
||||
|
||||
.align 32
|
||||
.Loop_dec8x:
|
||||
vaesdec %xmm1,%xmm2,%xmm2
|
||||
cmpl 32+0(%rsp),%ecx
|
||||
vaesdec %xmm1,%xmm3,%xmm3
|
||||
prefetcht0 31(%r8)
|
||||
vaesdec %xmm1,%xmm4,%xmm4
|
||||
vaesdec %xmm1,%xmm5,%xmm5
|
||||
leaq (%r8,%rbx,1),%rbx
|
||||
cmovgeq %rsp,%r8
|
||||
vaesdec %xmm1,%xmm6,%xmm6
|
||||
cmovgq %rsp,%rbx
|
||||
vaesdec %xmm1,%xmm7,%xmm7
|
||||
subq %r8,%rbx
|
||||
vaesdec %xmm1,%xmm8,%xmm8
|
||||
vmovdqu 16(%r8),%xmm10
|
||||
movq %rbx,64+0(%rsp)
|
||||
vaesdec %xmm1,%xmm9,%xmm9
|
||||
vmovups -72(%rsi),%xmm1
|
||||
leaq 16(%r8,%rbx,1),%r8
|
||||
vmovdqu %xmm10,128(%rsp)
|
||||
vaesdec %xmm0,%xmm2,%xmm2
|
||||
cmpl 32+4(%rsp),%ecx
|
||||
movq 64+8(%rsp),%rbx
|
||||
vaesdec %xmm0,%xmm3,%xmm3
|
||||
prefetcht0 31(%r9)
|
||||
vaesdec %xmm0,%xmm4,%xmm4
|
||||
vaesdec %xmm0,%xmm5,%xmm5
|
||||
leaq (%r9,%rbx,1),%rbx
|
||||
cmovgeq %rsp,%r9
|
||||
vaesdec %xmm0,%xmm6,%xmm6
|
||||
cmovgq %rsp,%rbx
|
||||
vaesdec %xmm0,%xmm7,%xmm7
|
||||
subq %r9,%rbx
|
||||
vaesdec %xmm0,%xmm8,%xmm8
|
||||
vmovdqu 16(%r9),%xmm11
|
||||
movq %rbx,64+8(%rsp)
|
||||
vaesdec %xmm0,%xmm9,%xmm9
|
||||
vmovups -56(%rsi),%xmm0
|
||||
leaq 16(%r9,%rbx,1),%r9
|
||||
vmovdqu %xmm11,144(%rsp)
|
||||
vaesdec %xmm1,%xmm2,%xmm2
|
||||
cmpl 32+8(%rsp),%ecx
|
||||
movq 64+16(%rsp),%rbx
|
||||
vaesdec %xmm1,%xmm3,%xmm3
|
||||
prefetcht0 31(%r10)
|
||||
vaesdec %xmm1,%xmm4,%xmm4
|
||||
prefetcht0 15(%r8)
|
||||
vaesdec %xmm1,%xmm5,%xmm5
|
||||
leaq (%r10,%rbx,1),%rbx
|
||||
cmovgeq %rsp,%r10
|
||||
vaesdec %xmm1,%xmm6,%xmm6
|
||||
cmovgq %rsp,%rbx
|
||||
vaesdec %xmm1,%xmm7,%xmm7
|
||||
subq %r10,%rbx
|
||||
vaesdec %xmm1,%xmm8,%xmm8
|
||||
vmovdqu 16(%r10),%xmm12
|
||||
movq %rbx,64+16(%rsp)
|
||||
vaesdec %xmm1,%xmm9,%xmm9
|
||||
vmovups -40(%rsi),%xmm1
|
||||
leaq 16(%r10,%rbx,1),%r10
|
||||
vmovdqu %xmm12,160(%rsp)
|
||||
vaesdec %xmm0,%xmm2,%xmm2
|
||||
cmpl 32+12(%rsp),%ecx
|
||||
movq 64+24(%rsp),%rbx
|
||||
vaesdec %xmm0,%xmm3,%xmm3
|
||||
prefetcht0 31(%r11)
|
||||
vaesdec %xmm0,%xmm4,%xmm4
|
||||
prefetcht0 15(%r9)
|
||||
vaesdec %xmm0,%xmm5,%xmm5
|
||||
leaq (%r11,%rbx,1),%rbx
|
||||
cmovgeq %rsp,%r11
|
||||
vaesdec %xmm0,%xmm6,%xmm6
|
||||
cmovgq %rsp,%rbx
|
||||
vaesdec %xmm0,%xmm7,%xmm7
|
||||
subq %r11,%rbx
|
||||
vaesdec %xmm0,%xmm8,%xmm8
|
||||
vmovdqu 16(%r11),%xmm13
|
||||
movq %rbx,64+24(%rsp)
|
||||
vaesdec %xmm0,%xmm9,%xmm9
|
||||
vmovups -24(%rsi),%xmm0
|
||||
leaq 16(%r11,%rbx,1),%r11
|
||||
vmovdqu %xmm13,176(%rsp)
|
||||
vaesdec %xmm1,%xmm2,%xmm2
|
||||
cmpl 32+16(%rsp),%ecx
|
||||
movq 64+32(%rsp),%rbx
|
||||
vaesdec %xmm1,%xmm3,%xmm3
|
||||
prefetcht0 31(%r12)
|
||||
vaesdec %xmm1,%xmm4,%xmm4
|
||||
prefetcht0 15(%r10)
|
||||
vaesdec %xmm1,%xmm5,%xmm5
|
||||
leaq (%r12,%rbx,1),%rbx
|
||||
cmovgeq %rsp,%r12
|
||||
vaesdec %xmm1,%xmm6,%xmm6
|
||||
cmovgq %rsp,%rbx
|
||||
vaesdec %xmm1,%xmm7,%xmm7
|
||||
subq %r12,%rbx
|
||||
vaesdec %xmm1,%xmm8,%xmm8
|
||||
vmovdqu 16(%r12),%xmm10
|
||||
movq %rbx,64+32(%rsp)
|
||||
vaesdec %xmm1,%xmm9,%xmm9
|
||||
vmovups -8(%rsi),%xmm1
|
||||
leaq 16(%r12,%rbx,1),%r12
|
||||
vaesdec %xmm0,%xmm2,%xmm2
|
||||
cmpl 32+20(%rsp),%ecx
|
||||
movq 64+40(%rsp),%rbx
|
||||
vaesdec %xmm0,%xmm3,%xmm3
|
||||
prefetcht0 31(%r13)
|
||||
vaesdec %xmm0,%xmm4,%xmm4
|
||||
prefetcht0 15(%r11)
|
||||
vaesdec %xmm0,%xmm5,%xmm5
|
||||
leaq (%rbx,%r13,1),%rbx
|
||||
cmovgeq %rsp,%r13
|
||||
vaesdec %xmm0,%xmm6,%xmm6
|
||||
cmovgq %rsp,%rbx
|
||||
vaesdec %xmm0,%xmm7,%xmm7
|
||||
subq %r13,%rbx
|
||||
vaesdec %xmm0,%xmm8,%xmm8
|
||||
vmovdqu 16(%r13),%xmm11
|
||||
movq %rbx,64+40(%rsp)
|
||||
vaesdec %xmm0,%xmm9,%xmm9
|
||||
vmovups 8(%rsi),%xmm0
|
||||
leaq 16(%r13,%rbx,1),%r13
|
||||
vaesdec %xmm1,%xmm2,%xmm2
|
||||
cmpl 32+24(%rsp),%ecx
|
||||
movq 64+48(%rsp),%rbx
|
||||
vaesdec %xmm1,%xmm3,%xmm3
|
||||
prefetcht0 31(%r14)
|
||||
vaesdec %xmm1,%xmm4,%xmm4
|
||||
prefetcht0 15(%r12)
|
||||
vaesdec %xmm1,%xmm5,%xmm5
|
||||
leaq (%r14,%rbx,1),%rbx
|
||||
cmovgeq %rsp,%r14
|
||||
vaesdec %xmm1,%xmm6,%xmm6
|
||||
cmovgq %rsp,%rbx
|
||||
vaesdec %xmm1,%xmm7,%xmm7
|
||||
subq %r14,%rbx
|
||||
vaesdec %xmm1,%xmm8,%xmm8
|
||||
vmovdqu 16(%r14),%xmm12
|
||||
movq %rbx,64+48(%rsp)
|
||||
vaesdec %xmm1,%xmm9,%xmm9
|
||||
vmovups 24(%rsi),%xmm1
|
||||
leaq 16(%r14,%rbx,1),%r14
|
||||
vaesdec %xmm0,%xmm2,%xmm2
|
||||
cmpl 32+28(%rsp),%ecx
|
||||
movq 64+56(%rsp),%rbx
|
||||
vaesdec %xmm0,%xmm3,%xmm3
|
||||
prefetcht0 31(%r15)
|
||||
vaesdec %xmm0,%xmm4,%xmm4
|
||||
prefetcht0 15(%r13)
|
||||
vaesdec %xmm0,%xmm5,%xmm5
|
||||
leaq (%r15,%rbx,1),%rbx
|
||||
cmovgeq %rsp,%r15
|
||||
vaesdec %xmm0,%xmm6,%xmm6
|
||||
cmovgq %rsp,%rbx
|
||||
vaesdec %xmm0,%xmm7,%xmm7
|
||||
subq %r15,%rbx
|
||||
vaesdec %xmm0,%xmm8,%xmm8
|
||||
vmovdqu 16(%r15),%xmm13
|
||||
movq %rbx,64+56(%rsp)
|
||||
vaesdec %xmm0,%xmm9,%xmm9
|
||||
vmovups 40(%rsi),%xmm0
|
||||
leaq 16(%r15,%rbx,1),%r15
|
||||
vmovdqu 32(%rsp),%xmm14
|
||||
prefetcht0 15(%r14)
|
||||
prefetcht0 15(%r15)
|
||||
cmpl $11,%eax
|
||||
jb .Ldec8x_tail
|
||||
|
||||
vaesdec %xmm1,%xmm2,%xmm2
|
||||
vaesdec %xmm1,%xmm3,%xmm3
|
||||
vaesdec %xmm1,%xmm4,%xmm4
|
||||
vaesdec %xmm1,%xmm5,%xmm5
|
||||
vaesdec %xmm1,%xmm6,%xmm6
|
||||
vaesdec %xmm1,%xmm7,%xmm7
|
||||
vaesdec %xmm1,%xmm8,%xmm8
|
||||
vaesdec %xmm1,%xmm9,%xmm9
|
||||
vmovups 176-120(%rsi),%xmm1
|
||||
|
||||
vaesdec %xmm0,%xmm2,%xmm2
|
||||
vaesdec %xmm0,%xmm3,%xmm3
|
||||
vaesdec %xmm0,%xmm4,%xmm4
|
||||
vaesdec %xmm0,%xmm5,%xmm5
|
||||
vaesdec %xmm0,%xmm6,%xmm6
|
||||
vaesdec %xmm0,%xmm7,%xmm7
|
||||
vaesdec %xmm0,%xmm8,%xmm8
|
||||
vaesdec %xmm0,%xmm9,%xmm9
|
||||
vmovups 192-120(%rsi),%xmm0
|
||||
je .Ldec8x_tail
|
||||
|
||||
vaesdec %xmm1,%xmm2,%xmm2
|
||||
vaesdec %xmm1,%xmm3,%xmm3
|
||||
vaesdec %xmm1,%xmm4,%xmm4
|
||||
vaesdec %xmm1,%xmm5,%xmm5
|
||||
vaesdec %xmm1,%xmm6,%xmm6
|
||||
vaesdec %xmm1,%xmm7,%xmm7
|
||||
vaesdec %xmm1,%xmm8,%xmm8
|
||||
vaesdec %xmm1,%xmm9,%xmm9
|
||||
vmovups 208-120(%rsi),%xmm1
|
||||
|
||||
vaesdec %xmm0,%xmm2,%xmm2
|
||||
vaesdec %xmm0,%xmm3,%xmm3
|
||||
vaesdec %xmm0,%xmm4,%xmm4
|
||||
vaesdec %xmm0,%xmm5,%xmm5
|
||||
vaesdec %xmm0,%xmm6,%xmm6
|
||||
vaesdec %xmm0,%xmm7,%xmm7
|
||||
vaesdec %xmm0,%xmm8,%xmm8
|
||||
vaesdec %xmm0,%xmm9,%xmm9
|
||||
vmovups 224-120(%rsi),%xmm0
|
||||
|
||||
.Ldec8x_tail:
|
||||
vaesdec %xmm1,%xmm2,%xmm2
|
||||
vpxor %xmm15,%xmm15,%xmm15
|
||||
vaesdec %xmm1,%xmm3,%xmm3
|
||||
vaesdec %xmm1,%xmm4,%xmm4
|
||||
vpcmpgtd %xmm15,%xmm14,%xmm15
|
||||
vaesdec %xmm1,%xmm5,%xmm5
|
||||
vaesdec %xmm1,%xmm6,%xmm6
|
||||
vpaddd %xmm14,%xmm15,%xmm15
|
||||
vmovdqu 48(%rsp),%xmm14
|
||||
vaesdec %xmm1,%xmm7,%xmm7
|
||||
movq 64(%rsp),%rbx
|
||||
vaesdec %xmm1,%xmm8,%xmm8
|
||||
vaesdec %xmm1,%xmm9,%xmm9
|
||||
vmovups 16-120(%rsi),%xmm1
|
||||
|
||||
vaesdeclast %xmm0,%xmm2,%xmm2
|
||||
vmovdqa %xmm15,32(%rsp)
|
||||
vpxor %xmm15,%xmm15,%xmm15
|
||||
vaesdeclast %xmm0,%xmm3,%xmm3
|
||||
vpxor 0(%rbp),%xmm2,%xmm2
|
||||
vaesdeclast %xmm0,%xmm4,%xmm4
|
||||
vpxor 16(%rbp),%xmm3,%xmm3
|
||||
vpcmpgtd %xmm15,%xmm14,%xmm15
|
||||
vaesdeclast %xmm0,%xmm5,%xmm5
|
||||
vpxor 32(%rbp),%xmm4,%xmm4
|
||||
vaesdeclast %xmm0,%xmm6,%xmm6
|
||||
vpxor 48(%rbp),%xmm5,%xmm5
|
||||
vpaddd %xmm15,%xmm14,%xmm14
|
||||
vmovdqu -120(%rsi),%xmm15
|
||||
vaesdeclast %xmm0,%xmm7,%xmm7
|
||||
vpxor 64(%rbp),%xmm6,%xmm6
|
||||
vaesdeclast %xmm0,%xmm8,%xmm8
|
||||
vpxor 80(%rbp),%xmm7,%xmm7
|
||||
vmovdqa %xmm14,48(%rsp)
|
||||
vaesdeclast %xmm0,%xmm9,%xmm9
|
||||
vpxor 96(%rbp),%xmm8,%xmm8
|
||||
vmovups 32-120(%rsi),%xmm0
|
||||
|
||||
vmovups %xmm2,-16(%r8)
|
||||
subq %rbx,%r8
|
||||
vmovdqu 128+0(%rsp),%xmm2
|
||||
vpxor 112(%rbp),%xmm9,%xmm9
|
||||
vmovups %xmm3,-16(%r9)
|
||||
subq 72(%rsp),%r9
|
||||
vmovdqu %xmm2,0(%rbp)
|
||||
vpxor %xmm15,%xmm2,%xmm2
|
||||
vmovdqu 128+16(%rsp),%xmm3
|
||||
vmovups %xmm4,-16(%r10)
|
||||
subq 80(%rsp),%r10
|
||||
vmovdqu %xmm3,16(%rbp)
|
||||
vpxor %xmm15,%xmm3,%xmm3
|
||||
vmovdqu 128+32(%rsp),%xmm4
|
||||
vmovups %xmm5,-16(%r11)
|
||||
subq 88(%rsp),%r11
|
||||
vmovdqu %xmm4,32(%rbp)
|
||||
vpxor %xmm15,%xmm4,%xmm4
|
||||
vmovdqu 128+48(%rsp),%xmm5
|
||||
vmovups %xmm6,-16(%r12)
|
||||
subq 96(%rsp),%r12
|
||||
vmovdqu %xmm5,48(%rbp)
|
||||
vpxor %xmm15,%xmm5,%xmm5
|
||||
vmovdqu %xmm10,64(%rbp)
|
||||
vpxor %xmm10,%xmm15,%xmm6
|
||||
vmovups %xmm7,-16(%r13)
|
||||
subq 104(%rsp),%r13
|
||||
vmovdqu %xmm11,80(%rbp)
|
||||
vpxor %xmm11,%xmm15,%xmm7
|
||||
vmovups %xmm8,-16(%r14)
|
||||
subq 112(%rsp),%r14
|
||||
vmovdqu %xmm12,96(%rbp)
|
||||
vpxor %xmm12,%xmm15,%xmm8
|
||||
vmovups %xmm9,-16(%r15)
|
||||
subq 120(%rsp),%r15
|
||||
vmovdqu %xmm13,112(%rbp)
|
||||
vpxor %xmm13,%xmm15,%xmm9
|
||||
|
||||
xorq $128,%rbp
|
||||
decl %edx
|
||||
jnz .Loop_dec8x
|
||||
|
||||
movq 16(%rsp),%rax
|
||||
.cfi_def_cfa %rax,8
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.Ldec8x_done:
|
||||
vzeroupper
|
||||
movq -48(%rax),%r15
|
||||
.cfi_restore %r15
|
||||
movq -40(%rax),%r14
|
||||
.cfi_restore %r14
|
||||
movq -32(%rax),%r13
|
||||
.cfi_restore %r13
|
||||
movq -24(%rax),%r12
|
||||
.cfi_restore %r12
|
||||
movq -16(%rax),%rbp
|
||||
.cfi_restore %rbp
|
||||
movq -8(%rax),%rbx
|
||||
.cfi_restore %rbx
|
||||
leaq (%rax),%rsp
|
||||
.cfi_def_cfa_register %rsp
|
||||
.Ldec8x_epilogue:
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
|
||||
|
File diff suppressed because it is too large (four additional files)
@@ -1304,7 +1304,108 @@ gcm_ghash_clmul:
|
||||
.align 32
|
||||
gcm_init_avx:
|
||||
.cfi_startproc
|
||||
jmp .L_init_clmul
|
||||
vzeroupper
|
||||
|
||||
vmovdqu (%rsi),%xmm2
|
||||
vpshufd $78,%xmm2,%xmm2
|
||||
|
||||
|
||||
vpshufd $255,%xmm2,%xmm4
|
||||
vpsrlq $63,%xmm2,%xmm3
|
||||
vpsllq $1,%xmm2,%xmm2
|
||||
vpxor %xmm5,%xmm5,%xmm5
|
||||
vpcmpgtd %xmm4,%xmm5,%xmm5
|
||||
vpslldq $8,%xmm3,%xmm3
|
||||
vpor %xmm3,%xmm2,%xmm2
|
||||
|
||||
|
||||
vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
|
||||
vpxor %xmm5,%xmm2,%xmm2
|
||||
|
||||
vpunpckhqdq %xmm2,%xmm2,%xmm6
|
||||
vmovdqa %xmm2,%xmm0
|
||||
vpxor %xmm2,%xmm6,%xmm6
|
||||
movq $4,%r10
|
||||
jmp .Linit_start_avx
|
||||
.align 32
|
||||
.Linit_loop_avx:
|
||||
vpalignr $8,%xmm3,%xmm4,%xmm5
|
||||
vmovdqu %xmm5,-16(%rdi)
|
||||
vpunpckhqdq %xmm0,%xmm0,%xmm3
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
|
||||
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
|
||||
vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
|
||||
vpxor %xmm0,%xmm1,%xmm4
|
||||
vpxor %xmm4,%xmm3,%xmm3
|
||||
|
||||
vpslldq $8,%xmm3,%xmm4
|
||||
vpsrldq $8,%xmm3,%xmm3
|
||||
vpxor %xmm4,%xmm0,%xmm0
|
||||
vpxor %xmm3,%xmm1,%xmm1
|
||||
vpsllq $57,%xmm0,%xmm3
|
||||
vpsllq $62,%xmm0,%xmm4
|
||||
vpxor %xmm3,%xmm4,%xmm4
|
||||
vpsllq $63,%xmm0,%xmm3
|
||||
vpxor %xmm3,%xmm4,%xmm4
|
||||
vpslldq $8,%xmm4,%xmm3
|
||||
vpsrldq $8,%xmm4,%xmm4
|
||||
vpxor %xmm3,%xmm0,%xmm0
|
||||
vpxor %xmm4,%xmm1,%xmm1
|
||||
|
||||
vpsrlq $1,%xmm0,%xmm4
|
||||
vpxor %xmm0,%xmm1,%xmm1
|
||||
vpxor %xmm4,%xmm0,%xmm0
|
||||
vpsrlq $5,%xmm4,%xmm4
|
||||
vpxor %xmm4,%xmm0,%xmm0
|
||||
vpsrlq $1,%xmm0,%xmm0
|
||||
vpxor %xmm1,%xmm0,%xmm0
|
||||
.Linit_start_avx:
|
||||
vmovdqa %xmm0,%xmm5
|
||||
vpunpckhqdq %xmm0,%xmm0,%xmm3
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
|
||||
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
|
||||
vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
|
||||
vpxor %xmm0,%xmm1,%xmm4
|
||||
vpxor %xmm4,%xmm3,%xmm3
|
||||
|
||||
vpslldq $8,%xmm3,%xmm4
|
||||
vpsrldq $8,%xmm3,%xmm3
|
||||
vpxor %xmm4,%xmm0,%xmm0
|
||||
vpxor %xmm3,%xmm1,%xmm1
|
||||
vpsllq $57,%xmm0,%xmm3
|
||||
vpsllq $62,%xmm0,%xmm4
|
||||
vpxor %xmm3,%xmm4,%xmm4
|
||||
vpsllq $63,%xmm0,%xmm3
|
||||
vpxor %xmm3,%xmm4,%xmm4
|
||||
vpslldq $8,%xmm4,%xmm3
|
||||
vpsrldq $8,%xmm4,%xmm4
|
||||
vpxor %xmm3,%xmm0,%xmm0
|
||||
vpxor %xmm4,%xmm1,%xmm1
|
||||
|
||||
vpsrlq $1,%xmm0,%xmm4
|
||||
vpxor %xmm0,%xmm1,%xmm1
|
||||
vpxor %xmm4,%xmm0,%xmm0
|
||||
vpsrlq $5,%xmm4,%xmm4
|
||||
vpxor %xmm4,%xmm0,%xmm0
|
||||
vpsrlq $1,%xmm0,%xmm0
|
||||
vpxor %xmm1,%xmm0,%xmm0
|
||||
vpshufd $78,%xmm5,%xmm3
|
||||
vpshufd $78,%xmm0,%xmm4
|
||||
vpxor %xmm5,%xmm3,%xmm3
|
||||
vmovdqu %xmm5,0(%rdi)
|
||||
vpxor %xmm0,%xmm4,%xmm4
|
||||
vmovdqu %xmm0,16(%rdi)
|
||||
leaq 48(%rdi),%rdi
|
||||
subq $1,%r10
|
||||
jnz .Linit_loop_avx
|
||||
|
||||
vpalignr $8,%xmm4,%xmm3,%xmm5
|
||||
vmovdqu %xmm5,-16(%rdi)
|
||||
|
||||
vzeroupper
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size gcm_init_avx,.-gcm_init_avx
|
||||
.globl gcm_gmult_avx
|
||||
@@ -1320,7 +1421,377 @@ gcm_gmult_avx:
|
||||
.align 32
|
||||
gcm_ghash_avx:
|
||||
.cfi_startproc
|
||||
jmp .L_ghash_clmul
|
||||
vzeroupper
|
||||
|
||||
vmovdqu (%rdi),%xmm10
|
||||
leaq .L0x1c2_polynomial(%rip),%r10
|
||||
leaq 64(%rsi),%rsi
|
||||
vmovdqu .Lbswap_mask(%rip),%xmm13
|
||||
vpshufb %xmm13,%xmm10,%xmm10
|
||||
cmpq $0x80,%rcx
|
||||
jb .Lshort_avx
|
||||
subq $0x80,%rcx
|
||||
|
||||
vmovdqu 112(%rdx),%xmm14
|
||||
vmovdqu 0-64(%rsi),%xmm6
|
||||
vpshufb %xmm13,%xmm14,%xmm14
|
||||
vmovdqu 32-64(%rsi),%xmm7
|
||||
|
||||
vpunpckhqdq %xmm14,%xmm14,%xmm9
|
||||
vmovdqu 96(%rdx),%xmm15
|
||||
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
|
||||
vpxor %xmm14,%xmm9,%xmm9
|
||||
vpshufb %xmm13,%xmm15,%xmm15
|
||||
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
|
||||
vmovdqu 16-64(%rsi),%xmm6
|
||||
vpunpckhqdq %xmm15,%xmm15,%xmm8
|
||||
vmovdqu 80(%rdx),%xmm14
|
||||
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
|
||||
vpxor %xmm15,%xmm8,%xmm8
|
||||
|
||||
vpshufb %xmm13,%xmm14,%xmm14
|
||||
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
|
||||
vpunpckhqdq %xmm14,%xmm14,%xmm9
|
||||
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
|
||||
vmovdqu 48-64(%rsi),%xmm6
|
||||
vpxor %xmm14,%xmm9,%xmm9
|
||||
vmovdqu 64(%rdx),%xmm15
|
||||
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
|
||||
vmovdqu 80-64(%rsi),%xmm7
|
||||
|
||||
vpshufb %xmm13,%xmm15,%xmm15
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
|
||||
vpxor %xmm1,%xmm4,%xmm4
|
||||
vpunpckhqdq %xmm15,%xmm15,%xmm8
|
||||
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
|
||||
vmovdqu 64-64(%rsi),%xmm6
|
||||
vpxor %xmm2,%xmm5,%xmm5
|
||||
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
|
||||
vpxor %xmm15,%xmm8,%xmm8
|
||||
|
||||
vmovdqu 48(%rdx),%xmm14
|
||||
vpxor %xmm3,%xmm0,%xmm0
|
||||
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
|
||||
vpxor %xmm4,%xmm1,%xmm1
|
||||
vpshufb %xmm13,%xmm14,%xmm14
|
||||
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
|
||||
vmovdqu 96-64(%rsi),%xmm6
|
||||
vpxor %xmm5,%xmm2,%xmm2
|
||||
vpunpckhqdq %xmm14,%xmm14,%xmm9
|
||||
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
|
||||
vmovdqu 128-64(%rsi),%xmm7
|
||||
vpxor %xmm14,%xmm9,%xmm9
|
||||
|
||||
vmovdqu 32(%rdx),%xmm15
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
|
||||
vpxor %xmm1,%xmm4,%xmm4
|
||||
vpshufb %xmm13,%xmm15,%xmm15
|
||||
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
|
||||
vmovdqu 112-64(%rsi),%xmm6
|
||||
vpxor %xmm2,%xmm5,%xmm5
|
||||
vpunpckhqdq %xmm15,%xmm15,%xmm8
|
||||
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
|
||||
vpxor %xmm15,%xmm8,%xmm8
|
||||
|
||||
vmovdqu 16(%rdx),%xmm14
|
||||
vpxor %xmm3,%xmm0,%xmm0
|
||||
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
|
||||
vpxor %xmm4,%xmm1,%xmm1
|
||||
vpshufb %xmm13,%xmm14,%xmm14
|
||||
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
|
||||
vmovdqu 144-64(%rsi),%xmm6
|
||||
vpxor %xmm5,%xmm2,%xmm2
|
||||
vpunpckhqdq %xmm14,%xmm14,%xmm9
|
||||
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
|
||||
vmovdqu 176-64(%rsi),%xmm7
|
||||
vpxor %xmm14,%xmm9,%xmm9
|
||||
|
||||
vmovdqu (%rdx),%xmm15
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
|
||||
vpxor %xmm1,%xmm4,%xmm4
|
||||
vpshufb %xmm13,%xmm15,%xmm15
|
||||
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
|
||||
vmovdqu 160-64(%rsi),%xmm6
|
||||
vpxor %xmm2,%xmm5,%xmm5
|
||||
vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
|
||||
|
||||
leaq 128(%rdx),%rdx
|
||||
cmpq $0x80,%rcx
|
||||
jb .Ltail_avx
|
||||
|
||||
vpxor %xmm10,%xmm15,%xmm15
|
||||
subq $0x80,%rcx
|
||||
jmp .Loop8x_avx
|
||||
|
||||
.align 32
|
||||
.Loop8x_avx:
|
||||
vpunpckhqdq %xmm15,%xmm15,%xmm8
|
||||
vmovdqu 112(%rdx),%xmm14
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vpxor %xmm15,%xmm8,%xmm8
|
||||
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
|
||||
vpshufb %xmm13,%xmm14,%xmm14
|
||||
vpxor %xmm1,%xmm4,%xmm4
|
||||
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
|
||||
vmovdqu 0-64(%rsi),%xmm6
|
||||
vpunpckhqdq %xmm14,%xmm14,%xmm9
|
||||
vpxor %xmm2,%xmm5,%xmm5
|
||||
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
|
||||
vmovdqu 32-64(%rsi),%xmm7
|
||||
vpxor %xmm14,%xmm9,%xmm9
|
||||
|
||||
vmovdqu 96(%rdx),%xmm15
|
||||
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
|
||||
vpxor %xmm3,%xmm10,%xmm10
|
||||
vpshufb %xmm13,%xmm15,%xmm15
|
||||
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
|
||||
vxorps %xmm4,%xmm11,%xmm11
|
||||
vmovdqu 16-64(%rsi),%xmm6
|
||||
vpunpckhqdq %xmm15,%xmm15,%xmm8
|
||||
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
|
||||
vpxor %xmm5,%xmm12,%xmm12
|
||||
vxorps %xmm15,%xmm8,%xmm8
|
||||
|
||||
vmovdqu 80(%rdx),%xmm14
|
||||
vpxor %xmm10,%xmm12,%xmm12
|
||||
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
|
||||
vpxor %xmm11,%xmm12,%xmm12
|
||||
vpslldq $8,%xmm12,%xmm9
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
|
||||
vpsrldq $8,%xmm12,%xmm12
|
||||
vpxor %xmm9,%xmm10,%xmm10
|
||||
vmovdqu 48-64(%rsi),%xmm6
|
||||
vpshufb %xmm13,%xmm14,%xmm14
|
||||
vxorps %xmm12,%xmm11,%xmm11
|
||||
vpxor %xmm1,%xmm4,%xmm4
|
||||
vpunpckhqdq %xmm14,%xmm14,%xmm9
|
||||
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
|
||||
vmovdqu 80-64(%rsi),%xmm7
|
||||
vpxor %xmm14,%xmm9,%xmm9
|
||||
vpxor %xmm2,%xmm5,%xmm5
|
||||
|
||||
vmovdqu 64(%rdx),%xmm15
|
||||
vpalignr $8,%xmm10,%xmm10,%xmm12
|
||||
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
|
||||
vpshufb %xmm13,%xmm15,%xmm15
|
||||
vpxor %xmm3,%xmm0,%xmm0
|
||||
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
|
||||
vmovdqu 64-64(%rsi),%xmm6
|
||||
vpunpckhqdq %xmm15,%xmm15,%xmm8
|
||||
vpxor %xmm4,%xmm1,%xmm1
|
||||
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
|
||||
vxorps %xmm15,%xmm8,%xmm8
|
||||
vpxor %xmm5,%xmm2,%xmm2
|
||||
|
||||
vmovdqu 48(%rdx),%xmm14
|
||||
vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
|
||||
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
|
||||
vpshufb %xmm13,%xmm14,%xmm14
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
|
||||
vmovdqu 96-64(%rsi),%xmm6
|
||||
vpunpckhqdq %xmm14,%xmm14,%xmm9
|
||||
vpxor %xmm1,%xmm4,%xmm4
|
||||
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
|
||||
vmovdqu 128-64(%rsi),%xmm7
|
||||
vpxor %xmm14,%xmm9,%xmm9
|
||||
vpxor %xmm2,%xmm5,%xmm5
|
||||
|
||||
vmovdqu 32(%rdx),%xmm15
|
||||
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
|
||||
vpshufb %xmm13,%xmm15,%xmm15
|
||||
vpxor %xmm3,%xmm0,%xmm0
|
||||
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
|
||||
vmovdqu 112-64(%rsi),%xmm6
|
||||
vpunpckhqdq %xmm15,%xmm15,%xmm8
|
||||
vpxor %xmm4,%xmm1,%xmm1
|
||||
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
|
||||
vpxor %xmm15,%xmm8,%xmm8
|
||||
vpxor %xmm5,%xmm2,%xmm2
|
||||
vxorps %xmm12,%xmm10,%xmm10
|
||||
|
||||
vmovdqu 16(%rdx),%xmm14
|
||||
vpalignr $8,%xmm10,%xmm10,%xmm12
|
||||
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
|
||||
vpshufb %xmm13,%xmm14,%xmm14
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
|
||||
vmovdqu 144-64(%rsi),%xmm6
|
||||
vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
|
||||
vxorps %xmm11,%xmm12,%xmm12
|
||||
vpunpckhqdq %xmm14,%xmm14,%xmm9
|
||||
vpxor %xmm1,%xmm4,%xmm4
|
||||
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
|
||||
vmovdqu 176-64(%rsi),%xmm7
|
||||
vpxor %xmm14,%xmm9,%xmm9
|
||||
vpxor %xmm2,%xmm5,%xmm5
|
||||
|
||||
vmovdqu (%rdx),%xmm15
|
||||
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
|
||||
vpshufb %xmm13,%xmm15,%xmm15
|
||||
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
|
||||
vmovdqu 160-64(%rsi),%xmm6
|
||||
vpxor %xmm12,%xmm15,%xmm15
|
||||
vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
|
||||
vpxor %xmm10,%xmm15,%xmm15
|
||||
|
||||
leaq 128(%rdx),%rdx
|
||||
subq $0x80,%rcx
|
||||
jnc .Loop8x_avx
|
||||
|
||||
addq $0x80,%rcx
|
||||
jmp .Ltail_no_xor_avx
|
||||
|
||||
.align 32
|
||||
.Lshort_avx:
|
||||
vmovdqu -16(%rdx,%rcx,1),%xmm14
|
||||
leaq (%rdx,%rcx,1),%rdx
|
||||
vmovdqu 0-64(%rsi),%xmm6
|
||||
vmovdqu 32-64(%rsi),%xmm7
|
||||
vpshufb %xmm13,%xmm14,%xmm15
|
||||
|
||||
vmovdqa %xmm0,%xmm3
|
||||
vmovdqa %xmm1,%xmm4
|
||||
vmovdqa %xmm2,%xmm5
|
||||
subq $0x10,%rcx
|
||||
jz .Ltail_avx
|
||||
|
||||
vpunpckhqdq %xmm15,%xmm15,%xmm8
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
|
||||
vpxor %xmm15,%xmm8,%xmm8
|
||||
vmovdqu -32(%rdx),%xmm14
|
||||
vpxor %xmm1,%xmm4,%xmm4
|
||||
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
|
||||
vmovdqu 16-64(%rsi),%xmm6
|
||||
vpshufb %xmm13,%xmm14,%xmm15
|
||||
vpxor %xmm2,%xmm5,%xmm5
|
||||
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
|
||||
vpsrldq $8,%xmm7,%xmm7
|
||||
subq $0x10,%rcx
|
||||
jz .Ltail_avx
|
||||
|
||||
vpunpckhqdq %xmm15,%xmm15,%xmm8
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
|
||||
vpxor %xmm15,%xmm8,%xmm8
|
||||
vmovdqu -48(%rdx),%xmm14
|
||||
vpxor %xmm1,%xmm4,%xmm4
|
||||
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
|
||||
vmovdqu 48-64(%rsi),%xmm6
|
||||
vpshufb %xmm13,%xmm14,%xmm15
|
||||
vpxor %xmm2,%xmm5,%xmm5
|
||||
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
|
||||
vmovdqu 80-64(%rsi),%xmm7
|
||||
subq $0x10,%rcx
|
||||
jz .Ltail_avx
|
||||
|
||||
vpunpckhqdq %xmm15,%xmm15,%xmm8
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
|
||||
vpxor %xmm15,%xmm8,%xmm8
|
||||
vmovdqu -64(%rdx),%xmm14
|
||||
vpxor %xmm1,%xmm4,%xmm4
|
||||
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
|
||||
vmovdqu 64-64(%rsi),%xmm6
|
||||
vpshufb %xmm13,%xmm14,%xmm15
|
||||
vpxor %xmm2,%xmm5,%xmm5
|
||||
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
|
||||
vpsrldq $8,%xmm7,%xmm7
|
||||
subq $0x10,%rcx
|
||||
jz .Ltail_avx
|
||||
|
||||
vpunpckhqdq %xmm15,%xmm15,%xmm8
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
|
||||
vpxor %xmm15,%xmm8,%xmm8
|
||||
vmovdqu -80(%rdx),%xmm14
|
||||
vpxor %xmm1,%xmm4,%xmm4
|
||||
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
|
||||
vmovdqu 96-64(%rsi),%xmm6
|
||||
vpshufb %xmm13,%xmm14,%xmm15
|
||||
vpxor %xmm2,%xmm5,%xmm5
|
||||
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
|
||||
vmovdqu 128-64(%rsi),%xmm7
|
||||
subq $0x10,%rcx
|
||||
jz .Ltail_avx
|
||||
|
||||
vpunpckhqdq %xmm15,%xmm15,%xmm8
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
|
||||
vpxor %xmm15,%xmm8,%xmm8
|
||||
vmovdqu -96(%rdx),%xmm14
|
||||
vpxor %xmm1,%xmm4,%xmm4
|
||||
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
|
||||
vmovdqu 112-64(%rsi),%xmm6
|
||||
vpshufb %xmm13,%xmm14,%xmm15
|
||||
vpxor %xmm2,%xmm5,%xmm5
|
||||
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
|
||||
vpsrldq $8,%xmm7,%xmm7
|
||||
subq $0x10,%rcx
|
||||
jz .Ltail_avx
|
||||
|
||||
vpunpckhqdq %xmm15,%xmm15,%xmm8
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
|
||||
vpxor %xmm15,%xmm8,%xmm8
|
||||
vmovdqu -112(%rdx),%xmm14
|
||||
vpxor %xmm1,%xmm4,%xmm4
|
||||
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
|
||||
vmovdqu 144-64(%rsi),%xmm6
|
||||
vpshufb %xmm13,%xmm14,%xmm15
|
||||
vpxor %xmm2,%xmm5,%xmm5
|
||||
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
|
||||
vmovq 184-64(%rsi),%xmm7
|
||||
subq $0x10,%rcx
|
||||
jmp .Ltail_avx
|
||||
|
||||
.align 32
|
||||
.Ltail_avx:
|
||||
vpxor %xmm10,%xmm15,%xmm15
|
||||
.Ltail_no_xor_avx:
|
||||
vpunpckhqdq %xmm15,%xmm15,%xmm8
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
|
||||
vpxor %xmm15,%xmm8,%xmm8
|
||||
vpxor %xmm1,%xmm4,%xmm4
|
||||
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
|
||||
vpxor %xmm2,%xmm5,%xmm5
|
||||
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
|
||||
|
||||
vmovdqu (%r10),%xmm12
|
||||
|
||||
vpxor %xmm0,%xmm3,%xmm10
|
||||
vpxor %xmm1,%xmm4,%xmm11
|
||||
vpxor %xmm2,%xmm5,%xmm5
|
||||
|
||||
vpxor %xmm10,%xmm5,%xmm5
|
||||
vpxor %xmm11,%xmm5,%xmm5
|
||||
vpslldq $8,%xmm5,%xmm9
|
||||
vpsrldq $8,%xmm5,%xmm5
|
||||
vpxor %xmm9,%xmm10,%xmm10
|
||||
vpxor %xmm5,%xmm11,%xmm11
|
||||
|
||||
vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
|
||||
vpalignr $8,%xmm10,%xmm10,%xmm10
|
||||
vpxor %xmm9,%xmm10,%xmm10
|
||||
|
||||
vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
|
||||
vpalignr $8,%xmm10,%xmm10,%xmm10
|
||||
vpxor %xmm11,%xmm10,%xmm10
|
||||
vpxor %xmm9,%xmm10,%xmm10
|
||||
|
||||
cmpq $0,%rcx
|
||||
jne .Lshort_avx
|
||||
|
||||
vpshufb %xmm13,%xmm10,%xmm10
|
||||
vmovdqu %xmm10,(%rdi)
|
||||
vzeroupper
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size gcm_ghash_avx,.-gcm_ghash_avx
|
||||
.align 64
@@ -35,6 +35,10 @@ rsaz_512_sqr:
|
||||
movq (%rsi),%rdx
|
||||
movq 8(%rsi),%rax
|
||||
movq %rcx,128(%rsp)
|
||||
movl $0x80100,%r11d
|
||||
andl OPENSSL_ia32cap_P+8(%rip),%r11d
|
||||
cmpl $0x80100,%r11d
|
||||
je .Loop_sqrx
|
||||
jmp .Loop_sqr
|
||||
|
||||
.align 32
|
||||
@@ -405,6 +409,282 @@ rsaz_512_sqr:
|
||||
|
||||
decl %r8d
|
||||
jnz .Loop_sqr
|
||||
jmp .Lsqr_tail
|
||||
|
||||
.align 32
|
||||
.Loop_sqrx:
|
||||
movl %r8d,128+8(%rsp)
|
||||
.byte 102,72,15,110,199
|
||||
|
||||
mulxq %rax,%r8,%r9
|
||||
movq %rax,%rbx
|
||||
|
||||
mulxq 16(%rsi),%rcx,%r10
|
||||
xorq %rbp,%rbp
|
||||
|
||||
mulxq 24(%rsi),%rax,%r11
|
||||
adcxq %rcx,%r9
|
||||
|
||||
.byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00
|
||||
adcxq %rax,%r10
|
||||
|
||||
.byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00
|
||||
adcxq %rcx,%r11
|
||||
|
||||
mulxq 48(%rsi),%rcx,%r14
|
||||
adcxq %rax,%r12
|
||||
adcxq %rcx,%r13
|
||||
|
||||
mulxq 56(%rsi),%rax,%r15
|
||||
adcxq %rax,%r14
|
||||
adcxq %rbp,%r15
|
||||
|
||||
mulxq %rdx,%rax,%rdi
|
||||
movq %rbx,%rdx
|
||||
xorq %rcx,%rcx
|
||||
adoxq %r8,%r8
|
||||
adcxq %rdi,%r8
|
||||
adoxq %rbp,%rcx
|
||||
adcxq %rbp,%rcx
|
||||
|
||||
movq %rax,(%rsp)
|
||||
movq %r8,8(%rsp)
|
||||
|
||||
|
||||
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00
|
||||
adoxq %rax,%r10
|
||||
adcxq %rbx,%r11
|
||||
|
||||
mulxq 24(%rsi),%rdi,%r8
|
||||
adoxq %rdi,%r11
|
||||
.byte 0x66
|
||||
adcxq %r8,%r12
|
||||
|
||||
mulxq 32(%rsi),%rax,%rbx
|
||||
adoxq %rax,%r12
|
||||
adcxq %rbx,%r13
|
||||
|
||||
mulxq 40(%rsi),%rdi,%r8
|
||||
adoxq %rdi,%r13
|
||||
adcxq %r8,%r14
|
||||
|
||||
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
|
||||
adoxq %rax,%r14
|
||||
adcxq %rbx,%r15
|
||||
|
||||
.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00
|
||||
adoxq %rdi,%r15
|
||||
adcxq %rbp,%r8
|
||||
mulxq %rdx,%rax,%rdi
|
||||
adoxq %rbp,%r8
|
||||
.byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00
|
||||
|
||||
xorq %rbx,%rbx
|
||||
adoxq %r9,%r9
|
||||
|
||||
adcxq %rcx,%rax
|
||||
adoxq %r10,%r10
|
||||
adcxq %rax,%r9
|
||||
adoxq %rbp,%rbx
|
||||
adcxq %rdi,%r10
|
||||
adcxq %rbp,%rbx
|
||||
|
||||
movq %r9,16(%rsp)
|
||||
.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00
|
||||
|
||||
|
||||
mulxq 24(%rsi),%rdi,%r9
|
||||
adoxq %rdi,%r12
|
||||
adcxq %r9,%r13
|
||||
|
||||
mulxq 32(%rsi),%rax,%rcx
|
||||
adoxq %rax,%r13
|
||||
adcxq %rcx,%r14
|
||||
|
||||
.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00
|
||||
adoxq %rdi,%r14
|
||||
adcxq %r9,%r15
|
||||
|
||||
.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00
|
||||
adoxq %rax,%r15
|
||||
adcxq %rcx,%r8
|
||||
|
||||
mulxq 56(%rsi),%rdi,%r9
|
||||
adoxq %rdi,%r8
|
||||
adcxq %rbp,%r9
|
||||
mulxq %rdx,%rax,%rdi
|
||||
adoxq %rbp,%r9
|
||||
movq 24(%rsi),%rdx
|
||||
|
||||
xorq %rcx,%rcx
|
||||
adoxq %r11,%r11
|
||||
|
||||
adcxq %rbx,%rax
|
||||
adoxq %r12,%r12
|
||||
adcxq %rax,%r11
|
||||
adoxq %rbp,%rcx
|
||||
adcxq %rdi,%r12
|
||||
adcxq %rbp,%rcx
|
||||
|
||||
movq %r11,32(%rsp)
|
||||
movq %r12,40(%rsp)
|
||||
|
||||
|
||||
mulxq 32(%rsi),%rax,%rbx
|
||||
adoxq %rax,%r14
|
||||
adcxq %rbx,%r15
|
||||
|
||||
mulxq 40(%rsi),%rdi,%r10
|
||||
adoxq %rdi,%r15
|
||||
adcxq %r10,%r8
|
||||
|
||||
mulxq 48(%rsi),%rax,%rbx
|
||||
adoxq %rax,%r8
|
||||
adcxq %rbx,%r9
|
||||
|
||||
mulxq 56(%rsi),%rdi,%r10
|
||||
adoxq %rdi,%r9
|
||||
adcxq %rbp,%r10
|
||||
mulxq %rdx,%rax,%rdi
|
||||
adoxq %rbp,%r10
|
||||
movq 32(%rsi),%rdx
|
||||
|
||||
xorq %rbx,%rbx
|
||||
adoxq %r13,%r13
|
||||
|
||||
adcxq %rcx,%rax
|
||||
adoxq %r14,%r14
|
||||
adcxq %rax,%r13
|
||||
adoxq %rbp,%rbx
|
||||
adcxq %rdi,%r14
|
||||
adcxq %rbp,%rbx
|
||||
|
||||
movq %r13,48(%rsp)
|
||||
movq %r14,56(%rsp)
|
||||
|
||||
|
||||
mulxq 40(%rsi),%rdi,%r11
|
||||
adoxq %rdi,%r8
|
||||
adcxq %r11,%r9
|
||||
|
||||
mulxq 48(%rsi),%rax,%rcx
|
||||
adoxq %rax,%r9
|
||||
adcxq %rcx,%r10
|
||||
|
||||
mulxq 56(%rsi),%rdi,%r11
|
||||
adoxq %rdi,%r10
|
||||
adcxq %rbp,%r11
|
||||
mulxq %rdx,%rax,%rdi
|
||||
movq 40(%rsi),%rdx
|
||||
adoxq %rbp,%r11
|
||||
|
||||
xorq %rcx,%rcx
|
||||
adoxq %r15,%r15
|
||||
|
||||
adcxq %rbx,%rax
|
||||
adoxq %r8,%r8
|
||||
adcxq %rax,%r15
|
||||
adoxq %rbp,%rcx
|
||||
adcxq %rdi,%r8
|
||||
adcxq %rbp,%rcx
|
||||
|
||||
movq %r15,64(%rsp)
|
||||
movq %r8,72(%rsp)
|
||||
|
||||
|
||||
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
|
||||
adoxq %rax,%r10
|
||||
adcxq %rbx,%r11
|
||||
|
||||
.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00
|
||||
adoxq %rdi,%r11
|
||||
adcxq %rbp,%r12
|
||||
mulxq %rdx,%rax,%rdi
|
||||
adoxq %rbp,%r12
|
||||
movq 48(%rsi),%rdx
|
||||
|
||||
xorq %rbx,%rbx
|
||||
adoxq %r9,%r9
|
||||
|
||||
adcxq %rcx,%rax
|
||||
adoxq %r10,%r10
|
||||
adcxq %rax,%r9
|
||||
adcxq %rdi,%r10
|
||||
adoxq %rbp,%rbx
|
||||
adcxq %rbp,%rbx
|
||||
|
||||
movq %r9,80(%rsp)
|
||||
movq %r10,88(%rsp)
|
||||
|
||||
|
||||
.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00
|
||||
adoxq %rax,%r12
|
||||
adoxq %rbp,%r13
|
||||
|
||||
mulxq %rdx,%rax,%rdi
|
||||
xorq %rcx,%rcx
|
||||
movq 56(%rsi),%rdx
|
||||
adoxq %r11,%r11
|
||||
|
||||
adcxq %rbx,%rax
|
||||
adoxq %r12,%r12
|
||||
adcxq %rax,%r11
|
||||
adoxq %rbp,%rcx
|
||||
adcxq %rdi,%r12
|
||||
adcxq %rbp,%rcx
|
||||
|
||||
.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00
|
||||
.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00
|
||||
|
||||
|
||||
mulxq %rdx,%rax,%rdx
|
||||
xorq %rbx,%rbx
|
||||
adoxq %r13,%r13
|
||||
|
||||
adcxq %rcx,%rax
|
||||
adoxq %rbp,%rbx
|
||||
adcxq %r13,%rax
|
||||
adcxq %rdx,%rbx
|
||||
|
||||
.byte 102,72,15,126,199
|
||||
.byte 102,72,15,126,205
|
||||
|
||||
movq 128(%rsp),%rdx
|
||||
movq (%rsp),%r8
|
||||
movq 8(%rsp),%r9
|
||||
movq 16(%rsp),%r10
|
||||
movq 24(%rsp),%r11
|
||||
movq 32(%rsp),%r12
|
||||
movq 40(%rsp),%r13
|
||||
movq 48(%rsp),%r14
|
||||
movq 56(%rsp),%r15
|
||||
|
||||
movq %rax,112(%rsp)
|
||||
movq %rbx,120(%rsp)
|
||||
|
||||
call __rsaz_512_reducex
|
||||
|
||||
addq 64(%rsp),%r8
|
||||
adcq 72(%rsp),%r9
|
||||
adcq 80(%rsp),%r10
|
||||
adcq 88(%rsp),%r11
|
||||
adcq 96(%rsp),%r12
|
||||
adcq 104(%rsp),%r13
|
||||
adcq 112(%rsp),%r14
|
||||
adcq 120(%rsp),%r15
|
||||
sbbq %rcx,%rcx
|
||||
|
||||
call __rsaz_512_subtract
|
||||
|
||||
movq %r8,%rdx
|
||||
movq %r9,%rax
|
||||
movl 128+8(%rsp),%r8d
|
||||
movq %rdi,%rsi
|
||||
|
||||
decl %r8d
|
||||
jnz .Loop_sqrx
|
||||
|
||||
.Lsqr_tail:
|
||||
|
||||
leaq 128+24+48(%rsp),%rax
|
||||
.cfi_def_cfa %rax,8
|
||||
@@ -456,6 +736,10 @@ rsaz_512_mul:
|
||||
.byte 102,72,15,110,199
|
||||
.byte 102,72,15,110,201
|
||||
movq %r8,128(%rsp)
|
||||
movl $0x80100,%r11d
|
||||
andl OPENSSL_ia32cap_P+8(%rip),%r11d
|
||||
cmpl $0x80100,%r11d
|
||||
je .Lmulx
|
||||
movq (%rdx),%rbx
|
||||
movq %rdx,%rbp
|
||||
call __rsaz_512_mul
|
||||
@@ -473,6 +757,29 @@ rsaz_512_mul:
|
||||
movq 56(%rsp),%r15
|
||||
|
||||
call __rsaz_512_reduce
|
||||
jmp .Lmul_tail
|
||||
|
||||
.align 32
|
||||
.Lmulx:
|
||||
movq %rdx,%rbp
|
||||
movq (%rdx),%rdx
|
||||
call __rsaz_512_mulx
|
||||
|
||||
.byte 102,72,15,126,199
|
||||
.byte 102,72,15,126,205
|
||||
|
||||
movq 128(%rsp),%rdx
|
||||
movq (%rsp),%r8
|
||||
movq 8(%rsp),%r9
|
||||
movq 16(%rsp),%r10
|
||||
movq 24(%rsp),%r11
|
||||
movq 32(%rsp),%r12
|
||||
movq 40(%rsp),%r13
|
||||
movq 48(%rsp),%r14
|
||||
movq 56(%rsp),%r15
|
||||
|
||||
call __rsaz_512_reducex
|
||||
.Lmul_tail:
|
||||
addq 64(%rsp),%r8
|
||||
adcq 72(%rsp),%r9
|
||||
adcq 80(%rsp),%r10
|
||||
@@ -586,6 +893,10 @@ rsaz_512_mul_gather4:
|
||||
por %xmm9,%xmm8
|
||||
pshufd $0x4e,%xmm8,%xmm9
|
||||
por %xmm9,%xmm8
|
||||
movl $0x80100,%r11d
|
||||
andl OPENSSL_ia32cap_P+8(%rip),%r11d
|
||||
cmpl $0x80100,%r11d
|
||||
je .Lmulx_gather
|
||||
.byte 102,76,15,126,195
|
||||
|
||||
movq %r8,128(%rsp)
|
||||
@@ -766,6 +1077,142 @@ rsaz_512_mul_gather4:
|
||||
movq 56(%rsp),%r15
|
||||
|
||||
call __rsaz_512_reduce
|
||||
jmp .Lmul_gather_tail
|
||||
|
||||
.align 32
|
||||
.Lmulx_gather:
|
||||
.byte 102,76,15,126,194
|
||||
|
||||
movq %r8,128(%rsp)
|
||||
movq %rdi,128+8(%rsp)
|
||||
movq %rcx,128+16(%rsp)
|
||||
|
||||
mulxq (%rsi),%rbx,%r8
|
||||
movq %rbx,(%rsp)
|
||||
xorl %edi,%edi
|
||||
|
||||
mulxq 8(%rsi),%rax,%r9
|
||||
|
||||
mulxq 16(%rsi),%rbx,%r10
|
||||
adcxq %rax,%r8
|
||||
|
||||
mulxq 24(%rsi),%rax,%r11
|
||||
adcxq %rbx,%r9
|
||||
|
||||
mulxq 32(%rsi),%rbx,%r12
|
||||
adcxq %rax,%r10
|
||||
|
||||
mulxq 40(%rsi),%rax,%r13
|
||||
adcxq %rbx,%r11
|
||||
|
||||
mulxq 48(%rsi),%rbx,%r14
|
||||
adcxq %rax,%r12
|
||||
|
||||
mulxq 56(%rsi),%rax,%r15
|
||||
adcxq %rbx,%r13
|
||||
adcxq %rax,%r14
|
||||
.byte 0x67
|
||||
movq %r8,%rbx
|
||||
adcxq %rdi,%r15
|
||||
|
||||
movq $-7,%rcx
|
||||
jmp .Loop_mulx_gather
|
||||
|
||||
.align 32
|
||||
.Loop_mulx_gather:
|
||||
movdqa 0(%rbp),%xmm8
|
||||
movdqa 16(%rbp),%xmm9
|
||||
movdqa 32(%rbp),%xmm10
|
||||
movdqa 48(%rbp),%xmm11
|
||||
pand %xmm0,%xmm8
|
||||
movdqa 64(%rbp),%xmm12
|
||||
pand %xmm1,%xmm9
|
||||
movdqa 80(%rbp),%xmm13
|
||||
pand %xmm2,%xmm10
|
||||
movdqa 96(%rbp),%xmm14
|
||||
pand %xmm3,%xmm11
|
||||
movdqa 112(%rbp),%xmm15
|
||||
leaq 128(%rbp),%rbp
|
||||
pand %xmm4,%xmm12
|
||||
pand %xmm5,%xmm13
|
||||
pand %xmm6,%xmm14
|
||||
pand %xmm7,%xmm15
|
||||
por %xmm10,%xmm8
|
||||
por %xmm11,%xmm9
|
||||
por %xmm12,%xmm8
|
||||
por %xmm13,%xmm9
|
||||
por %xmm14,%xmm8
|
||||
por %xmm15,%xmm9
|
||||
|
||||
por %xmm9,%xmm8
|
||||
pshufd $0x4e,%xmm8,%xmm9
|
||||
por %xmm9,%xmm8
|
||||
.byte 102,76,15,126,194
|
||||
|
||||
.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00
|
||||
adcxq %rax,%rbx
|
||||
adoxq %r9,%r8
|
||||
|
||||
mulxq 8(%rsi),%rax,%r9
|
||||
adcxq %rax,%r8
|
||||
adoxq %r10,%r9
|
||||
|
||||
mulxq 16(%rsi),%rax,%r10
|
||||
adcxq %rax,%r9
|
||||
adoxq %r11,%r10
|
||||
|
||||
.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00
|
||||
adcxq %rax,%r10
|
||||
adoxq %r12,%r11
|
||||
|
||||
mulxq 32(%rsi),%rax,%r12
|
||||
adcxq %rax,%r11
|
||||
adoxq %r13,%r12
|
||||
|
||||
mulxq 40(%rsi),%rax,%r13
|
||||
adcxq %rax,%r12
|
||||
adoxq %r14,%r13
|
||||
|
||||
.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
|
||||
adcxq %rax,%r13
|
||||
.byte 0x67
|
||||
adoxq %r15,%r14
|
||||
|
||||
mulxq 56(%rsi),%rax,%r15
|
||||
movq %rbx,64(%rsp,%rcx,8)
|
||||
adcxq %rax,%r14
|
||||
adoxq %rdi,%r15
|
||||
movq %r8,%rbx
|
||||
adcxq %rdi,%r15
|
||||
|
||||
incq %rcx
|
||||
jnz .Loop_mulx_gather
|
||||
|
||||
movq %r8,64(%rsp)
|
||||
movq %r9,64+8(%rsp)
|
||||
movq %r10,64+16(%rsp)
|
||||
movq %r11,64+24(%rsp)
|
||||
movq %r12,64+32(%rsp)
|
||||
movq %r13,64+40(%rsp)
|
||||
movq %r14,64+48(%rsp)
|
||||
movq %r15,64+56(%rsp)
|
||||
|
||||
movq 128(%rsp),%rdx
|
||||
movq 128+8(%rsp),%rdi
|
||||
movq 128+16(%rsp),%rbp
|
||||
|
||||
movq (%rsp),%r8
|
||||
movq 8(%rsp),%r9
|
||||
movq 16(%rsp),%r10
|
||||
movq 24(%rsp),%r11
|
||||
movq 32(%rsp),%r12
|
||||
movq 40(%rsp),%r13
|
||||
movq 48(%rsp),%r14
|
||||
movq 56(%rsp),%r15
|
||||
|
||||
call __rsaz_512_reducex
|
||||
|
||||
.Lmul_gather_tail:
|
||||
addq 64(%rsp),%r8
|
||||
adcq 72(%rsp),%r9
|
||||
adcq 80(%rsp),%r10
|
||||
@@ -833,6 +1280,10 @@ rsaz_512_mul_scatter4:
|
||||
movq %rcx,128(%rsp)
|
||||
|
||||
movq %rdi,%rbp
|
||||
movl $0x80100,%r11d
|
||||
andl OPENSSL_ia32cap_P+8(%rip),%r11d
|
||||
cmpl $0x80100,%r11d
|
||||
je .Lmulx_scatter
|
||||
movq (%rdi),%rbx
|
||||
call __rsaz_512_mul
|
||||
|
||||
@@ -849,6 +1300,29 @@ rsaz_512_mul_scatter4:
|
||||
movq 56(%rsp),%r15
|
||||
|
||||
call __rsaz_512_reduce
|
||||
jmp .Lmul_scatter_tail
|
||||
|
||||
.align 32
|
||||
.Lmulx_scatter:
|
||||
movq (%rdi),%rdx
|
||||
call __rsaz_512_mulx
|
||||
|
||||
.byte 102,72,15,126,199
|
||||
.byte 102,72,15,126,205
|
||||
|
||||
movq 128(%rsp),%rdx
|
||||
movq (%rsp),%r8
|
||||
movq 8(%rsp),%r9
|
||||
movq 16(%rsp),%r10
|
||||
movq 24(%rsp),%r11
|
||||
movq 32(%rsp),%r12
|
||||
movq 40(%rsp),%r13
|
||||
movq 48(%rsp),%r14
|
||||
movq 56(%rsp),%r15
|
||||
|
||||
call __rsaz_512_reducex
|
||||
|
||||
.Lmul_scatter_tail:
|
||||
addq 64(%rsp),%r8
|
||||
adcq 72(%rsp),%r9
|
||||
adcq 80(%rsp),%r10
|
||||
@@ -918,6 +1392,7 @@ rsaz_512_mul_by_one:
|
||||
subq $128+24,%rsp
|
||||
.cfi_adjust_cfa_offset 128+24
|
||||
.Lmul_by_one_body:
|
||||
movl OPENSSL_ia32cap_P+8(%rip),%eax
|
||||
movq %rdx,%rbp
|
||||
movq %rcx,128(%rsp)
|
||||
|
||||
@@ -938,7 +1413,16 @@ rsaz_512_mul_by_one:
|
||||
movdqa %xmm0,64(%rsp)
|
||||
movdqa %xmm0,80(%rsp)
|
||||
movdqa %xmm0,96(%rsp)
|
||||
andl $0x80100,%eax
|
||||
cmpl $0x80100,%eax
|
||||
je .Lby_one_callx
|
||||
call __rsaz_512_reduce
|
||||
jmp .Lby_one_tail
|
||||
.align 32
|
||||
.Lby_one_callx:
|
||||
movq 128(%rsp),%rdx
|
||||
call __rsaz_512_reducex
|
||||
.Lby_one_tail:
|
||||
movq %r8,(%rdi)
|
||||
movq %r9,8(%rdi)
|
||||
movq %r10,16(%rdi)
|
||||
@@ -1053,6 +1537,64 @@ __rsaz_512_reduce:
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size __rsaz_512_reduce,.-__rsaz_512_reduce
|
||||
.type __rsaz_512_reducex,@function
|
||||
.align 32
|
||||
__rsaz_512_reducex:
|
||||
.cfi_startproc
|
||||
|
||||
imulq %r8,%rdx
|
||||
xorq %rsi,%rsi
|
||||
movl $8,%ecx
|
||||
jmp .Lreduction_loopx
|
||||
|
||||
.align 32
|
||||
.Lreduction_loopx:
|
||||
movq %r8,%rbx
|
||||
mulxq 0(%rbp),%rax,%r8
|
||||
adcxq %rbx,%rax
|
||||
adoxq %r9,%r8
|
||||
|
||||
mulxq 8(%rbp),%rax,%r9
|
||||
adcxq %rax,%r8
|
||||
adoxq %r10,%r9
|
||||
|
||||
mulxq 16(%rbp),%rbx,%r10
|
||||
adcxq %rbx,%r9
|
||||
adoxq %r11,%r10
|
||||
|
||||
mulxq 24(%rbp),%rbx,%r11
|
||||
adcxq %rbx,%r10
|
||||
adoxq %r12,%r11
|
||||
|
||||
.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
|
||||
movq %rdx,%rax
|
||||
movq %r8,%rdx
|
||||
adcxq %rbx,%r11
|
||||
adoxq %r13,%r12
|
||||
|
||||
mulxq 128+8(%rsp),%rbx,%rdx
|
||||
movq %rax,%rdx
|
||||
|
||||
mulxq 40(%rbp),%rax,%r13
|
||||
adcxq %rax,%r12
|
||||
adoxq %r14,%r13
|
||||
|
||||
.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00
|
||||
adcxq %rax,%r13
|
||||
adoxq %r15,%r14
|
||||
|
||||
mulxq 56(%rbp),%rax,%r15
|
||||
movq %rbx,%rdx
|
||||
adcxq %rax,%r14
|
||||
adoxq %rsi,%r15
|
||||
adcxq %rsi,%r15
|
||||
|
||||
decl %ecx
|
||||
jne .Lreduction_loopx
|
||||
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size __rsaz_512_reducex,.-__rsaz_512_reducex
|
||||
.type __rsaz_512_subtract,@function
|
||||
.align 32
|
||||
__rsaz_512_subtract:
|
||||
@@ -1256,6 +1798,128 @@ __rsaz_512_mul:
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size __rsaz_512_mul,.-__rsaz_512_mul
|
||||
.type __rsaz_512_mulx,@function
|
||||
.align 32
|
||||
__rsaz_512_mulx:
|
||||
.cfi_startproc
|
||||
mulxq (%rsi),%rbx,%r8
|
||||
movq $-6,%rcx
|
||||
|
||||
mulxq 8(%rsi),%rax,%r9
|
||||
movq %rbx,8(%rsp)
|
||||
|
||||
mulxq 16(%rsi),%rbx,%r10
|
||||
adcq %rax,%r8
|
||||
|
||||
mulxq 24(%rsi),%rax,%r11
|
||||
adcq %rbx,%r9
|
||||
|
||||
mulxq 32(%rsi),%rbx,%r12
|
||||
adcq %rax,%r10
|
||||
|
||||
mulxq 40(%rsi),%rax,%r13
|
||||
adcq %rbx,%r11
|
||||
|
||||
mulxq 48(%rsi),%rbx,%r14
|
||||
adcq %rax,%r12
|
||||
|
||||
mulxq 56(%rsi),%rax,%r15
|
||||
movq 8(%rbp),%rdx
|
||||
adcq %rbx,%r13
|
||||
adcq %rax,%r14
|
||||
adcq $0,%r15
|
||||
|
||||
xorq %rdi,%rdi
|
||||
jmp .Loop_mulx
|
||||
|
||||
.align 32
|
||||
.Loop_mulx:
|
||||
movq %r8,%rbx
|
||||
mulxq (%rsi),%rax,%r8
|
||||
adcxq %rax,%rbx
|
||||
adoxq %r9,%r8
|
||||
|
||||
mulxq 8(%rsi),%rax,%r9
|
||||
adcxq %rax,%r8
|
||||
adoxq %r10,%r9
|
||||
|
||||
mulxq 16(%rsi),%rax,%r10
|
||||
adcxq %rax,%r9
|
||||
adoxq %r11,%r10
|
||||
|
||||
mulxq 24(%rsi),%rax,%r11
|
||||
adcxq %rax,%r10
|
||||
adoxq %r12,%r11
|
||||
|
||||
.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00
|
||||
adcxq %rax,%r11
|
||||
adoxq %r13,%r12
|
||||
|
||||
mulxq 40(%rsi),%rax,%r13
|
||||
adcxq %rax,%r12
|
||||
adoxq %r14,%r13
|
||||
|
||||
mulxq 48(%rsi),%rax,%r14
|
||||
adcxq %rax,%r13
|
||||
adoxq %r15,%r14
|
||||
|
||||
mulxq 56(%rsi),%rax,%r15
|
||||
movq 64(%rbp,%rcx,8),%rdx
|
||||
movq %rbx,8+64-8(%rsp,%rcx,8)
|
||||
adcxq %rax,%r14
|
||||
adoxq %rdi,%r15
|
||||
adcxq %rdi,%r15
|
||||
|
||||
incq %rcx
|
||||
jnz .Loop_mulx
|
||||
|
||||
movq %r8,%rbx
|
||||
mulxq (%rsi),%rax,%r8
|
||||
adcxq %rax,%rbx
|
||||
adoxq %r9,%r8
|
||||
|
||||
.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00
|
||||
adcxq %rax,%r8
|
||||
adoxq %r10,%r9
|
||||
|
||||
.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00
|
||||
adcxq %rax,%r9
|
||||
adoxq %r11,%r10
|
||||
|
||||
mulxq 24(%rsi),%rax,%r11
|
||||
adcxq %rax,%r10
|
||||
adoxq %r12,%r11
|
||||
|
||||
mulxq 32(%rsi),%rax,%r12
|
||||
adcxq %rax,%r11
|
||||
adoxq %r13,%r12
|
||||
|
||||
mulxq 40(%rsi),%rax,%r13
|
||||
adcxq %rax,%r12
|
||||
adoxq %r14,%r13
|
||||
|
||||
.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
|
||||
adcxq %rax,%r13
|
||||
adoxq %r15,%r14
|
||||
|
||||
.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
|
||||
adcxq %rax,%r14
|
||||
adoxq %rdi,%r15
|
||||
adcxq %rdi,%r15
|
||||
|
||||
movq %rbx,8+64-8(%rsp)
|
||||
movq %r8,8+64(%rsp)
|
||||
movq %r9,8+64+8(%rsp)
|
||||
movq %r10,8+64+16(%rsp)
|
||||
movq %r11,8+64+24(%rsp)
|
||||
movq %r12,8+64+32(%rsp)
|
||||
movq %r13,8+64+40(%rsp)
|
||||
movq %r14,8+64+48(%rsp)
|
||||
movq %r15,8+64+56(%rsp)
|
||||
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size __rsaz_512_mulx,.-__rsaz_512_mulx
|
||||
.globl rsaz_512_scatter4
|
||||
.type rsaz_512_scatter4,@function
|
||||
.align 16
@@ -397,32 +397,408 @@ x25519_fe51_mul121666:
|
||||
.Lfe51_mul121666_epilogue:
|
||||
.cfi_endproc
|
||||
.size x25519_fe51_mul121666,.-x25519_fe51_mul121666
|
||||
|
||||
.globl x25519_fe64_eligible
|
||||
.type x25519_fe64_eligible,@function
|
||||
.align 32
|
||||
x25519_fe64_eligible:
|
||||
.cfi_startproc
|
||||
movl OPENSSL_ia32cap_P+8(%rip),%ecx
|
||||
xorl %eax,%eax
|
||||
andl $0x80100,%ecx
|
||||
cmpl $0x80100,%ecx
|
||||
cmovel %ecx,%eax
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size x25519_fe64_eligible,.-x25519_fe64_eligible
|
||||
|
||||
.globl x25519_fe64_mul
|
||||
.type x25519_fe64_mul,@function
|
||||
.globl x25519_fe64_sqr
|
||||
.globl x25519_fe64_mul121666
|
||||
.globl x25519_fe64_add
|
||||
.globl x25519_fe64_sub
|
||||
.globl x25519_fe64_tobytes
|
||||
.align 32
|
||||
x25519_fe64_mul:
|
||||
x25519_fe64_sqr:
|
||||
x25519_fe64_mul121666:
|
||||
x25519_fe64_add:
|
||||
x25519_fe64_sub:
|
||||
x25519_fe64_tobytes:
|
||||
.cfi_startproc
|
||||
.byte 0x0f,0x0b
|
||||
.byte 0xf3,0xc3
|
||||
pushq %rbp
|
||||
.cfi_adjust_cfa_offset 8
|
||||
.cfi_offset %rbp,-16
|
||||
pushq %rbx
|
||||
.cfi_adjust_cfa_offset 8
|
||||
.cfi_offset %rbx,-24
|
||||
pushq %r12
|
||||
.cfi_adjust_cfa_offset 8
|
||||
.cfi_offset %r12,-32
|
||||
pushq %r13
|
||||
.cfi_adjust_cfa_offset 8
|
||||
.cfi_offset %r13,-40
|
||||
pushq %r14
|
||||
.cfi_adjust_cfa_offset 8
|
||||
.cfi_offset %r14,-48
|
||||
pushq %r15
|
||||
.cfi_adjust_cfa_offset 8
|
||||
.cfi_offset %r15,-56
|
||||
pushq %rdi
|
||||
.cfi_adjust_cfa_offset 8
|
||||
.cfi_offset %rdi,-64
|
||||
leaq -16(%rsp),%rsp
|
||||
.cfi_adjust_cfa_offset 16
|
||||
.Lfe64_mul_body:
|
||||
|
||||
movq %rdx,%rax
|
||||
movq 0(%rdx),%rbp
|
||||
movq 0(%rsi),%rdx
|
||||
movq 8(%rax),%rcx
|
||||
movq 16(%rax),%r14
|
||||
movq 24(%rax),%r15
|
||||
|
||||
mulxq %rbp,%r8,%rax
|
||||
xorl %edi,%edi
|
||||
mulxq %rcx,%r9,%rbx
|
||||
adcxq %rax,%r9
|
||||
mulxq %r14,%r10,%rax
|
||||
adcxq %rbx,%r10
|
||||
mulxq %r15,%r11,%r12
|
||||
movq 8(%rsi),%rdx
|
||||
adcxq %rax,%r11
|
||||
movq %r14,(%rsp)
|
||||
adcxq %rdi,%r12
|
||||
|
||||
mulxq %rbp,%rax,%rbx
|
||||
adoxq %rax,%r9
|
||||
adcxq %rbx,%r10
|
||||
mulxq %rcx,%rax,%rbx
|
||||
adoxq %rax,%r10
|
||||
adcxq %rbx,%r11
|
||||
mulxq %r14,%rax,%rbx
|
||||
adoxq %rax,%r11
|
||||
adcxq %rbx,%r12
|
||||
mulxq %r15,%rax,%r13
|
||||
movq 16(%rsi),%rdx
|
||||
adoxq %rax,%r12
|
||||
adcxq %rdi,%r13
|
||||
adoxq %rdi,%r13
|
||||
|
||||
mulxq %rbp,%rax,%rbx
|
||||
adcxq %rax,%r10
|
||||
adoxq %rbx,%r11
|
||||
mulxq %rcx,%rax,%rbx
|
||||
adcxq %rax,%r11
|
||||
adoxq %rbx,%r12
|
||||
mulxq %r14,%rax,%rbx
|
||||
adcxq %rax,%r12
|
||||
adoxq %rbx,%r13
|
||||
mulxq %r15,%rax,%r14
|
||||
movq 24(%rsi),%rdx
|
||||
adcxq %rax,%r13
|
||||
adoxq %rdi,%r14
|
||||
adcxq %rdi,%r14
|
||||
|
||||
mulxq %rbp,%rax,%rbx
|
||||
adoxq %rax,%r11
|
||||
adcxq %rbx,%r12
|
||||
mulxq %rcx,%rax,%rbx
|
||||
adoxq %rax,%r12
|
||||
adcxq %rbx,%r13
|
||||
mulxq (%rsp),%rax,%rbx
|
||||
adoxq %rax,%r13
|
||||
adcxq %rbx,%r14
|
||||
mulxq %r15,%rax,%r15
|
||||
movl $38,%edx
|
||||
adoxq %rax,%r14
|
||||
adcxq %rdi,%r15
|
||||
adoxq %rdi,%r15
|
||||
|
||||
jmp .Lreduce64
|
||||
.Lfe64_mul_epilogue:
|
||||
.cfi_endproc
|
||||
.size x25519_fe64_mul,.-x25519_fe64_mul
|
||||
|
||||
.globl x25519_fe64_sqr
|
||||
.type x25519_fe64_sqr,@function
|
||||
.align 32
|
||||
x25519_fe64_sqr:
|
||||
.cfi_startproc
|
||||
pushq %rbp
|
||||
.cfi_adjust_cfa_offset 8
|
||||
.cfi_offset %rbp,-16
|
||||
pushq %rbx
|
||||
.cfi_adjust_cfa_offset 8
|
||||
.cfi_offset %rbx,-24
|
||||
pushq %r12
|
||||
.cfi_adjust_cfa_offset 8
|
||||
.cfi_offset %r12,-32
|
||||
pushq %r13
|
||||
.cfi_adjust_cfa_offset 8
|
||||
.cfi_offset %r13,-40
|
||||
pushq %r14
|
||||
.cfi_adjust_cfa_offset 8
|
||||
.cfi_offset %r14,-48
|
||||
pushq %r15
|
||||
.cfi_adjust_cfa_offset 8
|
||||
.cfi_offset %r15,-56
|
||||
pushq %rdi
|
||||
.cfi_adjust_cfa_offset 8
|
||||
.cfi_offset %rdi,-64
|
||||
leaq -16(%rsp),%rsp
|
||||
.cfi_adjust_cfa_offset 16
|
||||
.Lfe64_sqr_body:
|
||||
|
||||
movq 0(%rsi),%rdx
|
||||
movq 8(%rsi),%rcx
|
||||
movq 16(%rsi),%rbp
|
||||
movq 24(%rsi),%rsi
|
||||
|
||||
|
||||
mulxq %rdx,%r8,%r15
|
||||
mulxq %rcx,%r9,%rax
|
||||
xorl %edi,%edi
|
||||
mulxq %rbp,%r10,%rbx
|
||||
adcxq %rax,%r10
|
||||
mulxq %rsi,%r11,%r12
|
||||
movq %rcx,%rdx
|
||||
adcxq %rbx,%r11
|
||||
adcxq %rdi,%r12
|
||||
|
||||
|
||||
mulxq %rbp,%rax,%rbx
|
||||
adoxq %rax,%r11
|
||||
adcxq %rbx,%r12
|
||||
mulxq %rsi,%rax,%r13
|
||||
movq %rbp,%rdx
|
||||
adoxq %rax,%r12
|
||||
adcxq %rdi,%r13
|
||||
|
||||
|
||||
mulxq %rsi,%rax,%r14
|
||||
movq %rcx,%rdx
|
||||
adoxq %rax,%r13
|
||||
adcxq %rdi,%r14
|
||||
adoxq %rdi,%r14
|
||||
|
||||
adcxq %r9,%r9
|
||||
adoxq %r15,%r9
|
||||
adcxq %r10,%r10
|
||||
mulxq %rdx,%rax,%rbx
|
||||
movq %rbp,%rdx
|
||||
adcxq %r11,%r11
|
||||
adoxq %rax,%r10
|
||||
adcxq %r12,%r12
|
||||
adoxq %rbx,%r11
|
||||
mulxq %rdx,%rax,%rbx
|
||||
movq %rsi,%rdx
|
||||
adcxq %r13,%r13
|
||||
adoxq %rax,%r12
|
||||
adcxq %r14,%r14
|
||||
adoxq %rbx,%r13
|
||||
mulxq %rdx,%rax,%r15
|
||||
movl $38,%edx
|
||||
adoxq %rax,%r14
|
||||
adcxq %rdi,%r15
|
||||
adoxq %rdi,%r15
|
||||
jmp .Lreduce64
|
||||
|
||||
.align 32
|
||||
.Lreduce64:
|
||||
mulxq %r12,%rax,%rbx
|
||||
adcxq %rax,%r8
|
||||
adoxq %rbx,%r9
|
||||
mulxq %r13,%rax,%rbx
|
||||
adcxq %rax,%r9
|
||||
adoxq %rbx,%r10
|
||||
mulxq %r14,%rax,%rbx
|
||||
adcxq %rax,%r10
|
||||
adoxq %rbx,%r11
|
||||
mulxq %r15,%rax,%r12
|
||||
adcxq %rax,%r11
|
||||
adoxq %rdi,%r12
|
||||
adcxq %rdi,%r12
|
||||
|
||||
movq 16(%rsp),%rdi
|
||||
imulq %rdx,%r12
|
||||
|
||||
addq %r12,%r8
|
||||
adcq $0,%r9
|
||||
adcq $0,%r10
|
||||
adcq $0,%r11
|
||||
|
||||
sbbq %rax,%rax
|
||||
andq $38,%rax
|
||||
|
||||
addq %rax,%r8
|
||||
movq %r9,8(%rdi)
|
||||
movq %r10,16(%rdi)
|
||||
movq %r11,24(%rdi)
|
||||
movq %r8,0(%rdi)
|
||||
|
||||
movq 24(%rsp),%r15
|
||||
.cfi_restore %r15
|
||||
movq 32(%rsp),%r14
|
||||
.cfi_restore %r14
|
||||
movq 40(%rsp),%r13
|
||||
.cfi_restore %r13
|
||||
movq 48(%rsp),%r12
|
||||
.cfi_restore %r12
|
||||
movq 56(%rsp),%rbx
|
||||
.cfi_restore %rbx
|
||||
movq 64(%rsp),%rbp
|
||||
.cfi_restore %rbp
|
||||
leaq 72(%rsp),%rsp
|
||||
.cfi_adjust_cfa_offset 88
|
||||
.Lfe64_sqr_epilogue:
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size x25519_fe64_sqr,.-x25519_fe64_sqr
|
||||
|
||||
.globl x25519_fe64_mul121666
|
||||
.type x25519_fe64_mul121666,@function
|
||||
.align 32
|
||||
x25519_fe64_mul121666:
|
||||
.Lfe64_mul121666_body:
|
||||
.cfi_startproc
|
||||
movl $121666,%edx
|
||||
mulxq 0(%rsi),%r8,%rcx
|
||||
mulxq 8(%rsi),%r9,%rax
|
||||
addq %rcx,%r9
|
||||
mulxq 16(%rsi),%r10,%rcx
|
||||
adcq %rax,%r10
|
||||
mulxq 24(%rsi),%r11,%rax
|
||||
adcq %rcx,%r11
|
||||
adcq $0,%rax
|
||||
|
||||
imulq $38,%rax,%rax
|
||||
|
||||
addq %rax,%r8
|
||||
adcq $0,%r9
|
||||
adcq $0,%r10
|
||||
adcq $0,%r11
|
||||
|
||||
sbbq %rax,%rax
|
||||
andq $38,%rax
|
||||
|
||||
addq %rax,%r8
|
||||
movq %r9,8(%rdi)
|
||||
movq %r10,16(%rdi)
|
||||
movq %r11,24(%rdi)
|
||||
movq %r8,0(%rdi)
|
||||
|
||||
.Lfe64_mul121666_epilogue:
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size x25519_fe64_mul121666,.-x25519_fe64_mul121666
|
||||
|
||||
.globl x25519_fe64_add
|
||||
.type x25519_fe64_add,@function
|
||||
.align 32
|
||||
x25519_fe64_add:
|
||||
.Lfe64_add_body:
|
||||
.cfi_startproc
|
||||
movq 0(%rsi),%r8
|
||||
movq 8(%rsi),%r9
|
||||
movq 16(%rsi),%r10
|
||||
movq 24(%rsi),%r11
|
||||
|
||||
addq 0(%rdx),%r8
|
||||
adcq 8(%rdx),%r9
|
||||
adcq 16(%rdx),%r10
|
||||
adcq 24(%rdx),%r11
|
||||
|
||||
sbbq %rax,%rax
|
||||
andq $38,%rax
|
||||
|
||||
addq %rax,%r8
|
||||
adcq $0,%r9
|
||||
adcq $0,%r10
|
||||
movq %r9,8(%rdi)
|
||||
adcq $0,%r11
|
||||
movq %r10,16(%rdi)
|
||||
sbbq %rax,%rax
|
||||
movq %r11,24(%rdi)
|
||||
andq $38,%rax
|
||||
|
||||
addq %rax,%r8
|
||||
movq %r8,0(%rdi)
|
||||
|
||||
.Lfe64_add_epilogue:
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size x25519_fe64_add,.-x25519_fe64_add
|
||||
|
||||
.globl x25519_fe64_sub
|
||||
.type x25519_fe64_sub,@function
|
||||
.align 32
|
||||
x25519_fe64_sub:
|
||||
.Lfe64_sub_body:
|
||||
.cfi_startproc
|
||||
movq 0(%rsi),%r8
|
||||
movq 8(%rsi),%r9
|
||||
movq 16(%rsi),%r10
|
||||
movq 24(%rsi),%r11
|
||||
|
||||
subq 0(%rdx),%r8
|
||||
sbbq 8(%rdx),%r9
|
||||
sbbq 16(%rdx),%r10
|
||||
sbbq 24(%rdx),%r11
|
||||
|
||||
sbbq %rax,%rax
|
||||
andq $38,%rax
|
||||
|
||||
subq %rax,%r8
|
||||
sbbq $0,%r9
|
||||
sbbq $0,%r10
|
||||
movq %r9,8(%rdi)
|
||||
sbbq $0,%r11
|
||||
movq %r10,16(%rdi)
|
||||
sbbq %rax,%rax
|
||||
movq %r11,24(%rdi)
|
||||
andq $38,%rax
|
||||
|
||||
subq %rax,%r8
|
||||
movq %r8,0(%rdi)
|
||||
|
||||
.Lfe64_sub_epilogue:
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size x25519_fe64_sub,.-x25519_fe64_sub
|
||||
|
||||
.globl x25519_fe64_tobytes
|
||||
.type x25519_fe64_tobytes,@function
|
||||
.align 32
|
||||
x25519_fe64_tobytes:
|
||||
.Lfe64_to_body:
|
||||
.cfi_startproc
|
||||
movq 0(%rsi),%r8
|
||||
movq 8(%rsi),%r9
|
||||
movq 16(%rsi),%r10
|
||||
movq 24(%rsi),%r11
|
||||
|
||||
|
||||
leaq (%r11,%r11,1),%rax
|
||||
sarq $63,%r11
|
||||
shrq $1,%rax
|
||||
andq $19,%r11
|
||||
addq $19,%r11
|
||||
|
||||
addq %r11,%r8
|
||||
adcq $0,%r9
|
||||
adcq $0,%r10
|
||||
adcq $0,%rax
|
||||
|
||||
leaq (%rax,%rax,1),%r11
|
||||
sarq $63,%rax
|
||||
shrq $1,%r11
|
||||
notq %rax
|
||||
andq $19,%rax
|
||||
|
||||
subq %rax,%r8
|
||||
sbbq $0,%r9
|
||||
sbbq $0,%r10
|
||||
sbbq $0,%r11
|
||||
|
||||
movq %r8,0(%rdi)
|
||||
movq %r9,8(%rdi)
|
||||
movq %r10,16(%rdi)
|
||||
movq %r11,24(%rdi)
|
||||
|
||||
.Lfe64_to_epilogue:
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size x25519_fe64_tobytes,.-x25519_fe64_tobytes
|
||||
.byte 88,50,53,53,49,57,32,112,114,105,109,105,116,105,118,101,115,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
@@ -16,6 +16,7 @@ bn_mul_mont:
|
||||
jnz .Lmul_enter
|
||||
cmpl $8,%r9d
|
||||
jb .Lmul_enter
|
||||
movl OPENSSL_ia32cap_P+8(%rip),%r11d
|
||||
cmpq %rsi,%rdx
|
||||
jne .Lmul4x_enter
|
||||
testl $7,%r9d
|
||||
@@ -264,6 +265,9 @@ bn_mul4x_mont:
|
||||
movq %rsp,%rax
|
||||
.cfi_def_cfa_register %rax
|
||||
.Lmul4x_enter:
|
||||
andl $0x80100,%r11d
|
||||
cmpl $0x80100,%r11d
|
||||
je .Lmulx4x_enter
|
||||
pushq %rbx
|
||||
.cfi_offset %rbx,-16
|
||||
pushq %rbp
|
||||
@@ -689,6 +693,7 @@ bn_mul4x_mont:
|
||||
.size bn_mul4x_mont,.-bn_mul4x_mont
|
||||
|
||||
|
||||
|
||||
.type bn_sqr8x_mont,@function
|
||||
.align 32
|
||||
bn_sqr8x_mont:
|
||||
@@ -770,6 +775,25 @@ bn_sqr8x_mont:
|
||||
pxor %xmm0,%xmm0
|
||||
.byte 102,72,15,110,207
|
||||
.byte 102,73,15,110,218
|
||||
movl OPENSSL_ia32cap_P+8(%rip),%eax
|
||||
andl $0x80100,%eax
|
||||
cmpl $0x80100,%eax
|
||||
jne .Lsqr8x_nox
|
||||
|
||||
call bn_sqrx8x_internal
|
||||
|
||||
|
||||
|
||||
|
||||
leaq (%r8,%rcx,1),%rbx
|
||||
movq %rcx,%r9
|
||||
movq %rcx,%rdx
|
||||
.byte 102,72,15,126,207
|
||||
sarq $3+2,%rcx
|
||||
jmp .Lsqr8x_sub
|
||||
|
||||
.align 32
|
||||
.Lsqr8x_nox:
|
||||
call bn_sqr8x_internal
|
||||
|
||||
|
||||
@@ -857,5 +881,361 @@ bn_sqr8x_mont:
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size bn_sqr8x_mont,.-bn_sqr8x_mont
|
||||
.type bn_mulx4x_mont,@function
|
||||
.align 32
|
||||
bn_mulx4x_mont:
|
||||
.cfi_startproc
|
||||
movq %rsp,%rax
|
||||
.cfi_def_cfa_register %rax
|
||||
.Lmulx4x_enter:
|
||||
pushq %rbx
|
||||
.cfi_offset %rbx,-16
|
||||
pushq %rbp
|
||||
.cfi_offset %rbp,-24
|
||||
pushq %r12
|
||||
.cfi_offset %r12,-32
|
||||
pushq %r13
|
||||
.cfi_offset %r13,-40
|
||||
pushq %r14
|
||||
.cfi_offset %r14,-48
|
||||
pushq %r15
|
||||
.cfi_offset %r15,-56
|
||||
.Lmulx4x_prologue:
|
||||
|
||||
shll $3,%r9d
|
||||
xorq %r10,%r10
|
||||
subq %r9,%r10
|
||||
movq (%r8),%r8
|
||||
leaq -72(%rsp,%r10,1),%rbp
|
||||
andq $-128,%rbp
|
||||
movq %rsp,%r11
|
||||
subq %rbp,%r11
|
||||
andq $-4096,%r11
|
||||
leaq (%r11,%rbp,1),%rsp
|
||||
movq (%rsp),%r10
|
||||
cmpq %rbp,%rsp
|
||||
ja .Lmulx4x_page_walk
|
||||
jmp .Lmulx4x_page_walk_done
|
||||
|
||||
.align 16
|
||||
.Lmulx4x_page_walk:
|
||||
leaq -4096(%rsp),%rsp
|
||||
movq (%rsp),%r10
|
||||
cmpq %rbp,%rsp
|
||||
ja .Lmulx4x_page_walk
|
||||
.Lmulx4x_page_walk_done:
|
||||
|
||||
leaq (%rdx,%r9,1),%r10
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
movq %r9,0(%rsp)
|
||||
shrq $5,%r9
|
||||
movq %r10,16(%rsp)
|
||||
subq $1,%r9
|
||||
movq %r8,24(%rsp)
|
||||
movq %rdi,32(%rsp)
|
||||
movq %rax,40(%rsp)
|
||||
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
|
||||
movq %r9,48(%rsp)
|
||||
jmp .Lmulx4x_body
|
||||
|
||||
.align 32
|
||||
.Lmulx4x_body:
|
||||
leaq 8(%rdx),%rdi
|
||||
movq (%rdx),%rdx
|
||||
leaq 64+32(%rsp),%rbx
|
||||
movq %rdx,%r9
|
||||
|
||||
mulxq 0(%rsi),%r8,%rax
|
||||
mulxq 8(%rsi),%r11,%r14
|
||||
addq %rax,%r11
|
||||
movq %rdi,8(%rsp)
|
||||
mulxq 16(%rsi),%r12,%r13
|
||||
adcq %r14,%r12
|
||||
adcq $0,%r13
|
||||
|
||||
movq %r8,%rdi
|
||||
imulq 24(%rsp),%r8
|
||||
xorq %rbp,%rbp
|
||||
|
||||
mulxq 24(%rsi),%rax,%r14
|
||||
movq %r8,%rdx
|
||||
leaq 32(%rsi),%rsi
|
||||
adcxq %rax,%r13
|
||||
adcxq %rbp,%r14
|
||||
|
||||
mulxq 0(%rcx),%rax,%r10
|
||||
adcxq %rax,%rdi
|
||||
adoxq %r11,%r10
|
||||
mulxq 8(%rcx),%rax,%r11
|
||||
adcxq %rax,%r10
|
||||
adoxq %r12,%r11
|
||||
.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
|
||||
movq 48(%rsp),%rdi
|
||||
movq %r10,-32(%rbx)
|
||||
adcxq %rax,%r11
|
||||
adoxq %r13,%r12
|
||||
mulxq 24(%rcx),%rax,%r15
|
||||
movq %r9,%rdx
|
||||
movq %r11,-24(%rbx)
|
||||
adcxq %rax,%r12
|
||||
adoxq %rbp,%r15
|
||||
leaq 32(%rcx),%rcx
|
||||
movq %r12,-16(%rbx)
|
||||
|
||||
jmp .Lmulx4x_1st
|
||||
|
||||
.align 32
|
||||
.Lmulx4x_1st:
|
||||
adcxq %rbp,%r15
|
||||
mulxq 0(%rsi),%r10,%rax
|
||||
adcxq %r14,%r10
|
||||
mulxq 8(%rsi),%r11,%r14
|
||||
adcxq %rax,%r11
|
||||
mulxq 16(%rsi),%r12,%rax
|
||||
adcxq %r14,%r12
|
||||
mulxq 24(%rsi),%r13,%r14
|
||||
.byte 0x67,0x67
|
||||
movq %r8,%rdx
|
||||
adcxq %rax,%r13
|
||||
adcxq %rbp,%r14
|
||||
leaq 32(%rsi),%rsi
|
||||
leaq 32(%rbx),%rbx
|
||||
|
||||
adoxq %r15,%r10
|
||||
mulxq 0(%rcx),%rax,%r15
|
||||
adcxq %rax,%r10
|
||||
adoxq %r15,%r11
|
||||
mulxq 8(%rcx),%rax,%r15
|
||||
adcxq %rax,%r11
|
||||
adoxq %r15,%r12
|
||||
mulxq 16(%rcx),%rax,%r15
|
||||
movq %r10,-40(%rbx)
|
||||
adcxq %rax,%r12
|
||||
movq %r11,-32(%rbx)
|
||||
adoxq %r15,%r13
|
||||
mulxq 24(%rcx),%rax,%r15
|
||||
movq %r9,%rdx
|
||||
movq %r12,-24(%rbx)
|
||||
adcxq %rax,%r13
|
||||
adoxq %rbp,%r15
|
||||
leaq 32(%rcx),%rcx
|
||||
movq %r13,-16(%rbx)
|
||||
|
||||
decq %rdi
|
||||
jnz .Lmulx4x_1st
|
||||
|
||||
movq 0(%rsp),%rax
|
||||
movq 8(%rsp),%rdi
|
||||
adcq %rbp,%r15
|
||||
addq %r15,%r14
|
||||
sbbq %r15,%r15
|
||||
movq %r14,-8(%rbx)
|
||||
jmp .Lmulx4x_outer
|
||||
|
||||
.align 32
|
||||
.Lmulx4x_outer:
|
||||
movq (%rdi),%rdx
|
||||
leaq 8(%rdi),%rdi
|
||||
subq %rax,%rsi
|
||||
movq %r15,(%rbx)
|
||||
leaq 64+32(%rsp),%rbx
|
||||
subq %rax,%rcx
|
||||
|
||||
mulxq 0(%rsi),%r8,%r11
|
||||
xorl %ebp,%ebp
|
||||
movq %rdx,%r9
|
||||
mulxq 8(%rsi),%r14,%r12
|
||||
adoxq -32(%rbx),%r8
|
||||
adcxq %r14,%r11
|
||||
mulxq 16(%rsi),%r15,%r13
|
||||
adoxq -24(%rbx),%r11
|
||||
adcxq %r15,%r12
|
||||
adoxq -16(%rbx),%r12
|
||||
adcxq %rbp,%r13
|
||||
adoxq %rbp,%r13
|
||||
|
||||
movq %rdi,8(%rsp)
|
||||
movq %r8,%r15
|
||||
imulq 24(%rsp),%r8
|
||||
xorl %ebp,%ebp
|
||||
|
||||
mulxq 24(%rsi),%rax,%r14
|
||||
movq %r8,%rdx
|
||||
adcxq %rax,%r13
|
||||
adoxq -8(%rbx),%r13
|
||||
adcxq %rbp,%r14
|
||||
leaq 32(%rsi),%rsi
|
||||
adoxq %rbp,%r14
|
||||
|
||||
mulxq 0(%rcx),%rax,%r10
|
||||
adcxq %rax,%r15
|
||||
adoxq %r11,%r10
|
||||
mulxq 8(%rcx),%rax,%r11
|
||||
adcxq %rax,%r10
|
||||
adoxq %r12,%r11
|
||||
mulxq 16(%rcx),%rax,%r12
|
||||
movq %r10,-32(%rbx)
|
||||
adcxq %rax,%r11
|
||||
adoxq %r13,%r12
|
||||
mulxq 24(%rcx),%rax,%r15
|
||||
movq %r9,%rdx
|
||||
movq %r11,-24(%rbx)
|
||||
leaq 32(%rcx),%rcx
|
||||
adcxq %rax,%r12
|
||||
adoxq %rbp,%r15
|
||||
movq 48(%rsp),%rdi
|
||||
movq %r12,-16(%rbx)
|
||||
|
||||
jmp .Lmulx4x_inner
|
||||
|
||||
.align 32
|
||||
.Lmulx4x_inner:
|
||||
mulxq 0(%rsi),%r10,%rax
|
||||
adcxq %rbp,%r15
|
||||
adoxq %r14,%r10
|
||||
mulxq 8(%rsi),%r11,%r14
|
||||
adcxq 0(%rbx),%r10
|
||||
adoxq %rax,%r11
|
||||
mulxq 16(%rsi),%r12,%rax
|
||||
adcxq 8(%rbx),%r11
|
||||
adoxq %r14,%r12
|
||||
mulxq 24(%rsi),%r13,%r14
|
||||
movq %r8,%rdx
|
||||
adcxq 16(%rbx),%r12
|
||||
adoxq %rax,%r13
|
||||
adcxq 24(%rbx),%r13
|
||||
adoxq %rbp,%r14
|
||||
leaq 32(%rsi),%rsi
|
||||
leaq 32(%rbx),%rbx
|
||||
adcxq %rbp,%r14
|
||||
|
||||
adoxq %r15,%r10
|
||||
mulxq 0(%rcx),%rax,%r15
|
||||
adcxq %rax,%r10
|
||||
adoxq %r15,%r11
|
||||
mulxq 8(%rcx),%rax,%r15
|
||||
adcxq %rax,%r11
|
||||
adoxq %r15,%r12
|
||||
mulxq 16(%rcx),%rax,%r15
|
||||
movq %r10,-40(%rbx)
|
||||
adcxq %rax,%r12
|
||||
adoxq %r15,%r13
|
||||
mulxq 24(%rcx),%rax,%r15
|
||||
movq %r9,%rdx
|
||||
movq %r11,-32(%rbx)
|
||||
movq %r12,-24(%rbx)
|
||||
adcxq %rax,%r13
|
||||
adoxq %rbp,%r15
|
||||
leaq 32(%rcx),%rcx
|
||||
movq %r13,-16(%rbx)
|
||||
|
||||
decq %rdi
|
||||
jnz .Lmulx4x_inner
|
||||
|
||||
movq 0(%rsp),%rax
|
||||
movq 8(%rsp),%rdi
|
||||
adcq %rbp,%r15
|
||||
subq 0(%rbx),%rbp
|
||||
adcq %r15,%r14
|
||||
sbbq %r15,%r15
|
||||
movq %r14,-8(%rbx)
|
||||
|
||||
cmpq 16(%rsp),%rdi
|
||||
jne .Lmulx4x_outer
|
||||
|
||||
leaq 64(%rsp),%rbx
|
||||
subq %rax,%rcx
|
||||
negq %r15
|
||||
movq %rax,%rdx
|
||||
shrq $3+2,%rax
|
||||
movq 32(%rsp),%rdi
|
||||
jmp .Lmulx4x_sub
|
||||
|
||||
.align 32
|
||||
.Lmulx4x_sub:
|
||||
movq 0(%rbx),%r11
|
||||
movq 8(%rbx),%r12
|
||||
movq 16(%rbx),%r13
|
||||
movq 24(%rbx),%r14
|
||||
leaq 32(%rbx),%rbx
|
||||
sbbq 0(%rcx),%r11
|
||||
sbbq 8(%rcx),%r12
|
||||
sbbq 16(%rcx),%r13
|
||||
sbbq 24(%rcx),%r14
|
||||
leaq 32(%rcx),%rcx
|
||||
movq %r11,0(%rdi)
|
||||
movq %r12,8(%rdi)
|
||||
movq %r13,16(%rdi)
|
||||
movq %r14,24(%rdi)
|
||||
leaq 32(%rdi),%rdi
|
||||
decq %rax
|
||||
jnz .Lmulx4x_sub
|
||||
|
||||
sbbq $0,%r15
|
||||
leaq 64(%rsp),%rbx
|
||||
subq %rdx,%rdi
|
||||
|
||||
.byte 102,73,15,110,207
|
||||
pxor %xmm0,%xmm0
|
||||
pshufd $0,%xmm1,%xmm1
|
||||
movq 40(%rsp),%rsi
|
||||
.cfi_def_cfa %rsi,8
|
||||
jmp .Lmulx4x_cond_copy
|
||||
|
||||
.align 32
|
||||
.Lmulx4x_cond_copy:
|
||||
movdqa 0(%rbx),%xmm2
|
||||
movdqa 16(%rbx),%xmm3
|
||||
leaq 32(%rbx),%rbx
|
||||
movdqu 0(%rdi),%xmm4
|
||||
movdqu 16(%rdi),%xmm5
|
||||
leaq 32(%rdi),%rdi
|
||||
movdqa %xmm0,-32(%rbx)
|
||||
movdqa %xmm0,-16(%rbx)
|
||||
pcmpeqd %xmm1,%xmm0
|
||||
pand %xmm1,%xmm2
|
||||
pand %xmm1,%xmm3
|
||||
pand %xmm0,%xmm4
|
||||
pand %xmm0,%xmm5
|
||||
pxor %xmm0,%xmm0
|
||||
por %xmm2,%xmm4
|
||||
por %xmm3,%xmm5
|
||||
movdqu %xmm4,-32(%rdi)
|
||||
movdqu %xmm5,-16(%rdi)
|
||||
subq $32,%rdx
|
||||
jnz .Lmulx4x_cond_copy
|
||||
|
||||
movq %rdx,(%rbx)
|
||||
|
||||
movq $1,%rax
|
||||
movq -48(%rsi),%r15
|
||||
.cfi_restore %r15
|
||||
movq -40(%rsi),%r14
|
||||
.cfi_restore %r14
|
||||
movq -32(%rsi),%r13
|
||||
.cfi_restore %r13
|
||||
movq -24(%rsi),%r12
|
||||
.cfi_restore %r12
|
||||
movq -16(%rsi),%rbp
|
||||
.cfi_restore %rbp
|
||||
movq -8(%rsi),%rbx
|
||||
.cfi_restore %rbx
|
||||
leaq (%rsi),%rsp
|
||||
.cfi_def_cfa_register %rsp
|
||||
.Lmulx4x_epilogue:
|
||||
.byte 0xf3,0xc3
|
||||
.cfi_endproc
|
||||
.size bn_mulx4x_mont,.-bn_mulx4x_mont
|
||||
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
|
||||
.align 16
@@ -385,6 +385,8 @@ ChaCha20_ssse3:
|
||||
pushl %esi
|
||||
pushl %edi
|
||||
.Lssse3_shortcut:
|
||||
testl $2048,4(%ebp)
|
||||
jnz .Lxop_shortcut
|
||||
movl 20(%esp),%edi
|
||||
movl 24(%esp),%esi
|
||||
movl 28(%esp),%ecx
|
||||
@@ -528,6 +530,484 @@ ChaCha20_ssse3:
|
||||
.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
|
||||
.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
|
||||
.byte 114,103,62,0
|
||||
.globl ChaCha20_xop
|
||||
.type ChaCha20_xop,@function
|
||||
.align 16
|
||||
ChaCha20_xop:
|
||||
.L_ChaCha20_xop_begin:
|
||||
pushl %ebp
|
||||
pushl %ebx
|
||||
pushl %esi
|
||||
pushl %edi
|
||||
.Lxop_shortcut:
|
||||
movl 20(%esp),%edi
|
||||
movl 24(%esp),%esi
|
||||
movl 28(%esp),%ecx
|
||||
movl 32(%esp),%edx
|
||||
movl 36(%esp),%ebx
|
||||
vzeroupper
|
||||
movl %esp,%ebp
|
||||
subl $524,%esp
|
||||
andl $-64,%esp
|
||||
movl %ebp,512(%esp)
|
||||
leal .Lssse3_data-.Lpic_point(%eax),%eax
|
||||
vmovdqu (%ebx),%xmm3
|
||||
cmpl $256,%ecx
|
||||
jb .L0141x
|
||||
movl %edx,516(%esp)
|
||||
movl %ebx,520(%esp)
|
||||
subl $256,%ecx
|
||||
leal 384(%esp),%ebp
|
||||
vmovdqu (%edx),%xmm7
|
||||
vpshufd $0,%xmm3,%xmm0
|
||||
vpshufd $85,%xmm3,%xmm1
|
||||
vpshufd $170,%xmm3,%xmm2
|
||||
vpshufd $255,%xmm3,%xmm3
|
||||
vpaddd 48(%eax),%xmm0,%xmm0
|
||||
vpshufd $0,%xmm7,%xmm4
|
||||
vpshufd $85,%xmm7,%xmm5
|
||||
vpsubd 64(%eax),%xmm0,%xmm0
|
||||
vpshufd $170,%xmm7,%xmm6
|
||||
vpshufd $255,%xmm7,%xmm7
|
||||
vmovdqa %xmm0,64(%ebp)
|
||||
vmovdqa %xmm1,80(%ebp)
|
||||
vmovdqa %xmm2,96(%ebp)
|
||||
vmovdqa %xmm3,112(%ebp)
|
||||
vmovdqu 16(%edx),%xmm3
|
||||
vmovdqa %xmm4,-64(%ebp)
|
||||
vmovdqa %xmm5,-48(%ebp)
|
||||
vmovdqa %xmm6,-32(%ebp)
|
||||
vmovdqa %xmm7,-16(%ebp)
|
||||
vmovdqa 32(%eax),%xmm7
|
||||
leal 128(%esp),%ebx
|
||||
vpshufd $0,%xmm3,%xmm0
|
||||
vpshufd $85,%xmm3,%xmm1
|
||||
vpshufd $170,%xmm3,%xmm2
|
||||
vpshufd $255,%xmm3,%xmm3
|
||||
vpshufd $0,%xmm7,%xmm4
|
||||
vpshufd $85,%xmm7,%xmm5
|
||||
vpshufd $170,%xmm7,%xmm6
|
||||
vpshufd $255,%xmm7,%xmm7
|
||||
vmovdqa %xmm0,(%ebp)
|
||||
vmovdqa %xmm1,16(%ebp)
|
||||
vmovdqa %xmm2,32(%ebp)
|
||||
vmovdqa %xmm3,48(%ebp)
|
||||
vmovdqa %xmm4,-128(%ebp)
|
||||
vmovdqa %xmm5,-112(%ebp)
|
||||
vmovdqa %xmm6,-96(%ebp)
|
||||
vmovdqa %xmm7,-80(%ebp)
|
||||
leal 128(%esi),%esi
|
||||
leal 128(%edi),%edi
|
||||
jmp .L015outer_loop
|
||||
.align 32
|
||||
.L015outer_loop:
|
||||
vmovdqa -112(%ebp),%xmm1
|
||||
vmovdqa -96(%ebp),%xmm2
|
||||
vmovdqa -80(%ebp),%xmm3
|
||||
vmovdqa -48(%ebp),%xmm5
|
||||
vmovdqa -32(%ebp),%xmm6
|
||||
vmovdqa -16(%ebp),%xmm7
|
||||
vmovdqa %xmm1,-112(%ebx)
|
||||
vmovdqa %xmm2,-96(%ebx)
|
||||
vmovdqa %xmm3,-80(%ebx)
|
||||
vmovdqa %xmm5,-48(%ebx)
|
||||
vmovdqa %xmm6,-32(%ebx)
|
||||
vmovdqa %xmm7,-16(%ebx)
|
||||
vmovdqa 32(%ebp),%xmm2
|
||||
vmovdqa 48(%ebp),%xmm3
|
||||
vmovdqa 64(%ebp),%xmm4
|
||||
vmovdqa 80(%ebp),%xmm5
|
||||
vmovdqa 96(%ebp),%xmm6
|
||||
vmovdqa 112(%ebp),%xmm7
|
||||
vpaddd 64(%eax),%xmm4,%xmm4
|
||||
vmovdqa %xmm2,32(%ebx)
|
||||
vmovdqa %xmm3,48(%ebx)
|
||||
vmovdqa %xmm4,64(%ebx)
|
||||
vmovdqa %xmm5,80(%ebx)
|
||||
vmovdqa %xmm6,96(%ebx)
|
||||
vmovdqa %xmm7,112(%ebx)
|
||||
vmovdqa %xmm4,64(%ebp)
|
||||
vmovdqa -128(%ebp),%xmm0
|
||||
vmovdqa %xmm4,%xmm6
|
||||
vmovdqa -64(%ebp),%xmm3
|
||||
vmovdqa (%ebp),%xmm4
|
||||
vmovdqa 16(%ebp),%xmm5
|
||||
movl $10,%edx
|
||||
nop
|
||||
.align 32
|
||||
.L016loop:
|
||||
vpaddd %xmm3,%xmm0,%xmm0
|
||||
vpxor %xmm0,%xmm6,%xmm6
|
||||
.byte 143,232,120,194,246,16
|
||||
vpaddd %xmm6,%xmm4,%xmm4
|
||||
vpxor %xmm4,%xmm3,%xmm2
|
||||
vmovdqa -112(%ebx),%xmm1
|
||||
.byte 143,232,120,194,210,12
|
||||
vmovdqa -48(%ebx),%xmm3
|
||||
vpaddd %xmm2,%xmm0,%xmm0
|
||||
vmovdqa 80(%ebx),%xmm7
|
||||
vpxor %xmm0,%xmm6,%xmm6
|
||||
vpaddd %xmm3,%xmm1,%xmm1
|
||||
.byte 143,232,120,194,246,8
|
||||
vmovdqa %xmm0,-128(%ebx)
|
||||
vpaddd %xmm6,%xmm4,%xmm4
|
||||
vmovdqa %xmm6,64(%ebx)
|
||||
vpxor %xmm4,%xmm2,%xmm2
|
||||
vpxor %xmm1,%xmm7,%xmm7
|
||||
.byte 143,232,120,194,210,7
|
||||
vmovdqa %xmm4,(%ebx)
|
||||
.byte 143,232,120,194,255,16
|
||||
vmovdqa %xmm2,-64(%ebx)
|
||||
vpaddd %xmm7,%xmm5,%xmm5
|
||||
vmovdqa 32(%ebx),%xmm4
|
||||
vpxor %xmm5,%xmm3,%xmm3
|
||||
vmovdqa -96(%ebx),%xmm0
|
||||
.byte 143,232,120,194,219,12
|
||||
vmovdqa -32(%ebx),%xmm2
|
||||
vpaddd %xmm3,%xmm1,%xmm1
|
||||
vmovdqa 96(%ebx),%xmm6
|
||||
vpxor %xmm1,%xmm7,%xmm7
|
||||
vpaddd %xmm2,%xmm0,%xmm0
|
||||
.byte 143,232,120,194,255,8
|
||||
vmovdqa %xmm1,-112(%ebx)
|
||||
vpaddd %xmm7,%xmm5,%xmm5
|
||||
vmovdqa %xmm7,80(%ebx)
|
||||
vpxor %xmm5,%xmm3,%xmm3
|
||||
vpxor %xmm0,%xmm6,%xmm6
|
||||
.byte 143,232,120,194,219,7
|
||||
vmovdqa %xmm5,16(%ebx)
|
||||
.byte 143,232,120,194,246,16
|
||||
vmovdqa %xmm3,-48(%ebx)
|
||||
vpaddd %xmm6,%xmm4,%xmm4
|
||||
vmovdqa 48(%ebx),%xmm5
|
||||
vpxor %xmm4,%xmm2,%xmm2
|
||||
vmovdqa -80(%ebx),%xmm1
|
||||
.byte 143,232,120,194,210,12
|
||||
vmovdqa -16(%ebx),%xmm3
|
||||
vpaddd %xmm2,%xmm0,%xmm0
|
||||
vmovdqa 112(%ebx),%xmm7
|
||||
vpxor %xmm0,%xmm6,%xmm6
|
||||
vpaddd %xmm3,%xmm1,%xmm1
|
||||
.byte 143,232,120,194,246,8
|
||||
vmovdqa %xmm0,-96(%ebx)
|
||||
vpaddd %xmm6,%xmm4,%xmm4
|
||||
vmovdqa %xmm6,96(%ebx)
|
||||
vpxor %xmm4,%xmm2,%xmm2
|
||||
vpxor %xmm1,%xmm7,%xmm7
|
||||
.byte 143,232,120,194,210,7
|
||||
.byte 143,232,120,194,255,16
|
||||
vmovdqa %xmm2,-32(%ebx)
|
||||
vpaddd %xmm7,%xmm5,%xmm5
|
||||
vpxor %xmm5,%xmm3,%xmm3
|
||||
vmovdqa -128(%ebx),%xmm0
|
||||
.byte 143,232,120,194,219,12
|
||||
vmovdqa -48(%ebx),%xmm2
|
||||
vpaddd %xmm3,%xmm1,%xmm1
|
||||
vpxor %xmm1,%xmm7,%xmm7
|
||||
vpaddd %xmm2,%xmm0,%xmm0
|
||||
.byte 143,232,120,194,255,8
|
||||
vmovdqa %xmm1,-80(%ebx)
|
||||
vpaddd %xmm7,%xmm5,%xmm5
|
||||
vpxor %xmm5,%xmm3,%xmm3
|
||||
vpxor %xmm0,%xmm7,%xmm6
|
||||
.byte 143,232,120,194,219,7
|
||||
.byte 143,232,120,194,246,16
|
||||
vmovdqa %xmm3,-16(%ebx)
|
||||
vpaddd %xmm6,%xmm4,%xmm4
|
||||
vpxor %xmm4,%xmm2,%xmm2
|
||||
vmovdqa -112(%ebx),%xmm1
|
||||
.byte 143,232,120,194,210,12
|
||||
vmovdqa -32(%ebx),%xmm3
|
||||
vpaddd %xmm2,%xmm0,%xmm0
|
||||
vmovdqa 64(%ebx),%xmm7
|
||||
vpxor %xmm0,%xmm6,%xmm6
|
||||
vpaddd %xmm3,%xmm1,%xmm1
|
||||
.byte 143,232,120,194,246,8
|
||||
vmovdqa %xmm0,-128(%ebx)
|
||||
vpaddd %xmm6,%xmm4,%xmm4
|
||||
vmovdqa %xmm6,112(%ebx)
|
||||
vpxor %xmm4,%xmm2,%xmm2
|
||||
vpxor %xmm1,%xmm7,%xmm7
|
||||
.byte 143,232,120,194,210,7
|
||||
vmovdqa %xmm4,32(%ebx)
|
||||
.byte 143,232,120,194,255,16
|
||||
vmovdqa %xmm2,-48(%ebx)
|
||||
vpaddd %xmm7,%xmm5,%xmm5
|
||||
vmovdqa (%ebx),%xmm4
|
||||
vpxor %xmm5,%xmm3,%xmm3
|
||||
vmovdqa -96(%ebx),%xmm0
|
||||
.byte 143,232,120,194,219,12
|
||||
vmovdqa -16(%ebx),%xmm2
|
||||
vpaddd %xmm3,%xmm1,%xmm1
|
||||
vmovdqa 80(%ebx),%xmm6
|
||||
vpxor %xmm1,%xmm7,%xmm7
|
||||
vpaddd %xmm2,%xmm0,%xmm0
|
||||
.byte 143,232,120,194,255,8
|
||||
vmovdqa %xmm1,-112(%ebx)
|
||||
vpaddd %xmm7,%xmm5,%xmm5
|
||||
vmovdqa %xmm7,64(%ebx)
|
||||
vpxor %xmm5,%xmm3,%xmm3
|
||||
vpxor %xmm0,%xmm6,%xmm6
|
||||
.byte 143,232,120,194,219,7
|
||||
vmovdqa %xmm5,48(%ebx)
|
||||
.byte 143,232,120,194,246,16
|
||||
vmovdqa %xmm3,-32(%ebx)
|
||||
vpaddd %xmm6,%xmm4,%xmm4
|
||||
vmovdqa 16(%ebx),%xmm5
|
||||
vpxor %xmm4,%xmm2,%xmm2
|
||||
vmovdqa -80(%ebx),%xmm1
|
||||
.byte 143,232,120,194,210,12
|
||||
vmovdqa -64(%ebx),%xmm3
|
||||
vpaddd %xmm2,%xmm0,%xmm0
|
||||
vmovdqa 96(%ebx),%xmm7
|
||||
vpxor %xmm0,%xmm6,%xmm6
|
||||
vpaddd %xmm3,%xmm1,%xmm1
|
||||
.byte 143,232,120,194,246,8
|
||||
vmovdqa %xmm0,-96(%ebx)
|
||||
vpaddd %xmm6,%xmm4,%xmm4
|
||||
vmovdqa %xmm6,80(%ebx)
|
||||
vpxor %xmm4,%xmm2,%xmm2
|
||||
vpxor %xmm1,%xmm7,%xmm7
|
||||
.byte 143,232,120,194,210,7
|
||||
.byte 143,232,120,194,255,16
|
||||
vmovdqa %xmm2,-16(%ebx)
|
||||
vpaddd %xmm7,%xmm5,%xmm5
|
||||
vpxor %xmm5,%xmm3,%xmm3
|
||||
vmovdqa -128(%ebx),%xmm0
|
||||
.byte 143,232,120,194,219,12
|
||||
vpaddd %xmm3,%xmm1,%xmm1
|
||||
vmovdqa 64(%ebx),%xmm6
|
||||
vpxor %xmm1,%xmm7,%xmm7
|
||||
.byte 143,232,120,194,255,8
|
||||
vmovdqa %xmm1,-80(%ebx)
|
||||
vpaddd %xmm7,%xmm5,%xmm5
|
||||
vmovdqa %xmm7,96(%ebx)
|
||||
vpxor %xmm5,%xmm3,%xmm3
|
||||
.byte 143,232,120,194,219,7
|
||||
decl %edx
|
||||
jnz .L016loop
|
||||
vmovdqa %xmm3,-64(%ebx)
|
||||
vmovdqa %xmm4,(%ebx)
|
||||
vmovdqa %xmm5,16(%ebx)
|
||||
vmovdqa %xmm6,64(%ebx)
|
||||
vmovdqa %xmm7,96(%ebx)
|
||||
vmovdqa -112(%ebx),%xmm1
|
||||
vmovdqa -96(%ebx),%xmm2
|
||||
vmovdqa -80(%ebx),%xmm3
|
||||
vpaddd -128(%ebp),%xmm0,%xmm0
|
||||
vpaddd -112(%ebp),%xmm1,%xmm1
|
||||
vpaddd -96(%ebp),%xmm2,%xmm2
|
||||
vpaddd -80(%ebp),%xmm3,%xmm3
|
||||
vpunpckldq %xmm1,%xmm0,%xmm6
|
||||
vpunpckldq %xmm3,%xmm2,%xmm7
|
||||
vpunpckhdq %xmm1,%xmm0,%xmm0
|
||||
vpunpckhdq %xmm3,%xmm2,%xmm2
|
||||
vpunpcklqdq %xmm7,%xmm6,%xmm1
|
||||
vpunpckhqdq %xmm7,%xmm6,%xmm6
|
||||
vpunpcklqdq %xmm2,%xmm0,%xmm7
|
||||
vpunpckhqdq %xmm2,%xmm0,%xmm3
|
||||
vpxor -128(%esi),%xmm1,%xmm4
|
||||
vpxor -64(%esi),%xmm6,%xmm5
|
||||
vpxor (%esi),%xmm7,%xmm6
|
||||
vpxor 64(%esi),%xmm3,%xmm7
|
||||
leal 16(%esi),%esi
|
||||
vmovdqa -64(%ebx),%xmm0
|
||||
vmovdqa -48(%ebx),%xmm1
|
||||
vmovdqa -32(%ebx),%xmm2
|
||||
vmovdqa -16(%ebx),%xmm3
|
||||
vmovdqu %xmm4,-128(%edi)
|
||||
vmovdqu %xmm5,-64(%edi)
|
||||
vmovdqu %xmm6,(%edi)
|
||||
vmovdqu %xmm7,64(%edi)
|
||||
leal 16(%edi),%edi
|
||||
vpaddd -64(%ebp),%xmm0,%xmm0
|
||||
vpaddd -48(%ebp),%xmm1,%xmm1
|
||||
vpaddd -32(%ebp),%xmm2,%xmm2
|
||||
vpaddd -16(%ebp),%xmm3,%xmm3
vpunpckldq %xmm1,%xmm0,%xmm6
vpunpckldq %xmm3,%xmm2,%xmm7
vpunpckhdq %xmm1,%xmm0,%xmm0
vpunpckhdq %xmm3,%xmm2,%xmm2
vpunpcklqdq %xmm7,%xmm6,%xmm1
vpunpckhqdq %xmm7,%xmm6,%xmm6
vpunpcklqdq %xmm2,%xmm0,%xmm7
vpunpckhqdq %xmm2,%xmm0,%xmm3
vpxor -128(%esi),%xmm1,%xmm4
vpxor -64(%esi),%xmm6,%xmm5
vpxor (%esi),%xmm7,%xmm6
vpxor 64(%esi),%xmm3,%xmm7
leal 16(%esi),%esi
vmovdqa (%ebx),%xmm0
vmovdqa 16(%ebx),%xmm1
vmovdqa 32(%ebx),%xmm2
vmovdqa 48(%ebx),%xmm3
vmovdqu %xmm4,-128(%edi)
vmovdqu %xmm5,-64(%edi)
vmovdqu %xmm6,(%edi)
vmovdqu %xmm7,64(%edi)
leal 16(%edi),%edi
vpaddd (%ebp),%xmm0,%xmm0
vpaddd 16(%ebp),%xmm1,%xmm1
vpaddd 32(%ebp),%xmm2,%xmm2
vpaddd 48(%ebp),%xmm3,%xmm3
vpunpckldq %xmm1,%xmm0,%xmm6
vpunpckldq %xmm3,%xmm2,%xmm7
vpunpckhdq %xmm1,%xmm0,%xmm0
vpunpckhdq %xmm3,%xmm2,%xmm2
vpunpcklqdq %xmm7,%xmm6,%xmm1
vpunpckhqdq %xmm7,%xmm6,%xmm6
vpunpcklqdq %xmm2,%xmm0,%xmm7
vpunpckhqdq %xmm2,%xmm0,%xmm3
vpxor -128(%esi),%xmm1,%xmm4
vpxor -64(%esi),%xmm6,%xmm5
vpxor (%esi),%xmm7,%xmm6
vpxor 64(%esi),%xmm3,%xmm7
leal 16(%esi),%esi
vmovdqa 64(%ebx),%xmm0
vmovdqa 80(%ebx),%xmm1
vmovdqa 96(%ebx),%xmm2
vmovdqa 112(%ebx),%xmm3
vmovdqu %xmm4,-128(%edi)
vmovdqu %xmm5,-64(%edi)
vmovdqu %xmm6,(%edi)
vmovdqu %xmm7,64(%edi)
leal 16(%edi),%edi
vpaddd 64(%ebp),%xmm0,%xmm0
vpaddd 80(%ebp),%xmm1,%xmm1
vpaddd 96(%ebp),%xmm2,%xmm2
vpaddd 112(%ebp),%xmm3,%xmm3
vpunpckldq %xmm1,%xmm0,%xmm6
vpunpckldq %xmm3,%xmm2,%xmm7
vpunpckhdq %xmm1,%xmm0,%xmm0
vpunpckhdq %xmm3,%xmm2,%xmm2
vpunpcklqdq %xmm7,%xmm6,%xmm1
vpunpckhqdq %xmm7,%xmm6,%xmm6
vpunpcklqdq %xmm2,%xmm0,%xmm7
vpunpckhqdq %xmm2,%xmm0,%xmm3
vpxor -128(%esi),%xmm1,%xmm4
vpxor -64(%esi),%xmm6,%xmm5
vpxor (%esi),%xmm7,%xmm6
vpxor 64(%esi),%xmm3,%xmm7
leal 208(%esi),%esi
vmovdqu %xmm4,-128(%edi)
vmovdqu %xmm5,-64(%edi)
vmovdqu %xmm6,(%edi)
vmovdqu %xmm7,64(%edi)
leal 208(%edi),%edi
subl $256,%ecx
jnc .L015outer_loop
addl $256,%ecx
jz .L017done
movl 520(%esp),%ebx
leal -128(%esi),%esi
movl 516(%esp),%edx
leal -128(%edi),%edi
vmovd 64(%ebp),%xmm2
vmovdqu (%ebx),%xmm3
vpaddd 96(%eax),%xmm2,%xmm2
vpand 112(%eax),%xmm3,%xmm3
vpor %xmm2,%xmm3,%xmm3
.L0141x:
vmovdqa 32(%eax),%xmm0
vmovdqu (%edx),%xmm1
vmovdqu 16(%edx),%xmm2
vmovdqa (%eax),%xmm6
vmovdqa 16(%eax),%xmm7
movl %ebp,48(%esp)
vmovdqa %xmm0,(%esp)
vmovdqa %xmm1,16(%esp)
vmovdqa %xmm2,32(%esp)
vmovdqa %xmm3,48(%esp)
movl $10,%edx
jmp .L018loop1x
.align 16
.L019outer1x:
vmovdqa 80(%eax),%xmm3
vmovdqa (%esp),%xmm0
vmovdqa 16(%esp),%xmm1
vmovdqa 32(%esp),%xmm2
vpaddd 48(%esp),%xmm3,%xmm3
movl $10,%edx
vmovdqa %xmm3,48(%esp)
jmp .L018loop1x
.align 16
.L018loop1x:
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
.byte 143,232,120,194,219,16
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
.byte 143,232,120,194,201,12
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
.byte 143,232,120,194,219,8
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
.byte 143,232,120,194,201,7
vpshufd $78,%xmm2,%xmm2
vpshufd $57,%xmm1,%xmm1
vpshufd $147,%xmm3,%xmm3
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
.byte 143,232,120,194,219,16
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
.byte 143,232,120,194,201,12
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
.byte 143,232,120,194,219,8
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
.byte 143,232,120,194,201,7
vpshufd $78,%xmm2,%xmm2
vpshufd $147,%xmm1,%xmm1
vpshufd $57,%xmm3,%xmm3
decl %edx
jnz .L018loop1x
vpaddd (%esp),%xmm0,%xmm0
vpaddd 16(%esp),%xmm1,%xmm1
vpaddd 32(%esp),%xmm2,%xmm2
vpaddd 48(%esp),%xmm3,%xmm3
cmpl $64,%ecx
jb .L020tail
vpxor (%esi),%xmm0,%xmm0
vpxor 16(%esi),%xmm1,%xmm1
vpxor 32(%esi),%xmm2,%xmm2
vpxor 48(%esi),%xmm3,%xmm3
leal 64(%esi),%esi
vmovdqu %xmm0,(%edi)
vmovdqu %xmm1,16(%edi)
vmovdqu %xmm2,32(%edi)
vmovdqu %xmm3,48(%edi)
leal 64(%edi),%edi
subl $64,%ecx
jnz .L019outer1x
jmp .L017done
.L020tail:
vmovdqa %xmm0,(%esp)
vmovdqa %xmm1,16(%esp)
vmovdqa %xmm2,32(%esp)
vmovdqa %xmm3,48(%esp)
xorl %eax,%eax
xorl %edx,%edx
xorl %ebp,%ebp
.L021tail_loop:
movb (%esp,%ebp,1),%al
movb (%esi,%ebp,1),%dl
leal 1(%ebp),%ebp
xorb %dl,%al
movb %al,-1(%edi,%ebp,1)
decl %ecx
jnz .L021tail_loop
.L017done:
vzeroupper
movl 512(%esp),%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size ChaCha20_xop,.-.L_ChaCha20_xop_begin
.comm OPENSSL_ia32cap_P,16,4
#else
.text
@ -914,6 +1394,8 @@ ChaCha20_ssse3:
pushl %esi
pushl %edi
.Lssse3_shortcut:
testl $2048,4(%ebp)
jnz .Lxop_shortcut
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
@ -1057,5 +1539,483 @@ ChaCha20_ssse3:
.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte 114,103,62,0
.globl ChaCha20_xop
.type ChaCha20_xop,@function
.align 16
ChaCha20_xop:
.L_ChaCha20_xop_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
.Lxop_shortcut:
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
movl 32(%esp),%edx
movl 36(%esp),%ebx
vzeroupper
movl %esp,%ebp
subl $524,%esp
andl $-64,%esp
movl %ebp,512(%esp)
leal .Lssse3_data-.Lpic_point(%eax),%eax
vmovdqu (%ebx),%xmm3
cmpl $256,%ecx
jb .L0141x
movl %edx,516(%esp)
movl %ebx,520(%esp)
subl $256,%ecx
leal 384(%esp),%ebp
vmovdqu (%edx),%xmm7
vpshufd $0,%xmm3,%xmm0
vpshufd $85,%xmm3,%xmm1
vpshufd $170,%xmm3,%xmm2
vpshufd $255,%xmm3,%xmm3
vpaddd 48(%eax),%xmm0,%xmm0
vpshufd $0,%xmm7,%xmm4
vpshufd $85,%xmm7,%xmm5
vpsubd 64(%eax),%xmm0,%xmm0
vpshufd $170,%xmm7,%xmm6
vpshufd $255,%xmm7,%xmm7
vmovdqa %xmm0,64(%ebp)
vmovdqa %xmm1,80(%ebp)
vmovdqa %xmm2,96(%ebp)
vmovdqa %xmm3,112(%ebp)
vmovdqu 16(%edx),%xmm3
vmovdqa %xmm4,-64(%ebp)
vmovdqa %xmm5,-48(%ebp)
vmovdqa %xmm6,-32(%ebp)
vmovdqa %xmm7,-16(%ebp)
vmovdqa 32(%eax),%xmm7
leal 128(%esp),%ebx
vpshufd $0,%xmm3,%xmm0
vpshufd $85,%xmm3,%xmm1
vpshufd $170,%xmm3,%xmm2
vpshufd $255,%xmm3,%xmm3
vpshufd $0,%xmm7,%xmm4
vpshufd $85,%xmm7,%xmm5
vpshufd $170,%xmm7,%xmm6
vpshufd $255,%xmm7,%xmm7
vmovdqa %xmm0,(%ebp)
vmovdqa %xmm1,16(%ebp)
vmovdqa %xmm2,32(%ebp)
vmovdqa %xmm3,48(%ebp)
vmovdqa %xmm4,-128(%ebp)
vmovdqa %xmm5,-112(%ebp)
vmovdqa %xmm6,-96(%ebp)
vmovdqa %xmm7,-80(%ebp)
leal 128(%esi),%esi
leal 128(%edi),%edi
jmp .L015outer_loop
.align 32
.L015outer_loop:
vmovdqa -112(%ebp),%xmm1
vmovdqa -96(%ebp),%xmm2
vmovdqa -80(%ebp),%xmm3
vmovdqa -48(%ebp),%xmm5
vmovdqa -32(%ebp),%xmm6
vmovdqa -16(%ebp),%xmm7
vmovdqa %xmm1,-112(%ebx)
vmovdqa %xmm2,-96(%ebx)
vmovdqa %xmm3,-80(%ebx)
vmovdqa %xmm5,-48(%ebx)
vmovdqa %xmm6,-32(%ebx)
vmovdqa %xmm7,-16(%ebx)
vmovdqa 32(%ebp),%xmm2
vmovdqa 48(%ebp),%xmm3
vmovdqa 64(%ebp),%xmm4
vmovdqa 80(%ebp),%xmm5
vmovdqa 96(%ebp),%xmm6
vmovdqa 112(%ebp),%xmm7
vpaddd 64(%eax),%xmm4,%xmm4
vmovdqa %xmm2,32(%ebx)
vmovdqa %xmm3,48(%ebx)
vmovdqa %xmm4,64(%ebx)
vmovdqa %xmm5,80(%ebx)
vmovdqa %xmm6,96(%ebx)
vmovdqa %xmm7,112(%ebx)
vmovdqa %xmm4,64(%ebp)
vmovdqa -128(%ebp),%xmm0
vmovdqa %xmm4,%xmm6
vmovdqa -64(%ebp),%xmm3
vmovdqa (%ebp),%xmm4
vmovdqa 16(%ebp),%xmm5
movl $10,%edx
nop
.align 32
.L016loop:
vpaddd %xmm3,%xmm0,%xmm0
vpxor %xmm0,%xmm6,%xmm6
.byte 143,232,120,194,246,16
vpaddd %xmm6,%xmm4,%xmm4
vpxor %xmm4,%xmm3,%xmm2
vmovdqa -112(%ebx),%xmm1
.byte 143,232,120,194,210,12
vmovdqa -48(%ebx),%xmm3
vpaddd %xmm2,%xmm0,%xmm0
vmovdqa 80(%ebx),%xmm7
vpxor %xmm0,%xmm6,%xmm6
vpaddd %xmm3,%xmm1,%xmm1
.byte 143,232,120,194,246,8
vmovdqa %xmm0,-128(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa %xmm6,64(%ebx)
vpxor %xmm4,%xmm2,%xmm2
vpxor %xmm1,%xmm7,%xmm7
.byte 143,232,120,194,210,7
vmovdqa %xmm4,(%ebx)
.byte 143,232,120,194,255,16
vmovdqa %xmm2,-64(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vmovdqa 32(%ebx),%xmm4
vpxor %xmm5,%xmm3,%xmm3
vmovdqa -96(%ebx),%xmm0
.byte 143,232,120,194,219,12
vmovdqa -32(%ebx),%xmm2
vpaddd %xmm3,%xmm1,%xmm1
vmovdqa 96(%ebx),%xmm6
vpxor %xmm1,%xmm7,%xmm7
vpaddd %xmm2,%xmm0,%xmm0
.byte 143,232,120,194,255,8
vmovdqa %xmm1,-112(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vmovdqa %xmm7,80(%ebx)
vpxor %xmm5,%xmm3,%xmm3
vpxor %xmm0,%xmm6,%xmm6
.byte 143,232,120,194,219,7
vmovdqa %xmm5,16(%ebx)
.byte 143,232,120,194,246,16
vmovdqa %xmm3,-48(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa 48(%ebx),%xmm5
vpxor %xmm4,%xmm2,%xmm2
vmovdqa -80(%ebx),%xmm1
.byte 143,232,120,194,210,12
vmovdqa -16(%ebx),%xmm3
vpaddd %xmm2,%xmm0,%xmm0
vmovdqa 112(%ebx),%xmm7
vpxor %xmm0,%xmm6,%xmm6
vpaddd %xmm3,%xmm1,%xmm1
.byte 143,232,120,194,246,8
vmovdqa %xmm0,-96(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa %xmm6,96(%ebx)
vpxor %xmm4,%xmm2,%xmm2
vpxor %xmm1,%xmm7,%xmm7
.byte 143,232,120,194,210,7
.byte 143,232,120,194,255,16
vmovdqa %xmm2,-32(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vpxor %xmm5,%xmm3,%xmm3
vmovdqa -128(%ebx),%xmm0
.byte 143,232,120,194,219,12
vmovdqa -48(%ebx),%xmm2
vpaddd %xmm3,%xmm1,%xmm1
vpxor %xmm1,%xmm7,%xmm7
vpaddd %xmm2,%xmm0,%xmm0
.byte 143,232,120,194,255,8
vmovdqa %xmm1,-80(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vpxor %xmm5,%xmm3,%xmm3
vpxor %xmm0,%xmm7,%xmm6
.byte 143,232,120,194,219,7
.byte 143,232,120,194,246,16
vmovdqa %xmm3,-16(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vpxor %xmm4,%xmm2,%xmm2
vmovdqa -112(%ebx),%xmm1
.byte 143,232,120,194,210,12
vmovdqa -32(%ebx),%xmm3
vpaddd %xmm2,%xmm0,%xmm0
vmovdqa 64(%ebx),%xmm7
vpxor %xmm0,%xmm6,%xmm6
vpaddd %xmm3,%xmm1,%xmm1
.byte 143,232,120,194,246,8
vmovdqa %xmm0,-128(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa %xmm6,112(%ebx)
vpxor %xmm4,%xmm2,%xmm2
vpxor %xmm1,%xmm7,%xmm7
.byte 143,232,120,194,210,7
vmovdqa %xmm4,32(%ebx)
.byte 143,232,120,194,255,16
vmovdqa %xmm2,-48(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vmovdqa (%ebx),%xmm4
vpxor %xmm5,%xmm3,%xmm3
vmovdqa -96(%ebx),%xmm0
.byte 143,232,120,194,219,12
vmovdqa -16(%ebx),%xmm2
vpaddd %xmm3,%xmm1,%xmm1
vmovdqa 80(%ebx),%xmm6
vpxor %xmm1,%xmm7,%xmm7
vpaddd %xmm2,%xmm0,%xmm0
.byte 143,232,120,194,255,8
vmovdqa %xmm1,-112(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vmovdqa %xmm7,64(%ebx)
vpxor %xmm5,%xmm3,%xmm3
vpxor %xmm0,%xmm6,%xmm6
.byte 143,232,120,194,219,7
vmovdqa %xmm5,48(%ebx)
.byte 143,232,120,194,246,16
vmovdqa %xmm3,-32(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa 16(%ebx),%xmm5
vpxor %xmm4,%xmm2,%xmm2
vmovdqa -80(%ebx),%xmm1
.byte 143,232,120,194,210,12
vmovdqa -64(%ebx),%xmm3
vpaddd %xmm2,%xmm0,%xmm0
vmovdqa 96(%ebx),%xmm7
vpxor %xmm0,%xmm6,%xmm6
vpaddd %xmm3,%xmm1,%xmm1
.byte 143,232,120,194,246,8
vmovdqa %xmm0,-96(%ebx)
vpaddd %xmm6,%xmm4,%xmm4
vmovdqa %xmm6,80(%ebx)
vpxor %xmm4,%xmm2,%xmm2
vpxor %xmm1,%xmm7,%xmm7
.byte 143,232,120,194,210,7
.byte 143,232,120,194,255,16
vmovdqa %xmm2,-16(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vpxor %xmm5,%xmm3,%xmm3
vmovdqa -128(%ebx),%xmm0
.byte 143,232,120,194,219,12
vpaddd %xmm3,%xmm1,%xmm1
vmovdqa 64(%ebx),%xmm6
vpxor %xmm1,%xmm7,%xmm7
.byte 143,232,120,194,255,8
vmovdqa %xmm1,-80(%ebx)
vpaddd %xmm7,%xmm5,%xmm5
vmovdqa %xmm7,96(%ebx)
vpxor %xmm5,%xmm3,%xmm3
.byte 143,232,120,194,219,7
decl %edx
jnz .L016loop
vmovdqa %xmm3,-64(%ebx)
vmovdqa %xmm4,(%ebx)
vmovdqa %xmm5,16(%ebx)
vmovdqa %xmm6,64(%ebx)
vmovdqa %xmm7,96(%ebx)
vmovdqa -112(%ebx),%xmm1
vmovdqa -96(%ebx),%xmm2
vmovdqa -80(%ebx),%xmm3
vpaddd -128(%ebp),%xmm0,%xmm0
vpaddd -112(%ebp),%xmm1,%xmm1
vpaddd -96(%ebp),%xmm2,%xmm2
vpaddd -80(%ebp),%xmm3,%xmm3
vpunpckldq %xmm1,%xmm0,%xmm6
vpunpckldq %xmm3,%xmm2,%xmm7
vpunpckhdq %xmm1,%xmm0,%xmm0
vpunpckhdq %xmm3,%xmm2,%xmm2
vpunpcklqdq %xmm7,%xmm6,%xmm1
vpunpckhqdq %xmm7,%xmm6,%xmm6
vpunpcklqdq %xmm2,%xmm0,%xmm7
vpunpckhqdq %xmm2,%xmm0,%xmm3
vpxor -128(%esi),%xmm1,%xmm4
vpxor -64(%esi),%xmm6,%xmm5
vpxor (%esi),%xmm7,%xmm6
vpxor 64(%esi),%xmm3,%xmm7
leal 16(%esi),%esi
vmovdqa -64(%ebx),%xmm0
vmovdqa -48(%ebx),%xmm1
vmovdqa -32(%ebx),%xmm2
vmovdqa -16(%ebx),%xmm3
vmovdqu %xmm4,-128(%edi)
vmovdqu %xmm5,-64(%edi)
vmovdqu %xmm6,(%edi)
vmovdqu %xmm7,64(%edi)
leal 16(%edi),%edi
vpaddd -64(%ebp),%xmm0,%xmm0
vpaddd -48(%ebp),%xmm1,%xmm1
vpaddd -32(%ebp),%xmm2,%xmm2
vpaddd -16(%ebp),%xmm3,%xmm3
vpunpckldq %xmm1,%xmm0,%xmm6
vpunpckldq %xmm3,%xmm2,%xmm7
vpunpckhdq %xmm1,%xmm0,%xmm0
vpunpckhdq %xmm3,%xmm2,%xmm2
vpunpcklqdq %xmm7,%xmm6,%xmm1
vpunpckhqdq %xmm7,%xmm6,%xmm6
vpunpcklqdq %xmm2,%xmm0,%xmm7
vpunpckhqdq %xmm2,%xmm0,%xmm3
vpxor -128(%esi),%xmm1,%xmm4
vpxor -64(%esi),%xmm6,%xmm5
vpxor (%esi),%xmm7,%xmm6
vpxor 64(%esi),%xmm3,%xmm7
leal 16(%esi),%esi
vmovdqa (%ebx),%xmm0
vmovdqa 16(%ebx),%xmm1
vmovdqa 32(%ebx),%xmm2
vmovdqa 48(%ebx),%xmm3
vmovdqu %xmm4,-128(%edi)
vmovdqu %xmm5,-64(%edi)
vmovdqu %xmm6,(%edi)
vmovdqu %xmm7,64(%edi)
leal 16(%edi),%edi
vpaddd (%ebp),%xmm0,%xmm0
vpaddd 16(%ebp),%xmm1,%xmm1
vpaddd 32(%ebp),%xmm2,%xmm2
vpaddd 48(%ebp),%xmm3,%xmm3
vpunpckldq %xmm1,%xmm0,%xmm6
vpunpckldq %xmm3,%xmm2,%xmm7
vpunpckhdq %xmm1,%xmm0,%xmm0
vpunpckhdq %xmm3,%xmm2,%xmm2
vpunpcklqdq %xmm7,%xmm6,%xmm1
vpunpckhqdq %xmm7,%xmm6,%xmm6
vpunpcklqdq %xmm2,%xmm0,%xmm7
vpunpckhqdq %xmm2,%xmm0,%xmm3
vpxor -128(%esi),%xmm1,%xmm4
vpxor -64(%esi),%xmm6,%xmm5
vpxor (%esi),%xmm7,%xmm6
vpxor 64(%esi),%xmm3,%xmm7
leal 16(%esi),%esi
vmovdqa 64(%ebx),%xmm0
vmovdqa 80(%ebx),%xmm1
vmovdqa 96(%ebx),%xmm2
vmovdqa 112(%ebx),%xmm3
vmovdqu %xmm4,-128(%edi)
vmovdqu %xmm5,-64(%edi)
vmovdqu %xmm6,(%edi)
vmovdqu %xmm7,64(%edi)
leal 16(%edi),%edi
vpaddd 64(%ebp),%xmm0,%xmm0
vpaddd 80(%ebp),%xmm1,%xmm1
vpaddd 96(%ebp),%xmm2,%xmm2
vpaddd 112(%ebp),%xmm3,%xmm3
vpunpckldq %xmm1,%xmm0,%xmm6
vpunpckldq %xmm3,%xmm2,%xmm7
vpunpckhdq %xmm1,%xmm0,%xmm0
vpunpckhdq %xmm3,%xmm2,%xmm2
vpunpcklqdq %xmm7,%xmm6,%xmm1
vpunpckhqdq %xmm7,%xmm6,%xmm6
vpunpcklqdq %xmm2,%xmm0,%xmm7
vpunpckhqdq %xmm2,%xmm0,%xmm3
vpxor -128(%esi),%xmm1,%xmm4
vpxor -64(%esi),%xmm6,%xmm5
vpxor (%esi),%xmm7,%xmm6
vpxor 64(%esi),%xmm3,%xmm7
leal 208(%esi),%esi
vmovdqu %xmm4,-128(%edi)
vmovdqu %xmm5,-64(%edi)
vmovdqu %xmm6,(%edi)
vmovdqu %xmm7,64(%edi)
leal 208(%edi),%edi
subl $256,%ecx
jnc .L015outer_loop
addl $256,%ecx
jz .L017done
movl 520(%esp),%ebx
leal -128(%esi),%esi
movl 516(%esp),%edx
leal -128(%edi),%edi
vmovd 64(%ebp),%xmm2
vmovdqu (%ebx),%xmm3
vpaddd 96(%eax),%xmm2,%xmm2
vpand 112(%eax),%xmm3,%xmm3
vpor %xmm2,%xmm3,%xmm3
.L0141x:
vmovdqa 32(%eax),%xmm0
vmovdqu (%edx),%xmm1
vmovdqu 16(%edx),%xmm2
vmovdqa (%eax),%xmm6
vmovdqa 16(%eax),%xmm7
movl %ebp,48(%esp)
vmovdqa %xmm0,(%esp)
vmovdqa %xmm1,16(%esp)
vmovdqa %xmm2,32(%esp)
vmovdqa %xmm3,48(%esp)
movl $10,%edx
jmp .L018loop1x
.align 16
.L019outer1x:
vmovdqa 80(%eax),%xmm3
vmovdqa (%esp),%xmm0
vmovdqa 16(%esp),%xmm1
vmovdqa 32(%esp),%xmm2
vpaddd 48(%esp),%xmm3,%xmm3
movl $10,%edx
vmovdqa %xmm3,48(%esp)
jmp .L018loop1x
.align 16
.L018loop1x:
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
.byte 143,232,120,194,219,16
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
.byte 143,232,120,194,201,12
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
.byte 143,232,120,194,219,8
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
.byte 143,232,120,194,201,7
vpshufd $78,%xmm2,%xmm2
vpshufd $57,%xmm1,%xmm1
vpshufd $147,%xmm3,%xmm3
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
.byte 143,232,120,194,219,16
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
.byte 143,232,120,194,201,12
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
.byte 143,232,120,194,219,8
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
.byte 143,232,120,194,201,7
vpshufd $78,%xmm2,%xmm2
vpshufd $147,%xmm1,%xmm1
vpshufd $57,%xmm3,%xmm3
decl %edx
jnz .L018loop1x
vpaddd (%esp),%xmm0,%xmm0
vpaddd 16(%esp),%xmm1,%xmm1
vpaddd 32(%esp),%xmm2,%xmm2
vpaddd 48(%esp),%xmm3,%xmm3
cmpl $64,%ecx
jb .L020tail
vpxor (%esi),%xmm0,%xmm0
vpxor 16(%esi),%xmm1,%xmm1
vpxor 32(%esi),%xmm2,%xmm2
vpxor 48(%esi),%xmm3,%xmm3
leal 64(%esi),%esi
vmovdqu %xmm0,(%edi)
vmovdqu %xmm1,16(%edi)
vmovdqu %xmm2,32(%edi)
vmovdqu %xmm3,48(%edi)
leal 64(%edi),%edi
subl $64,%ecx
jnz .L019outer1x
jmp .L017done
.L020tail:
vmovdqa %xmm0,(%esp)
vmovdqa %xmm1,16(%esp)
vmovdqa %xmm2,32(%esp)
vmovdqa %xmm3,48(%esp)
xorl %eax,%eax
xorl %edx,%edx
xorl %ebp,%ebp
.L021tail_loop:
movb (%esp,%ebp,1),%al
movb (%esi,%ebp,1),%dl
leal 1(%ebp),%ebp
xorb %dl,%al
movb %al,-1(%edi,%ebp,1)
decl %ecx
jnz .L021tail_loop
.L017done:
vzeroupper
movl 512(%esp),%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size ChaCha20_xop,.-.L_ChaCha20_xop_begin
.comm OPENSSL_ia32cap_P,16,4
#endif