Build OpenSSL assembly sources for aarch64. Tested with ThunderX by andrew.

2016-10-26 20:02:22 +00:00 · 2016-10-26 20:02:22 +00:00 · 7518a9bd2b
commit 7518a9bd2b
parent 726f4773ec
11 changed files with 4416 additions and 12 deletions
--- a/crypto/openssl/crypto/aes/asm/aesv8-armx.pl
+++ b/crypto/openssl/crypto/aes/asm/aesv8-armx.pl
@ -42,7 +42,7 @@ $code=<<___;
 #if __ARM_MAX_ARCH__>=7
 .text
 ___
-$code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
+# $code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
 $code.=".arch	armv7-a\n.fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
 		#^^^^^^ this is done to simplify adoption by not depending
 		#	on latest binutils.
--- a/crypto/openssl/crypto/arm64cpuid.S
+++ b/crypto/openssl/crypto/arm64cpuid.S
@ -1,7 +1,6 @@
 #include "arm_arch.h"

 .text
-.arch	armv8-a+crypto

 .align	5
 .global	_armv7_neon_probe
--- a/crypto/openssl/crypto/modes/asm/ghashv8-armx.pl
+++ b/crypto/openssl/crypto/modes/asm/ghashv8-armx.pl
@ -49,7 +49,7 @@ $code=<<___;

 .text
 ___
-$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
+# $code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
 $code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);

 ################################################################################
--- a/secure/lib/libcrypto/Makefile
+++ b/secure/lib/libcrypto/Makefile
@ -22,7 +22,10 @@ MAN+=	config.5 des_modes.7
 # base sources
 SRCS=	cpt_err.c cryptlib.c cversion.c ex_data.c mem.c mem_dbg.c o_dir.c \
 	o_fips.c o_init.c o_str.c o_time.c uid.c
-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+SRCS+=	arm64cpuid.S armcap.c mem_clr.c
+CFLAGS.arm64cpuid.S=	-march=armv8-a+crypto
+.elif defined(ASM_amd64)
 SRCS+=	x86_64cpuid.S
 .elif defined(ASM_arm)
 SRCS+=	armcap.c armv4cpuid.S
@ -35,7 +38,10 @@ INCS+=	crypto.h ebcdic.h opensslv.h ossl_typ.h symhacks.h ../e_os2.h

 # aes
 SRCS+=	aes_cfb.c aes_ctr.c aes_ecb.c aes_ige.c aes_misc.c aes_ofb.c aes_wrap.c
-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+SRCS+=	aes_cbc.c aes_core.c aesv8-armx.S
+CFLAGS.aesv8-armx.S=	-march=armv8-a+crypto
+.elif defined(ASM_amd64)
 SRCS+=	aes-x86_64.S aesni-mb-x86_64.S aesni-sha1-x86_64.S \
 	aesni-sha256-x86_64.S aesni-x86_64.S bsaes-x86_64.S vpaes-x86_64.S
 .elif defined(ASM_arm)
@ -238,7 +244,10 @@ INCS+=	mdc2.h
 # modes
 SRCS+=	cbc128.c ccm128.c cfb128.c ctr128.c cts128.c gcm128.c ofb128.c \
 	wrap128.c xts128.c
-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+SRCS+=	ghashv8-armx.S
+CFLAGS.ghashv8-armx.S=	-march=armv8-a+crypto
+.elif defined(ASM_amd64)
 SRCS+=	aesni-gcm-x86_64.S ghash-x86_64.S
 .elif defined(ASM_arm)
 SRCS+=	ghash-armv4.S ghashv8-armx.S
@ -324,7 +333,9 @@ INCS+=	seed.h

 # sha
 SRCS+=	sha1_one.c sha1dgst.c sha256.c sha512.c sha_dgst.c sha_one.c
-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+SRCS+=	sha1-armv8.S sha256-armv8.S sha512-armv8.S
+.elif defined(ASM_amd64)
 SRCS+=	sha1-mb-x86_64.S sha1-x86_64.S sha256-mb-x86_64.S sha256-x86_64.S \
 	sha512-x86_64.S
 .elif defined(ASM_arm)
--- a/secure/lib/libcrypto/Makefile.asm
+++ b/secure/lib/libcrypto/Makefile.asm
@ -6,7 +6,44 @@

 .include "Makefile.inc"

-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+
+.PATH:	${LCRYPTO_SRC}/crypto \
+	${LCRYPTO_SRC}/crypto/aes/asm \
+	${LCRYPTO_SRC}/crypto/modes/asm \
+	${LCRYPTO_SRC}/crypto/sha/asm
+
+PERLPATH=	-I${LCRYPTO_SRC}/crypto/perlasm
+
+# aes
+SRCS=	aesv8-armx.pl
+
+# modes
+SRCS+=	ghashv8-armx.pl
+
+# sha
+SRCS+=	sha1-armv8.pl sha512-armv8.pl
+
+ASM=	${SRCS:R:S/$/.S/} sha256-armv8.S
+
+all:	${ASM}
+
+CLEANFILES=	${ASM} ${SRCS:R:S/$/.s/} sha256-armv8.s
+.SUFFIXES:	.pl
+
+sha256-armv8.S:	sha512-armv8.pl
+	env CC=cc perl ${.ALLSRC} 64 ${.TARGET:R:S/$/.s/}
+	( echo '/* $$'FreeBSD'$$ */' ;\
+	echo '/* Do not modify. This file is auto-generated from ${.ALLSRC:T:R:S/$/.pl/}. */' ;\
+	cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}
+
+.pl.S:
+	env CC=cc perl ${.IMPSRC} 64 ${.TARGET:R:S/$/.s/}
+	( echo '/* $$'FreeBSD'$$ */' ;\
+	echo '/* Do not modify. This file is auto-generated from ${.IMPSRC:T:R:S/$/.pl/}. */' ;\
+	cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}
+
+.elif defined(ASM_amd64)

 .PATH:	${LCRYPTO_SRC}/crypto \
 	${LCRYPTO_SRC}/crypto/aes/asm \
--- a/secure/lib/libcrypto/Makefile.inc
+++ b/secure/lib/libcrypto/Makefile.inc
@ -21,7 +21,9 @@ CFLAGS+=-DL_ENDIAN
 CFLAGS+=-DB_ENDIAN
 .endif

-.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
+.if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "arm"
+ASM_${MACHINE_CPUARCH}=
+.elif ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
 _ASM_AVX!=	{ \
 		    echo vzeroall | \
 		    ${CC} -x assembler -o /dev/null -c - 2> /dev/null; \
@ -29,11 +31,11 @@ _ASM_AVX!=	{ \
 .if ${_ASM_AVX} == yes
 ASM_${MACHINE_CPUARCH}=
 .endif
-.elif ${MACHINE_CPUARCH} == "arm"
-ASM_arm=
 .endif

-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+CFLAGS+=-DSHA1_ASM -DSHA256_ASM -DSHA512_ASM
+.elif defined(ASM_amd64)
 CFLAGS+=-DOPENSSL_IA32_SSE2
 CFLAGS+=-DAES_ASM -DBSAES_ASM -DVPAES_ASM
 CFLAGS+=-DECP_NISTZ256_ASM
--- a/secure/lib/libcrypto/aarch64/aesv8-armx.S
+++ b/secure/lib/libcrypto/aarch64/aesv8-armx.S
@ -0,0 +1,748 @@
+/* $FreeBSD$ */
+/* Do not modify. This file is auto-generated from aesv8-armx.pl. */
+#include "arm_arch.h"
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.align	5
+rcon:
+.long	0x01,0x01,0x01,0x01
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
+.long	0x1b,0x1b,0x1b,0x1b
+
+.globl	aes_v8_set_encrypt_key
+.type	aes_v8_set_encrypt_key,%function
+.align	5
+aes_v8_set_encrypt_key:
+.Lenc_key:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	mov	x3,#-1
+	cmp	x0,#0
+	b.eq	.Lenc_key_abort
+	cmp	x2,#0
+	b.eq	.Lenc_key_abort
+	mov	x3,#-2
+	cmp	w1,#128
+	b.lt	.Lenc_key_abort
+	cmp	w1,#256
+	b.gt	.Lenc_key_abort
+	tst	w1,#0x3f
+	b.ne	.Lenc_key_abort
+
+	adr	x3,rcon
+	cmp	w1,#192
+
+	eor	v0.16b,v0.16b,v0.16b
+	ld1	{v3.16b},[x0],#16
+	mov	w1,#8		// reuse w1
+	ld1	{v1.4s,v2.4s},[x3],#32
+
+	b.lt	.Loop128
+	b.eq	.L192
+	b	.L256
+
+.align	4
+.Loop128:
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	 eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+	b.ne	.Loop128
+
+	ld1	{v1.4s},[x3]
+
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	 eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	 eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	eor	v3.16b,v3.16b,v6.16b
+	st1	{v3.4s},[x2]
+	add	x2,x2,#0x50
+
+	mov	w12,#10
+	b	.Ldone
+
+.align	4
+.L192:
+	ld1	{v4.8b},[x0],#8
+	movi	v6.16b,#8			// borrow v6.16b
+	st1	{v3.4s},[x2],#16
+	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
+
+.Loop192:
+	tbl	v6.16b,{v4.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v4.8b},[x2],#8
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+
+	dup	v5.4s,v3.s[3]
+	eor	v5.16b,v5.16b,v4.16b
+	 eor	v6.16b,v6.16b,v1.16b
+	ext	v4.16b,v0.16b,v4.16b,#12
+	shl	v1.16b,v1.16b,#1
+	eor	v4.16b,v4.16b,v5.16b
+	eor	v3.16b,v3.16b,v6.16b
+	eor	v4.16b,v4.16b,v6.16b
+	st1	{v3.4s},[x2],#16
+	b.ne	.Loop192
+
+	mov	w12,#12
+	add	x2,x2,#0x20
+	b	.Ldone
+
+.align	4
+.L256:
+	ld1	{v4.16b},[x0]
+	mov	w1,#7
+	mov	w12,#14
+	st1	{v3.4s},[x2],#16
+
+.Loop256:
+	tbl	v6.16b,{v4.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v4.4s},[x2],#16
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	 eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+	st1	{v3.4s},[x2],#16
+	b.eq	.Ldone
+
+	dup	v6.4s,v3.s[3]		// just splat
+	ext	v5.16b,v0.16b,v4.16b,#12
+	aese	v6.16b,v0.16b
+
+	eor	v4.16b,v4.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v4.16b,v4.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v4.16b,v4.16b,v5.16b
+
+	eor	v4.16b,v4.16b,v6.16b
+	b	.Loop256
+
+.Ldone:
+	str	w12,[x2]
+	mov	x3,#0
+
+.Lenc_key_abort:
+	mov	x0,x3			// return value
+	ldr	x29,[sp],#16
+	ret
+.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
+
+.globl	aes_v8_set_decrypt_key
+.type	aes_v8_set_decrypt_key,%function
+.align	5
+aes_v8_set_decrypt_key:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	bl	.Lenc_key
+
+	cmp	x0,#0
+	b.ne	.Ldec_key_abort
+
+	sub	x2,x2,#240		// restore original x2
+	mov	x4,#-16
+	add	x0,x2,x12,lsl#4	// end of key schedule
+
+	ld1	{v0.4s},[x2]
+	ld1	{v1.4s},[x0]
+	st1	{v0.4s},[x0],x4
+	st1	{v1.4s},[x2],#16
+
+.Loop_imc:
+	ld1	{v0.4s},[x2]
+	ld1	{v1.4s},[x0]
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	st1	{v0.4s},[x0],x4
+	st1	{v1.4s},[x2],#16
+	cmp	x0,x2
+	b.hi	.Loop_imc
+
+	ld1	{v0.4s},[x2]
+	aesimc	v0.16b,v0.16b
+	st1	{v0.4s},[x0]
+
+	eor	x0,x0,x0		// return value
+.Ldec_key_abort:
+	ldp	x29,x30,[sp],#16
+	ret
+.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
+.globl	aes_v8_encrypt
+.type	aes_v8_encrypt,%function
+.align	5
+aes_v8_encrypt:
+	ldr	w3,[x2,#240]
+	ld1	{v0.4s},[x2],#16
+	ld1	{v2.16b},[x0]
+	sub	w3,w3,#2
+	ld1	{v1.4s},[x2],#16
+
+.Loop_enc:
+	aese	v2.16b,v0.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2],#16
+	subs	w3,w3,#2
+	aese	v2.16b,v1.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v1.4s},[x2],#16
+	b.gt	.Loop_enc
+
+	aese	v2.16b,v0.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2]
+	aese	v2.16b,v1.16b
+	eor	v2.16b,v2.16b,v0.16b
+
+	st1	{v2.16b},[x1]
+	ret
+.size	aes_v8_encrypt,.-aes_v8_encrypt
+.globl	aes_v8_decrypt
+.type	aes_v8_decrypt,%function
+.align	5
+aes_v8_decrypt:
+	ldr	w3,[x2,#240]
+	ld1	{v0.4s},[x2],#16
+	ld1	{v2.16b},[x0]
+	sub	w3,w3,#2
+	ld1	{v1.4s},[x2],#16
+
+.Loop_dec:
+	aesd	v2.16b,v0.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2],#16
+	subs	w3,w3,#2
+	aesd	v2.16b,v1.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v1.4s},[x2],#16
+	b.gt	.Loop_dec
+
+	aesd	v2.16b,v0.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2]
+	aesd	v2.16b,v1.16b
+	eor	v2.16b,v2.16b,v0.16b
+
+	st1	{v2.16b},[x1]
+	ret
+.size	aes_v8_decrypt,.-aes_v8_decrypt
+.globl	aes_v8_cbc_encrypt
+.type	aes_v8_cbc_encrypt,%function
+.align	5
+aes_v8_cbc_encrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	subs	x2,x2,#16
+	mov	x8,#16
+	b.lo	.Lcbc_abort
+	csel	x8,xzr,x8,eq
+
+	cmp	w5,#0			// en- or decrypting?
+	ldr	w5,[x3,#240]
+	and	x2,x2,#-16
+	ld1	{v6.16b},[x4]
+	ld1	{v0.16b},[x0],x8
+
+	ld1	{v16.4s-v17.4s},[x3]		// load key schedule...
+	sub	w5,w5,#6
+	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
+	sub	w5,w5,#2
+	ld1	{v18.4s-v19.4s},[x7],#32
+	ld1	{v20.4s-v21.4s},[x7],#32
+	ld1	{v22.4s-v23.4s},[x7],#32
+	ld1	{v7.4s},[x7]
+
+	add	x7,x3,#32
+	mov	w6,w5
+	b.eq	.Lcbc_dec
+
+	cmp	w5,#2
+	eor	v0.16b,v0.16b,v6.16b
+	eor	v5.16b,v16.16b,v7.16b
+	b.eq	.Lcbc_enc128
+
+	ld1	{v2.4s-v3.4s},[x7]
+	add	x7,x3,#16
+	add	x6,x3,#16*4
+	add	x12,x3,#16*5
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	add	x14,x3,#16*6
+	add	x3,x3,#16*7
+	b	.Lenter_cbc_enc
+
+.align	4
+.Loop_cbc_enc:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	 st1	{v6.16b},[x1],#16
+.Lenter_cbc_enc:
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v2.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.4s},[x6]
+	cmp	w5,#4
+	aese	v0.16b,v3.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x12]
+	b.eq	.Lcbc_enc192
+
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.4s},[x14]
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x3]
+	nop
+
+.Lcbc_enc192:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	 subs	x2,x2,#16
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	 csel	x8,xzr,x8,eq
+	aese	v0.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	 ld1	{v16.16b},[x0],x8
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	 eor	v16.16b,v16.16b,v5.16b
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	 ld1 {v17.4s},[x7]		// re-pre-load rndkey[1]
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v23.16b
+	eor	v6.16b,v0.16b,v7.16b
+	b.hs	.Loop_cbc_enc
+
+	st1	{v6.16b},[x1],#16
+	b	.Lcbc_done
+
+.align	5
+.Lcbc_enc128:
+	ld1	{v2.4s-v3.4s},[x7]
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	b	.Lenter_cbc_enc128
+.Loop_cbc_enc128:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	 st1	{v6.16b},[x1],#16
+.Lenter_cbc_enc128:
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	 subs	x2,x2,#16
+	aese	v0.16b,v2.16b
+	aesmc	v0.16b,v0.16b
+	 csel	x8,xzr,x8,eq
+	aese	v0.16b,v3.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	 ld1	{v16.16b},[x0],x8
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	 eor	v16.16b,v16.16b,v5.16b
+	aese	v0.16b,v23.16b
+	eor	v6.16b,v0.16b,v7.16b
+	b.hs	.Loop_cbc_enc128
+
+	st1	{v6.16b},[x1],#16
+	b	.Lcbc_done
+.align	5
+.Lcbc_dec:
+	ld1	{v18.16b},[x0],#16
+	subs	x2,x2,#32		// bias
+	add	w6,w5,#2
+	orr	v3.16b,v0.16b,v0.16b
+	orr	v1.16b,v0.16b,v0.16b
+	orr	v19.16b,v18.16b,v18.16b
+	b.lo	.Lcbc_dec_tail
+
+	orr	v1.16b,v18.16b,v18.16b
+	ld1	{v18.16b},[x0],#16
+	orr	v2.16b,v0.16b,v0.16b
+	orr	v3.16b,v1.16b,v1.16b
+	orr	v19.16b,v18.16b,v18.16b
+
+.Loop3x_cbc_dec:
+	aesd	v0.16b,v16.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aesd	v0.16b,v17.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	.Loop3x_cbc_dec
+
+	aesd	v0.16b,v16.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	 eor	v4.16b,v6.16b,v7.16b
+	 subs	x2,x2,#0x30
+	 eor	v5.16b,v2.16b,v7.16b
+	 csel	x6,x2,x6,lo			// x6, w6, is zero at this point
+	aesd	v0.16b,v17.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	 eor	v17.16b,v3.16b,v7.16b
+	 add	x0,x0,x6		// x0 is adjusted in such way that
+					// at exit from the loop v1.16b-v18.16b
+					// are loaded with last "words"
+	 orr	v6.16b,v19.16b,v19.16b
+	 mov	x7,x3
+	aesd	v0.16b,v20.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v20.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v20.16b
+	aesimc	v18.16b,v18.16b
+	 ld1	{v2.16b},[x0],#16
+	aesd	v0.16b,v21.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v21.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v21.16b
+	aesimc	v18.16b,v18.16b
+	 ld1	{v3.16b},[x0],#16
+	aesd	v0.16b,v22.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v22.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v22.16b
+	aesimc	v18.16b,v18.16b
+	 ld1	{v19.16b},[x0],#16
+	aesd	v0.16b,v23.16b
+	aesd	v1.16b,v23.16b
+	aesd	v18.16b,v23.16b
+	 ld1 {v16.4s},[x7],#16	// re-pre-load rndkey[0]
+	 add	w6,w5,#2
+	eor	v4.16b,v4.16b,v0.16b
+	eor	v5.16b,v5.16b,v1.16b
+	eor	v18.16b,v18.16b,v17.16b
+	 ld1 {v17.4s},[x7],#16	// re-pre-load rndkey[1]
+	st1	{v4.16b},[x1],#16
+	 orr	v0.16b,v2.16b,v2.16b
+	st1	{v5.16b},[x1],#16
+	 orr	v1.16b,v3.16b,v3.16b
+	st1	{v18.16b},[x1],#16
+	 orr	v18.16b,v19.16b,v19.16b
+	b.hs	.Loop3x_cbc_dec
+
+	cmn	x2,#0x30
+	b.eq	.Lcbc_done
+	nop
+
+.Lcbc_dec_tail:
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	.Lcbc_dec_tail
+
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	aesd	v1.16b,v20.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v20.16b
+	aesimc	v18.16b,v18.16b
+	 cmn	x2,#0x20
+	aesd	v1.16b,v21.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v21.16b
+	aesimc	v18.16b,v18.16b
+	 eor	v5.16b,v6.16b,v7.16b
+	aesd	v1.16b,v22.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v22.16b
+	aesimc	v18.16b,v18.16b
+	 eor	v17.16b,v3.16b,v7.16b
+	aesd	v1.16b,v23.16b
+	aesd	v18.16b,v23.16b
+	b.eq	.Lcbc_dec_one
+	eor	v5.16b,v5.16b,v1.16b
+	eor	v17.16b,v17.16b,v18.16b
+	 orr	v6.16b,v19.16b,v19.16b
+	st1	{v5.16b},[x1],#16
+	st1	{v17.16b},[x1],#16
+	b	.Lcbc_done
+
+.Lcbc_dec_one:
+	eor	v5.16b,v5.16b,v18.16b
+	 orr	v6.16b,v19.16b,v19.16b
+	st1	{v5.16b},[x1],#16
+
+.Lcbc_done:
+	st1	{v6.16b},[x4]
+.Lcbc_abort:
+	ldr	x29,[sp],#16
+	ret
+.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
+.globl	aes_v8_ctr32_encrypt_blocks
+.type	aes_v8_ctr32_encrypt_blocks,%function
+.align	5
+aes_v8_ctr32_encrypt_blocks:
+	stp		x29,x30,[sp,#-16]!
+	add		x29,sp,#0
+	ldr		w5,[x3,#240]
+
+	ldr		w8, [x4, #12]
+	ld1		{v0.4s},[x4]
+
+	ld1		{v16.4s-v17.4s},[x3]		// load key schedule...
+	sub		w5,w5,#4
+	mov		x12,#16
+	cmp		x2,#2
+	add		x7,x3,x5,lsl#4	// pointer to last 5 round keys
+	sub		w5,w5,#2
+	ld1		{v20.4s-v21.4s},[x7],#32
+	ld1		{v22.4s-v23.4s},[x7],#32
+	ld1		{v7.4s},[x7]
+	add		x7,x3,#32
+	mov		w6,w5
+	csel	x12,xzr,x12,lo
+#ifndef __ARMEB__
+	rev		w8, w8
+#endif
+	orr		v1.16b,v0.16b,v0.16b
+	add		w10, w8, #1
+	orr		v18.16b,v0.16b,v0.16b
+	add		w8, w8, #2
+	orr		v6.16b,v0.16b,v0.16b
+	rev		w10, w10
+	mov		v1.s[3],w10
+	b.ls		.Lctr32_tail
+	rev		w12, w8
+	sub		x2,x2,#3		// bias
+	mov		v18.s[3],w12
+	b		.Loop3x_ctr32
+
+.align	4
+.Loop3x_ctr32:
+	aese		v0.16b,v16.16b
+	aesmc		v0.16b,v0.16b
+	aese		v1.16b,v16.16b
+	aesmc		v1.16b,v1.16b
+	aese		v18.16b,v16.16b
+	aesmc		v18.16b,v18.16b
+	ld1		{v16.4s},[x7],#16
+	subs		w6,w6,#2
+	aese		v0.16b,v17.16b
+	aesmc		v0.16b,v0.16b
+	aese		v1.16b,v17.16b
+	aesmc		v1.16b,v1.16b
+	aese		v18.16b,v17.16b
+	aesmc		v18.16b,v18.16b
+	ld1		{v17.4s},[x7],#16
+	b.gt		.Loop3x_ctr32
+
+	aese		v0.16b,v16.16b
+	aesmc		v4.16b,v0.16b
+	aese		v1.16b,v16.16b
+	aesmc		v5.16b,v1.16b
+	 ld1		{v2.16b},[x0],#16
+	 orr		v0.16b,v6.16b,v6.16b
+	aese		v18.16b,v16.16b
+	aesmc		v18.16b,v18.16b
+	 ld1		{v3.16b},[x0],#16
+	 orr		v1.16b,v6.16b,v6.16b
+	aese		v4.16b,v17.16b
+	aesmc		v4.16b,v4.16b
+	aese		v5.16b,v17.16b
+	aesmc		v5.16b,v5.16b
+	 ld1		{v19.16b},[x0],#16
+	 mov		x7,x3
+	aese		v18.16b,v17.16b
+	aesmc		v17.16b,v18.16b
+	 orr		v18.16b,v6.16b,v6.16b
+	 add		w9,w8,#1
+	aese		v4.16b,v20.16b
+	aesmc		v4.16b,v4.16b
+	aese		v5.16b,v20.16b
+	aesmc		v5.16b,v5.16b
+	 eor		v2.16b,v2.16b,v7.16b
+	 add		w10,w8,#2
+	aese		v17.16b,v20.16b
+	aesmc		v17.16b,v17.16b
+	 eor		v3.16b,v3.16b,v7.16b
+	 add		w8,w8,#3
+	aese		v4.16b,v21.16b
+	aesmc		v4.16b,v4.16b
+	aese		v5.16b,v21.16b
+	aesmc		v5.16b,v5.16b
+	 eor		v19.16b,v19.16b,v7.16b
+	 rev		w9,w9
+	aese		v17.16b,v21.16b
+	aesmc		v17.16b,v17.16b
+	 mov	v0.s[3], w9
+	 rev		w10,w10
+	aese		v4.16b,v22.16b
+	aesmc		v4.16b,v4.16b
+	aese		v5.16b,v22.16b
+	aesmc		v5.16b,v5.16b
+	 mov	v1.s[3], w10
+	 rev		w12,w8
+	aese		v17.16b,v22.16b
+	aesmc		v17.16b,v17.16b
+	 mov	v18.s[3], w12
+	 subs		x2,x2,#3
+	aese		v4.16b,v23.16b
+	aese		v5.16b,v23.16b
+	aese		v17.16b,v23.16b
+
+	eor		v2.16b,v2.16b,v4.16b
+	 ld1	 {v16.4s},[x7],#16	// re-pre-load rndkey[0]
+	st1		{v2.16b},[x1],#16
+	eor		v3.16b,v3.16b,v5.16b
+	 mov		w6,w5
+	st1		{v3.16b},[x1],#16
+	eor		v19.16b,v19.16b,v17.16b
+	 ld1	 {v17.4s},[x7],#16	// re-pre-load rndkey[1]
+	st1		{v19.16b},[x1],#16
+	b.hs		.Loop3x_ctr32
+
+	adds		x2,x2,#3
+	b.eq		.Lctr32_done
+	cmp		x2,#1
+	mov		x12,#16
+	csel	x12,xzr,x12,eq
+
+.Lctr32_tail:
+	aese		v0.16b,v16.16b
+	aesmc		v0.16b,v0.16b
+	aese		v1.16b,v16.16b
+	aesmc		v1.16b,v1.16b
+	ld1		{v16.4s},[x7],#16
+	subs		w6,w6,#2
+	aese		v0.16b,v17.16b
+	aesmc		v0.16b,v0.16b
+	aese		v1.16b,v17.16b
+	aesmc		v1.16b,v1.16b
+	ld1		{v17.4s},[x7],#16
+	b.gt		.Lctr32_tail
+
+	aese		v0.16b,v16.16b
+	aesmc		v0.16b,v0.16b
+	aese		v1.16b,v16.16b
+	aesmc		v1.16b,v1.16b
+	aese		v0.16b,v17.16b
+	aesmc		v0.16b,v0.16b
+	aese		v1.16b,v17.16b
+	aesmc		v1.16b,v1.16b
+	 ld1		{v2.16b},[x0],x12
+	aese		v0.16b,v20.16b
+	aesmc		v0.16b,v0.16b
+	aese		v1.16b,v20.16b
+	aesmc		v1.16b,v1.16b
+	 ld1		{v3.16b},[x0]
+	aese		v0.16b,v21.16b
+	aesmc		v0.16b,v0.16b
+	aese		v1.16b,v21.16b
+	aesmc		v1.16b,v1.16b
+	 eor		v2.16b,v2.16b,v7.16b
+	aese		v0.16b,v22.16b
+	aesmc		v0.16b,v0.16b
+	aese		v1.16b,v22.16b
+	aesmc		v1.16b,v1.16b
+	 eor		v3.16b,v3.16b,v7.16b
+	aese		v0.16b,v23.16b
+	aese		v1.16b,v23.16b
+
+	cmp		x2,#1
+	eor		v2.16b,v2.16b,v0.16b
+	eor		v3.16b,v3.16b,v1.16b
+	st1		{v2.16b},[x1],#16
+	b.eq		.Lctr32_done
+	st1		{v3.16b},[x1]
+
+.Lctr32_done:
+	ldr		x29,[sp],#16
+	ret
+.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
+#endif
--- a/secure/lib/libcrypto/aarch64/ghashv8-armx.S
+++ b/secure/lib/libcrypto/aarch64/ghashv8-armx.S
@ -0,0 +1,228 @@
+/* $FreeBSD$ */
+/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
+#include "arm_arch.h"
+
+.text
+.global	gcm_init_v8
+.type	gcm_init_v8,%function
+.align	4
+gcm_init_v8:
+	ld1		{v17.2d},[x1]		//load input H
+	movi		v19.16b,#0xe1
+	shl	v19.2d,v19.2d,#57		//0xc2.0
+	ext		v3.16b,v17.16b,v17.16b,#8
+	ushr	v18.2d,v19.2d,#63
+	dup		v17.4s,v17.s[1]
+	ext		v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
+	ushr	v18.2d,v3.2d,#63
+	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
+	and		v18.16b,v18.16b,v16.16b
+	shl	v3.2d,v3.2d,#1
+	ext		v18.16b,v18.16b,v18.16b,#8
+	and		v16.16b,v16.16b,v17.16b
+	orr		v3.16b,v3.16b,v18.16b		//H<<<=1
+	eor		v20.16b,v3.16b,v16.16b		//twisted H
+	st1		{v20.2d},[x0],#16		//store Htable[0]
+
+	//calculate H^2
+	ext		v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
+	pmull	v0.1q,v20.1d,v20.1d
+	eor		v16.16b,v16.16b,v20.16b
+	pmull2	v2.1q,v20.2d,v20.2d
+	pmull	v1.1q,v16.1d,v16.1d
+
+	ext		v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor		v18.16b,v0.16b,v2.16b
+	eor		v1.16b,v1.16b,v17.16b
+	eor		v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor		v0.16b,v1.16b,v18.16b
+
+	ext		v18.16b,v0.16b,v0.16b,#8		//2nd phase
+	pmull	v0.1q,v0.1d,v19.1d
+	eor		v18.16b,v18.16b,v2.16b
+	eor		v22.16b,v0.16b,v18.16b
+
+	ext		v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
+	eor		v17.16b,v17.16b,v22.16b
+	ext		v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
+	st1		{v21.2d-v22.2d},[x0]		//store Htable[1..2]
+
+	ret
+.size	gcm_init_v8,.-gcm_init_v8
+.global	gcm_gmult_v8
+.type	gcm_gmult_v8,%function
+.align	4
+gcm_gmult_v8:
+	ld1		{v17.2d},[x0]		//load Xi
+	movi		v19.16b,#0xe1
+	ld1		{v20.2d-v21.2d},[x1]	//load twisted H, ...
+	shl	v19.2d,v19.2d,#57
+#ifndef __ARMEB__
+	rev64	v17.16b,v17.16b
+#endif
+	ext		v3.16b,v17.16b,v17.16b,#8
+
+	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
+	eor		v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
+	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	ext		v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor		v18.16b,v0.16b,v2.16b
+	eor		v1.16b,v1.16b,v17.16b
+	eor		v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor		v0.16b,v1.16b,v18.16b
+
+	ext		v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor		v18.16b,v18.16b,v2.16b
+	eor		v0.16b,v0.16b,v18.16b
+
+#ifndef __ARMEB__
+	rev64	v0.16b,v0.16b
+#endif
+	ext		v0.16b,v0.16b,v0.16b,#8
+	st1		{v0.2d},[x0]		//write out Xi
+
+	ret
+.size	gcm_gmult_v8,.-gcm_gmult_v8
+.global	gcm_ghash_v8
+.type	gcm_ghash_v8,%function
+.align	4
+gcm_ghash_v8:
+	ld1		{v0.2d},[x0]		//load [rotated] Xi
+						//"[rotated]" means that
+						//loaded value would have
+						//to be rotated in order to
+						//make it appear as in
+						//alorithm specification
+	subs		x3,x3,#32		//see if x3 is 32 or larger
+	mov		x12,#16		//x12 is used as post-
+						//increment for input pointer;
+						//as loop is modulo-scheduled
+						//x12 is zeroed just in time
+						//to preclude oversteping
+						//inp[len], which means that
+						//last block[s] are actually
+						//loaded twice, but last
+						//copy is not processed
+	ld1		{v20.2d-v21.2d},[x1],#32	//load twisted H, ..., H^2
+	movi		v19.16b,#0xe1
+	ld1		{v22.2d},[x1]
+	csel	x12,xzr,x12,eq			//is it time to zero x12?
+	ext		v0.16b,v0.16b,v0.16b,#8		//rotate Xi
+	ld1		{v16.2d},[x2],#16	//load [rotated] I[0]
+	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
+#ifndef __ARMEB__
+	rev64	v16.16b,v16.16b
+	rev64	v0.16b,v0.16b
+#endif
+	ext		v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
+	b.lo		.Lodd_tail_v8		//x3 was less than 32
+	ld1		{v17.2d},[x2],x12	//load [rotated] I[1]
+#ifndef __ARMEB__
+	rev64	v17.16b,v17.16b
+#endif
+	ext		v7.16b,v17.16b,v17.16b,#8
+	eor		v3.16b,v3.16b,v0.16b		//I[i]^=Xi
+	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
+	eor		v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
+	pmull2	v6.1q,v20.2d,v7.2d
+	b		.Loop_mod2x_v8
+
+.align	4
+.Loop_mod2x_v8:
+	ext		v18.16b,v3.16b,v3.16b,#8
+	subs		x3,x3,#32		//is there more data?
+	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
+	csel	x12,xzr,x12,lo			//is it time to zero x12?
+
+	 pmull	v5.1q,v21.1d,v17.1d
+	eor		v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
+	eor		v0.16b,v0.16b,v4.16b		//accumulate
+	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+	 ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
+
+	eor		v2.16b,v2.16b,v6.16b
+	 csel	x12,xzr,x12,eq			//is it time to zero x12?
+	eor		v1.16b,v1.16b,v5.16b
+
+	ext		v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor		v18.16b,v0.16b,v2.16b
+	eor		v1.16b,v1.16b,v17.16b
+	 ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
+#ifndef __ARMEB__
+	 rev64	v16.16b,v16.16b
+#endif
+	eor		v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+#ifndef __ARMEB__
+	 rev64	v17.16b,v17.16b
+#endif
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	 ext		v7.16b,v17.16b,v17.16b,#8
+	 ext		v3.16b,v16.16b,v16.16b,#8
+	eor		v0.16b,v1.16b,v18.16b
+	 pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
+	eor		v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
+
+	ext		v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor		v3.16b,v3.16b,v18.16b
+	 eor		v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
+	eor		v3.16b,v3.16b,v0.16b
+	 pmull2	v6.1q,v20.2d,v7.2d
+	b.hs		.Loop_mod2x_v8		//there was at least 32 more bytes
+
+	eor		v2.16b,v2.16b,v18.16b
+	ext		v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
+	adds		x3,x3,#32		//re-construct x3
+	eor		v0.16b,v0.16b,v2.16b		//re-construct v0.16b
+	b.eq		.Ldone_v8		//is x3 zero?
+.Lodd_tail_v8:
+	ext		v18.16b,v0.16b,v0.16b,#8
+	eor		v3.16b,v3.16b,v0.16b		//inp^=Xi
+	eor		v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
+
+	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
+	eor		v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
+	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	ext		v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor		v18.16b,v0.16b,v2.16b
+	eor		v1.16b,v1.16b,v17.16b
+	eor		v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor		v0.16b,v1.16b,v18.16b
+
+	ext		v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor		v18.16b,v18.16b,v2.16b
+	eor		v0.16b,v0.16b,v18.16b
+
+.Ldone_v8:
+#ifndef __ARMEB__
+	rev64	v0.16b,v0.16b
+#endif
+	ext		v0.16b,v0.16b,v0.16b,#8
+	st1		{v0.2d},[x0]		//write out Xi
+
+	ret
+.size	gcm_ghash_v8,.-gcm_ghash_v8
+.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
+.align  2
--- a/secure/lib/libcrypto/aarch64/sha1-armv8.S
+++ b/secure/lib/libcrypto/aarch64/sha1-armv8.S
--- a/secure/lib/libcrypto/aarch64/sha256-armv8.S
+++ b/secure/lib/libcrypto/aarch64/sha256-armv8.S
--- a/secure/lib/libcrypto/aarch64/sha512-armv8.S
+++ b/secure/lib/libcrypto/aarch64/sha512-armv8.S