Use a better version of memcpy/bcopy for mips kernel.

Use a variant of mips libc memcpy for kernel. This implementation uses 64-bit operations when compiled for 64-bit, and is significantly faster in that case. Submitted by: Tanmay Jagdale <tanmayj@broadcom.com>
2013-09-07 16:31:30 +00:00 · 2013-09-07 16:31:30 +00:00 · cbd49bff46
commit cbd49bff46
parent 58909b74b9
3 changed files with 287 additions and 92 deletions
--- a/sys/conf/files.mips
+++ b/sys/conf/files.mips
@@ -38,6 +38,7 @@ mips/mips/stack_machdep.c		optional	ddb | stack
 mips/mips/stdatomic.c			standard \
 	compile-with "${NORMAL_C:N-Wmissing-prototypes}"
 mips/mips/support.S			standard
+mips/mips/bcopy.S			standard
 mips/mips/swtch.S			standard
 mips/mips/sys_machdep.c			standard
 mips/mips/tlb.c				standard
--- a/sys/mips/mips/bcopy.S
+++ b/sys/mips/mips/bcopy.S
@@ -0,0 +1,286 @@
+/*	$NetBSD: bcopy.S,v 1.3 2009/12/14 00:39:00 matt Exp $	*/
+
+/*
+ * Mach Operating System
+ * Copyright (c) 1993 Carnegie Mellon University
+ * All Rights Reserved.
+ *
+ * Permission to use, copy, modify and distribute this software and its
+ * documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
+ *  School of Computer Science
+ *  Carnegie Mellon University
+ *  Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie Mellon
+ * the rights to redistribute these changes.
+ */
+
+/*
+ *	File:	mips_bcopy.s
+ *	Author:	Chris Maeda
+ *	Date:	June 1993
+ *
+ *	Fast copy routine.  Derived from aligned_block_copy.
+ */
+
+
+#include <machine/asm.h>
+__FBSDID("$FreeBSD$");
+
+#include <machine/endian.h>
+
+#if defined(LIBC_SCCS) && !defined(lint)
+#if 0
+	ASMSTR("from: @(#)mips_bcopy.s	2.2 CMU 18/06/93")
+#else
+	ASMSTR("$NetBSD: bcopy.S,v 1.3 2009/12/14 00:39:00 matt Exp $")
+#endif
+#endif /* LIBC_SCCS and not lint */
+
+#ifdef __ABICALLS__
+	.abicalls
+#endif
+
+/*
+ *	bcopy(caddr_t src, caddr_t dst, unsigned int len)
+ *	(memcpy(dst, src, len) swaps a0/a1 and then runs the same code)
+ *	a0 	src address
+ *	a1	dst address
+ *	a2	length
+ */
+
+#define	SRCREG	a0
+#define	DSTREG	a1
+#define	SIZEREG	a2
+
+LEAF(memcpy)
+	.set	noat
+	.set	noreorder
+
+	move	v0, a0			# v0 = memcpy dst; not written again below
+	move	a0, a1			# a0 = src, the order bcopy expects
+	move	a1, v0			# a1 = dst
+
+ALEAF(bcopy)
+ALEAF(ovbcopy)
+	/*
+	 *	Make sure we can copy forwards.
+	 */
+	sltu	t0,SRCREG,DSTREG	# t0 == SRCREG < DSTREG
+	bne	t0,zero,6f		# copy backwards
+
+	/*
+	 * 	There are four alignment cases (with frequency)
+	 *	(Based on measurements taken with a DECstation 5000/200
+	 *	 inside a Mach kernel.)
+	 *
+	 * 	aligned   -> aligned		(mostly)
+	 * 	unaligned -> aligned		(sometimes)
+	 * 	aligned,unaligned -> unaligned	(almost never)
+	 *
+	 *	Note that we could add another case that checks if
+	 *	the destination and source are unaligned but the
+	 *	copy is alignable.  e.g. if src and dest are both
+	 *	on a halfword boundary.
+	 */
+	andi		t1,DSTREG,(SZREG-1)	# get last bits of dest (bne delay slot)
+	bne		t1,zero,3f		# dest unaligned: byte copy everything
+	andi		t0,SRCREG,(SZREG-1)	# get last bits of src
+	bne		t0,zero,5f		# src unaligned, dest aligned
+
+	/*
+	 *	Forward aligned->aligned copy, 8*SZREG bytes at a time.
+	 */
+98:
+	li		AT,-(SZREG*8)		# mask for multiples of 8*SZREG
+	and		t0,SIZEREG,AT		# count truncated to multiples
+	PTR_ADDU	a3,SRCREG,t0		# run fast loop up to this addr
+	sltu		AT,SRCREG,a3		# any work to do?
+	beq		AT,zero,2f
+	PTR_SUBU	SIZEREG,t0		# (delay slot) bytes left after fast loop
+
+	/*
+	 *	loop body: two batches of four registers per iteration
+	 */
+1:	# cp
+	REG_L		t3,(0*SZREG)(SRCREG)
+	REG_L		v1,(1*SZREG)(SRCREG)
+	REG_L		t0,(2*SZREG)(SRCREG)
+	REG_L		t1,(3*SZREG)(SRCREG)
+	PTR_ADDU	SRCREG,SZREG*8		# src advanced; second batch uses negative offsets
+	REG_S		t3,(0*SZREG)(DSTREG)
+	REG_S		v1,(1*SZREG)(DSTREG)
+	REG_S		t0,(2*SZREG)(DSTREG)
+	REG_S		t1,(3*SZREG)(DSTREG)
+	REG_L		t1,(-1*SZREG)(SRCREG)
+	REG_L		t0,(-2*SZREG)(SRCREG)
+	REG_L		v1,(-3*SZREG)(SRCREG)
+	REG_L		t3,(-4*SZREG)(SRCREG)
+	PTR_ADDU	DSTREG,SZREG*8
+	REG_S		t1,(-1*SZREG)(DSTREG)
+	REG_S		t0,(-2*SZREG)(DSTREG)
+	REG_S		v1,(-3*SZREG)(DSTREG)
+	bne		SRCREG,a3,1b
+	REG_S		t3,(-4*SZREG)(DSTREG)	# (delay slot) last store of the batch
+
+	/*
+	 *	Copy a word at a time, no loop unrolling.
+	 */
+2:	# wordcopy
+	andi		t2,SIZEREG,(SZREG-1)	# t2 = byte count mod SZREG
+	PTR_SUBU	t2,SIZEREG,t2		# t2 = words to copy * SZREG
+	beq		t2,zero,3f
+	PTR_ADDU	t0,SRCREG,t2		# stop at t0
+	PTR_SUBU	SIZEREG,SIZEREG,t2	# bytes left for the byte loop
+1:
+	REG_L		t3,0(SRCREG)
+	PTR_ADDU	SRCREG,SZREG
+	REG_S		t3,0(DSTREG)
+	bne		SRCREG,t0,1b
+	PTR_ADDU	DSTREG,SZREG		# (delay slot)
+
+3:	# bytecopy
+	beq		SIZEREG,zero,4f		# nothing left to do?
+	nop
+1:
+	lb		t3,0(SRCREG)
+	PTR_ADDU	SRCREG,1
+	sb		t3,0(DSTREG)
+	PTR_SUBU	SIZEREG,1
+	bne		SIZEREG,zero,1b		# length is unsigned: run until it hits 0
+	PTR_ADDU	DSTREG,1		# (delay slot)
+
+4:	# copydone
+	j	ra
+	nop
+
+	/*
+	 *	Copy from unaligned source to aligned dest.
+	 */
+5:	# destaligned
+	andi		t0,SIZEREG,(SZREG-1)	# t0 = bytecount mod SZREG
+	PTR_SUBU	a3,SIZEREG,t0		# bytes in whole words to transfer
+	beq		a3,zero,3b
+	nop
+	move		SIZEREG,t0		# this many to do after we are done
+	PTR_ADDU	a3,SRCREG,a3		# stop point
+
+1:
+	REG_LHI		t3,0(SRCREG)		# unaligned load, first half
+	REG_LLO		t3,SZREG-1(SRCREG)	# unaligned load, second half
+	PTR_ADDI	SRCREG,SZREG
+	REG_S		t3,0(DSTREG)
+	bne		SRCREG,a3,1b
+	PTR_ADDI	DSTREG,SZREG		# (delay slot)
+
+	b		3b			# finish the tail in the byte loop
+	nop
+
+6:	# backcopy -- based on above
+	PTR_ADDU	SRCREG,SIZEREG		# point both at the end of their regions
+	PTR_ADDU	DSTREG,SIZEREG
+	andi		t1,DSTREG,SZREG-1	# get last bits of dest
+	bne		t1,zero,3f		# dest unaligned: byte copy everything
+	andi		t0,SRCREG,SZREG-1	# get last bits of src
+	bne		t0,zero,5f		# src unaligned, dest aligned
+
+	/*
+	 *	Backward aligned->aligned copy, 8*SZREG bytes at a time.
+	 */
+	li		AT,(-8*SZREG)		# mask for multiples of 8*SZREG
+	and		t0,SIZEREG,AT		# count truncated to multiple of 8*SZREG
+	beq		t0,zero,2f		# any work to do?
+	PTR_SUBU	SIZEREG,t0		# (delay slot) bytes left after fast loop
+	PTR_SUBU	a3,SRCREG,t0		# run fast loop down to this addr
+
+	/*
+	 *	loop body: two batches of four registers per iteration
+	 */
+1:	# cp
+	REG_L		t3,(-4*SZREG)(SRCREG)
+	REG_L		v1,(-3*SZREG)(SRCREG)
+	REG_L		t0,(-2*SZREG)(SRCREG)
+	REG_L		t1,(-1*SZREG)(SRCREG)
+	PTR_SUBU	SRCREG,8*SZREG		# src moved back; second batch uses positive offsets
+	REG_S		t3,(-4*SZREG)(DSTREG)
+	REG_S		v1,(-3*SZREG)(DSTREG)
+	REG_S		t0,(-2*SZREG)(DSTREG)
+	REG_S		t1,(-1*SZREG)(DSTREG)
+	REG_L		t1,(3*SZREG)(SRCREG)
+	REG_L		t0,(2*SZREG)(SRCREG)
+	REG_L		v1,(1*SZREG)(SRCREG)
+	REG_L		t3,(0*SZREG)(SRCREG)
+	PTR_SUBU	DSTREG,8*SZREG
+	REG_S		t1,(3*SZREG)(DSTREG)
+	REG_S		t0,(2*SZREG)(DSTREG)
+	REG_S		v1,(1*SZREG)(DSTREG)
+	bne		SRCREG,a3,1b
+	REG_S		t3,(0*SZREG)(DSTREG)	# (delay slot) last store of the batch
+
+	/*
+	 *	Copy a word at a time, no loop unrolling.
+	 */
+2:	# wordcopy
+	andi		t2,SIZEREG,SZREG-1	# t2 = byte count mod SZREG
+	PTR_SUBU	t2,SIZEREG,t2		# t2 = words to copy * SZREG
+	beq		t2,zero,3f
+	PTR_SUBU	t0,SRCREG,t2		# stop at t0
+	PTR_SUBU	SIZEREG,SIZEREG,t2	# bytes left for the byte loop
+1:
+	REG_L		t3,-SZREG(SRCREG)
+	PTR_SUBU	SRCREG,SZREG
+	REG_S		t3,-SZREG(DSTREG)
+	bne		SRCREG,t0,1b
+	PTR_SUBU	DSTREG,SZREG		# (delay slot)
+
+3:	# bytecopy
+	beq		SIZEREG,zero,4f		# nothing left to do?
+	nop
+1:
+	lb		t3,-1(SRCREG)
+	PTR_SUBU	SRCREG,1
+	sb		t3,-1(DSTREG)
+	PTR_SUBU	SIZEREG,1
+	bne		SIZEREG,zero,1b		# length is unsigned: run until it hits 0
+	PTR_SUBU	DSTREG,1		# (delay slot)
+
+4:	# copydone
+	j	ra
+	nop
+
+	/*
+	 *	Copy from unaligned source to aligned dest (backwards).
+	 */
+5:	# destaligned
+	andi		t0,SIZEREG,SZREG-1	# t0 = bytecount mod SZREG
+	PTR_SUBU	a3,SIZEREG,t0		# bytes in whole words to transfer
+	beq		a3,zero,3b
+	nop
+	move		SIZEREG,t0		# this many to do after we are done
+	PTR_SUBU	a3,SRCREG,a3		# stop point
+
+1:
+	REG_LHI		t3,-SZREG(SRCREG)	# unaligned load, first half
+	REG_LLO		t3,-1(SRCREG)		# unaligned load, second half
+	PTR_SUBU	SRCREG,SZREG
+	REG_S		t3,-SZREG(DSTREG)
+	bne		SRCREG,a3,1b
+	PTR_SUBU	DSTREG,SZREG		# (delay slot)
+
+	b		3b			# finish the tail in the byte loop
+	nop
+
+	.set	reorder
+	.set	at
+END(memcpy)
--- a/sys/mips/mips/support.S
+++ b/sys/mips/mips/support.S
@@ -506,98 +506,6 @@ LEAF(fswintrberr)
 	li	v0, -1
 END(fswintrberr)

-/*
- * memcpy(to, from, len)
- * {ov}bcopy(from, to, len)
- */
-LEAF(memcpy)
-	.set	noreorder
-	move	v0, a0			# swap from and to (v0 keeps the original a0)
-	move	a0, a1
-	move	a1, v0
-ALEAF(bcopy)
-ALEAF(ovbcopy)
-	.set	noreorder
-	PTR_ADDU	t0, a0, a2		# t0 = end of from region (a0 + len)
-	sltu	t1, a1, t0
-	sltu	t2, a0, a1
-	and	t1, t1, t2		# t1 = true if from < to < (from+len)
-	beq	t1, zero, forward	# non overlapping, do forward copy
-	slt	t2, a2, 12		# t2 = (len < 12), tested at forward:
-
-	ble	a2, zero, 2f
-	PTR_ADDU	t1, a1, a2		# t1 = end of to region
-1:
-	lb	v1, -1(t0)		# copy bytes backwards;
-	PTR_SUBU	t0, t0, 1		#   doesn't happen often, so do it the slow way
-	PTR_SUBU	t1, t1, 1
-	bne	t0, a0, 1b
-	sb	v1, 0(t1)
-2:
-	j	ra
-	nop
-forward:
-	bne	t2, zero, smallcpy	# do a small bcopy
-	xor	v1, a0, a1		# compare low two bits of addresses
-	and	v1, v1, 3
-	PTR_SUBU	a3, zero, a1		# compute # bytes to word align address
-	beq	v1, zero, aligned	# addresses can be word aligned
-	and	a3, a3, 3
-
-	beq	a3, zero, 1f
-	PTR_SUBU	a2, a2, a3		# subtract from remaining count
-	LWHI	v1, 0(a0)		# get next 4 bytes (unaligned)
-	LWLO	v1, 3(a0)
-	PTR_ADDU	a0, a0, a3
-	SWHI	v1, 0(a1)		# store 1, 2, or 3 bytes to align a1
-	PTR_ADDU	a1, a1, a3
-1:
-	and	v1, a2, 3		# compute number of bytes left after whole words
-	PTR_SUBU	a3, a2, v1
-	move	a2, v1
-	PTR_ADDU	a3, a3, a0		# compute ending address
-2:
-	LWHI	v1, 0(a0)		# copy words a0 unaligned, a1 aligned
-	LWLO	v1, 3(a0)
-	PTR_ADDU	a0, a0, 4
-	sw	v1, 0(a1)
-	PTR_ADDU	a1, a1, 4
-	bne	a0, a3, 2b
-	nop				# We have to do this because of an MMU bug.
-	b	smallcpy
-	nop
-aligned:
-	beq	a3, zero, 1f
-	PTR_SUBU	a2, a2, a3		# subtract from remaining count
-	LWHI	v1, 0(a0)		# copy 1, 2, or 3 bytes to align
-	PTR_ADDU	a0, a0, a3
-	SWHI	v1, 0(a1)
-	PTR_ADDU	a1, a1, a3
-1:
-	and	v1, a2, 3		# compute number of bytes left after whole words
-	PTR_SUBU	a3, a2, v1
-	move	a2, v1
-	PTR_ADDU	a3, a3, a0		# compute ending address
-2:
-	lw	v1, 0(a0)		# copy words
-	PTR_ADDU	a0, a0, 4
-	sw	v1, 0(a1)
-	bne	a0, a3, 2b
-	PTR_ADDU	a1, a1, 4
-smallcpy:
-	ble	a2, zero, 2f
-	PTR_ADDU	a3, a2, a0		# compute ending address
-1:
-	lbu	v1, 0(a0)		# copy bytes
-	PTR_ADDU	a0, a0, 1
-	sb	v1, 0(a1)
-	bne	a0, a3, 1b
-	PTR_ADDU	a1, a1, 1	   # MMU bug? cannot do -1(a1) at 0x80000000!!
-2:
-	j	ra
-	nop
-END(memcpy)
-
 /*
 * memset(void *s1, int c, int len)
 * NetBSD: memset.S,v 1.3 2001/10/16 15:40:53 uch Exp