_ARM_ARCH_5E is always defined; we no longer support older CPUs.

Michal Meloun 2020-11-29 16:44:22 +00:00
parent c5426ce3a6
commit f72366f927
5 changed files with 2 additions and 992 deletions

View File

@@ -44,145 +44,6 @@ __FBSDID("$FreeBSD$");
#include "assym.inc"
#ifndef _ARM_ARCH_5E
/* #define BIG_LOOPS */
/*
* bcopy_page(src, dest)
*
* Optimised copy page routine.
*
* On entry:
* r0 - src address
* r1 - dest address
*
* Requires:
* number of bytes per page (PAGE_SIZE) is a multiple of 512 (BIG_LOOPS), 128
* otherwise.
*/
#define CHUNK_SIZE 32
#define PREFETCH_FIRST_CHUNK /* nothing */
#define PREFETCH_NEXT_CHUNK /* nothing */
#ifndef COPY_CHUNK
#define COPY_CHUNK \
PREFETCH_NEXT_CHUNK ; \
ldmia r0!, {r3-r8,ip,lr} ; \
stmia r1!, {r3-r8,ip,lr}
#endif /* ! COPY_CHUNK */
#ifndef SAVE_REGS
#define SAVE_REGS stmfd sp!, {r4-r8, lr}; _SAVE({r4-r8, lr})
#define RESTORE_REGS ldmfd sp!, {r4-r8, pc}
#endif
ENTRY(bcopy_page)
PREFETCH_FIRST_CHUNK
SAVE_REGS
#ifdef BIG_LOOPS
mov r2, #(PAGE_SIZE >> 9)
#else
mov r2, #(PAGE_SIZE >> 7)
#endif
1:
COPY_CHUNK
COPY_CHUNK
COPY_CHUNK
COPY_CHUNK
#ifdef BIG_LOOPS
/* There is little point making the loop any larger; unless we are
running with the cache off, the load/store overheads will
completely dominate this loop. */
COPY_CHUNK
COPY_CHUNK
COPY_CHUNK
COPY_CHUNK
COPY_CHUNK
COPY_CHUNK
COPY_CHUNK
COPY_CHUNK
COPY_CHUNK
COPY_CHUNK
COPY_CHUNK
COPY_CHUNK
#endif
subs r2, r2, #1
bne 1b
RESTORE_REGS /* ...and return. */
END(bcopy_page)
/*
* bzero_page(dest)
*
* Optimised zero page routine.
*
* On entry:
* r0 - dest address
*
* Requires:
* number of bytes per page (PAGE_SIZE) is a multiple of 512 (BIG_LOOPS), 128
* otherwise
*/
ENTRY(bzero_page)
stmfd sp!, {r4-r8, lr}
_SAVE({r4-r8, lr})
#ifdef BIG_LOOPS
mov r2, #(PAGE_SIZE >> 9)
#else
mov r2, #(PAGE_SIZE >> 7)
#endif
mov r3, #0
mov r4, #0
mov r5, #0
mov r6, #0
mov r7, #0
mov r8, #0
mov ip, #0
mov lr, #0
1:
stmia r0!, {r3-r8,ip,lr}
stmia r0!, {r3-r8,ip,lr}
stmia r0!, {r3-r8,ip,lr}
stmia r0!, {r3-r8,ip,lr}
#ifdef BIG_LOOPS
/* There is little point making the loop any larger; unless we are
running with the cache off, the load/store overheads will
completely dominate this loop. */
stmia r0!, {r3-r8,ip,lr}
stmia r0!, {r3-r8,ip,lr}
stmia r0!, {r3-r8,ip,lr}
stmia r0!, {r3-r8,ip,lr}
stmia r0!, {r3-r8,ip,lr}
stmia r0!, {r3-r8,ip,lr}
stmia r0!, {r3-r8,ip,lr}
stmia r0!, {r3-r8,ip,lr}
stmia r0!, {r3-r8,ip,lr}
stmia r0!, {r3-r8,ip,lr}
stmia r0!, {r3-r8,ip,lr}
stmia r0!, {r3-r8,ip,lr}
#endif
subs r2, r2, #1
bne 1b
ldmfd sp!, {r4-r8, pc}
END(bzero_page)
#else /* _ARM_ARCH_5E */
/*
* armv5e version of bcopy_page
@@ -279,4 +140,3 @@ ENTRY(bzero_page)
bne 1b
RET
END(bzero_page)
#endif /* _ARM_ARCH_5E */
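For reference, a minimal C sketch of what the two page routines in this file implement, assuming 4 KiB pages and the 32-byte ldmia/stmia chunking described in their header comments; the names are illustrative and the kernel uses the hand-written assembly, not this.

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE   4096U   /* assumption: 4 KiB pages */
#define CHUNK_WORDS 8       /* one ldmia/stmia chunk = 8 registers = 32 bytes */

/* bcopy_page(src, dest): copy exactly one page in 32-byte chunks. */
static void
bcopy_page_ref(const uint32_t *src, uint32_t *dst)
{
	for (size_t w = 0; w < PAGE_SIZE / sizeof(uint32_t); w += CHUNK_WORDS)
		for (size_t i = 0; i < CHUNK_WORDS; i++)
			dst[w + i] = src[w + i];
}

/* bzero_page(dest): zero exactly one page in 32-byte chunks. */
static void
bzero_page_ref(uint32_t *dst)
{
	for (size_t w = 0; w < PAGE_SIZE / sizeof(uint32_t); w += CHUNK_WORDS)
		for (size_t i = 0; i < CHUNK_WORDS; i++)
			dst[w + i] = 0;
}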

View File

@@ -47,510 +47,7 @@
.word _C_LABEL(_min_memcpy_size)
__FBSDID("$FreeBSD$");
#ifdef _ARM_ARCH_5E
#include <arm/arm/bcopyinout_xscale.S>
#else
.text
.align 2
#define GET_PCB(tmp) \
mrc p15, 0, tmp, c13, c0, 4; \
add tmp, tmp, #(TD_PCB)
#define SAVE_REGS stmfd sp!, {r4-r11}; _SAVE({r4-r11})
#define RESTORE_REGS ldmfd sp!, {r4-r11}
#if defined(_ARM_ARCH_5E)
#define HELLOCPP #
#define PREFETCH(rx,o) pld [ rx , HELLOCPP (o) ]
#else
#define PREFETCH(rx,o)
#endif
/*
* r0 = user space address
* r1 = kernel space address
* r2 = length
*
* Copies bytes from user space to kernel space
*
* We save/restore r4-r11:
* r4-r11 are scratch
*/
ENTRY(copyin)
/* Quick exit if length is zero */
teq r2, #0
moveq r0, #0
RETeq
adds r3, r0, r2
movcs r0, #EFAULT
RETc(cs)
ldr r12, =(VM_MAXUSER_ADDRESS + 1)
cmp r3, r12
movcs r0, #EFAULT
RETc(cs)
ldr r3, .L_arm_memcpy
ldr r3, [r3]
cmp r3, #0
beq .Lnormal
ldr r3, .L_min_memcpy_size
ldr r3, [r3]
cmp r2, r3
blt .Lnormal
stmfd sp!, {r0-r2, r4, lr}
mov r3, r0
mov r0, r1
mov r1, r3
mov r3, #2 /* SRC_IS_USER */
ldr r4, .L_arm_memcpy
mov lr, pc
ldr pc, [r4]
cmp r0, #0
ldmfd sp!, {r0-r2, r4, lr}
moveq r0, #0
RETeq
.Lnormal:
SAVE_REGS
GET_PCB(r4)
ldr r4, [r4]
ldr r5, [r4, #PCB_ONFAULT]
adr r3, .Lcopyfault
str r3, [r4, #PCB_ONFAULT]
PREFETCH(r0, 0)
PREFETCH(r1, 0)
/*
* If not too many bytes, take the slow path.
*/
cmp r2, #0x08
blt .Licleanup
/*
* Align destination to word boundary.
*/
and r6, r1, #0x3
ldr pc, [pc, r6, lsl #2]
b .Lialend
.word .Lialend
.word .Lial3
.word .Lial2
.word .Lial1
.Lial3: ldrbt r6, [r0], #1
sub r2, r2, #1
strb r6, [r1], #1
.Lial2: ldrbt r7, [r0], #1
sub r2, r2, #1
strb r7, [r1], #1
.Lial1: ldrbt r6, [r0], #1
sub r2, r2, #1
strb r6, [r1], #1
.Lialend:
/*
* If few bytes left, finish slow.
*/
cmp r2, #0x08
blt .Licleanup
/*
* If source is not aligned, finish slow.
*/
ands r3, r0, #0x03
bne .Licleanup
cmp r2, #0x60 /* Must be > 0x5f for unrolled cacheline */
blt .Licleanup8
/*
* Align destination to cacheline boundary.
* If source and destination are nicely aligned, this can be a big
* win. If not, it's still cheaper to copy in groups of 32 even if
* we don't get the nice cacheline alignment.
*/
and r6, r1, #0x1f
ldr pc, [pc, r6]
b .Licaligned
.word .Licaligned
.word .Lical28
.word .Lical24
.word .Lical20
.word .Lical16
.word .Lical12
.word .Lical8
.word .Lical4
.Lical28:ldrt r6, [r0], #4
sub r2, r2, #4
str r6, [r1], #4
.Lical24:ldrt r7, [r0], #4
sub r2, r2, #4
str r7, [r1], #4
.Lical20:ldrt r6, [r0], #4
sub r2, r2, #4
str r6, [r1], #4
.Lical16:ldrt r7, [r0], #4
sub r2, r2, #4
str r7, [r1], #4
.Lical12:ldrt r6, [r0], #4
sub r2, r2, #4
str r6, [r1], #4
.Lical8:ldrt r7, [r0], #4
sub r2, r2, #4
str r7, [r1], #4
.Lical4:ldrt r6, [r0], #4
sub r2, r2, #4
str r6, [r1], #4
/*
* We start with > 0x40 bytes to copy (>= 0x60 got us into this
* part of the code, and we may have knocked that down by as much
* as 0x1c getting aligned).
*
* This loop basically works out to:
* do {
* prefetch-next-cacheline(s)
* bytes -= 0x20;
* copy cacheline
* } while (bytes >= 0x40);
* bytes -= 0x20;
* copy cacheline
*/
.Licaligned:
PREFETCH(r0, 32)
PREFETCH(r1, 32)
sub r2, r2, #0x20
/* Copy a cacheline */
ldrt r10, [r0], #4
ldrt r11, [r0], #4
ldrt r6, [r0], #4
ldrt r7, [r0], #4
ldrt r8, [r0], #4
ldrt r9, [r0], #4
stmia r1!, {r10-r11}
ldrt r10, [r0], #4
ldrt r11, [r0], #4
stmia r1!, {r6-r11}
cmp r2, #0x40
bge .Licaligned
sub r2, r2, #0x20
/* Copy a cacheline */
ldrt r10, [r0], #4
ldrt r11, [r0], #4
ldrt r6, [r0], #4
ldrt r7, [r0], #4
ldrt r8, [r0], #4
ldrt r9, [r0], #4
stmia r1!, {r10-r11}
ldrt r10, [r0], #4
ldrt r11, [r0], #4
stmia r1!, {r6-r11}
cmp r2, #0x08
blt .Liprecleanup
.Licleanup8:
ldrt r8, [r0], #4
ldrt r9, [r0], #4
sub r2, r2, #8
stmia r1!, {r8, r9}
cmp r2, #8
bge .Licleanup8
.Liprecleanup:
/*
* If we're done, bail.
*/
cmp r2, #0
beq .Lout
.Licleanup:
and r6, r2, #0x3
ldr pc, [pc, r6, lsl #2]
b .Licend
.word .Lic4
.word .Lic1
.word .Lic2
.word .Lic3
.Lic4: ldrbt r6, [r0], #1
sub r2, r2, #1
strb r6, [r1], #1
.Lic3: ldrbt r7, [r0], #1
sub r2, r2, #1
strb r7, [r1], #1
.Lic2: ldrbt r6, [r0], #1
sub r2, r2, #1
strb r6, [r1], #1
.Lic1: ldrbt r7, [r0], #1
subs r2, r2, #1
strb r7, [r1], #1
.Licend:
bne .Licleanup
.Liout:
mov r0, #0
str r5, [r4, #PCB_ONFAULT]
RESTORE_REGS
RET
.Lcopyfault:
ldr r0, =EFAULT
str r5, [r4, #PCB_ONFAULT]
RESTORE_REGS
RET
END(copyin)
/*
* r0 = kernel space address
* r1 = user space address
* r2 = length
*
* Copies bytes from kernel space to user space
*
* We save/restore r4-r11:
* r4-r11 are scratch
*/
ENTRY(copyout)
/* Quick exit if length is zero */
teq r2, #0
moveq r0, #0
RETeq
adds r3, r1, r2
movcs r0, #EFAULT
RETc(cs)
ldr r12, =(VM_MAXUSER_ADDRESS + 1)
cmp r3, r12
movcs r0, #EFAULT
RETc(cs)
ldr r3, .L_arm_memcpy
ldr r3, [r3]
cmp r3, #0
beq .Lnormale
ldr r3, .L_min_memcpy_size
ldr r3, [r3]
cmp r2, r3
blt .Lnormale
stmfd sp!, {r0-r2, r4, lr}
_SAVE({r0-r2, r4, lr})
mov r3, r0
mov r0, r1
mov r1, r3
mov r3, #1 /* DST_IS_USER */
ldr r4, .L_arm_memcpy
mov lr, pc
ldr pc, [r4]
cmp r0, #0
ldmfd sp!, {r0-r2, r4, lr}
moveq r0, #0
RETeq
.Lnormale:
SAVE_REGS
GET_PCB(r4)
ldr r4, [r4]
ldr r5, [r4, #PCB_ONFAULT]
adr r3, .Lcopyfault
str r3, [r4, #PCB_ONFAULT]
PREFETCH(r0, 0)
PREFETCH(r1, 0)
/*
* If not too many bytes, take the slow path.
*/
cmp r2, #0x08
blt .Lcleanup
/*
* Align destination to word boundary.
*/
and r6, r1, #0x3
ldr pc, [pc, r6, lsl #2]
b .Lalend
.word .Lalend
.word .Lal3
.word .Lal2
.word .Lal1
.Lal3: ldrb r6, [r0], #1
sub r2, r2, #1
strbt r6, [r1], #1
.Lal2: ldrb r7, [r0], #1
sub r2, r2, #1
strbt r7, [r1], #1
.Lal1: ldrb r6, [r0], #1
sub r2, r2, #1
strbt r6, [r1], #1
.Lalend:
/*
* If few bytes left, finish slow.
*/
cmp r2, #0x08
blt .Lcleanup
/*
* If source is not aligned, finish slow.
*/
ands r3, r0, #0x03
bne .Lcleanup
cmp r2, #0x60 /* Must be > 0x5f for unrolled cacheline */
blt .Lcleanup8
/*
* Align source & destination to cacheline boundary.
*/
and r6, r1, #0x1f
ldr pc, [pc, r6]
b .Lcaligned
.word .Lcaligned
.word .Lcal28
.word .Lcal24
.word .Lcal20
.word .Lcal16
.word .Lcal12
.word .Lcal8
.word .Lcal4
.Lcal28:ldr r6, [r0], #4
sub r2, r2, #4
strt r6, [r1], #4
.Lcal24:ldr r7, [r0], #4
sub r2, r2, #4
strt r7, [r1], #4
.Lcal20:ldr r6, [r0], #4
sub r2, r2, #4
strt r6, [r1], #4
.Lcal16:ldr r7, [r0], #4
sub r2, r2, #4
strt r7, [r1], #4
.Lcal12:ldr r6, [r0], #4
sub r2, r2, #4
strt r6, [r1], #4
.Lcal8: ldr r7, [r0], #4
sub r2, r2, #4
strt r7, [r1], #4
.Lcal4: ldr r6, [r0], #4
sub r2, r2, #4
strt r6, [r1], #4
/*
* We start with > 0x40 bytes to copy (>= 0x60 got us into this
* part of the code, and we may have knocked that down by as much
* as 0x1c getting aligned).
*
* This loop basically works out to:
* do {
* prefetch-next-cacheline(s)
* bytes -= 0x20;
* copy cacheline
* } while (bytes >= 0x40);
* bytes -= 0x20;
* copy cacheline
*/
.Lcaligned:
PREFETCH(r0, 32)
PREFETCH(r1, 32)
sub r2, r2, #0x20
/* Copy a cacheline */
ldmia r0!, {r6-r11}
strt r6, [r1], #4
strt r7, [r1], #4
ldmia r0!, {r6-r7}
strt r8, [r1], #4
strt r9, [r1], #4
strt r10, [r1], #4
strt r11, [r1], #4
strt r6, [r1], #4
strt r7, [r1], #4
cmp r2, #0x40
bge .Lcaligned
sub r2, r2, #0x20
/* Copy a cacheline */
ldmia r0!, {r6-r11}
strt r6, [r1], #4
strt r7, [r1], #4
ldmia r0!, {r6-r7}
strt r8, [r1], #4
strt r9, [r1], #4
strt r10, [r1], #4
strt r11, [r1], #4
strt r6, [r1], #4
strt r7, [r1], #4
cmp r2, #0x08
blt .Lprecleanup
.Lcleanup8:
ldmia r0!, {r8-r9}
sub r2, r2, #8
strt r8, [r1], #4
strt r9, [r1], #4
cmp r2, #8
bge .Lcleanup8
.Lprecleanup:
/*
* If we're done, bail.
*/
cmp r2, #0
beq .Lout
.Lcleanup:
and r6, r2, #0x3
ldr pc, [pc, r6, lsl #2]
b .Lcend
.word .Lc4
.word .Lc1
.word .Lc2
.word .Lc3
.Lc4: ldrb r6, [r0], #1
sub r2, r2, #1
strbt r6, [r1], #1
.Lc3: ldrb r7, [r0], #1
sub r2, r2, #1
strbt r7, [r1], #1
.Lc2: ldrb r6, [r0], #1
sub r2, r2, #1
strbt r6, [r1], #1
.Lc1: ldrb r7, [r0], #1
subs r2, r2, #1
strbt r7, [r1], #1
.Lcend:
bne .Lcleanup
.Lout:
mov r0, #0
str r5, [r4, #PCB_ONFAULT]
RESTORE_REGS
RET
END(copyout)
#endif
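A minimal C sketch of the argument checks that copyin() and copyout() above perform before touching memory; the VM_MAXUSER_ADDRESS value and the plain memcpy() stand-in are assumptions, since the real routines copy under a pcb_onfault handler so that a faulting user access returns EFAULT instead of panicking.

#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define VM_MAXUSER_ADDRESS 0xbfffffffUL   /* illustrative value only */

/* Sketch of copyin(uaddr, kaddr, len); copyout() checks uaddr the same way. */
static int
copyin_sketch(const void *uaddr, void *kaddr, size_t len)
{
	uintptr_t start = (uintptr_t)uaddr;

	if (len == 0)                             /* teq r2, #0: quick exit */
		return (0);
	if (start + len < start)                  /* adds/movcs: address wrap */
		return (EFAULT);
	if (start + len > VM_MAXUSER_ADDRESS)     /* cmp against user VA limit */
		return (EFAULT);
	/* The assembly installs .Lcopyfault in pcb_onfault here, so a fault
	 * while reading user memory unwinds and returns EFAULT. */
	memcpy(kaddr, uaddr, len);
	return (0);
}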
/*
* int badaddr_read_1(const uint8_t *src, uint8_t *dest)

View File

@@ -116,9 +116,7 @@ END(do_cksum)
*/
/* LINTSTUB: Ignore */
ASENTRY_NP(L_cksumdata)
#ifdef _ARM_ARCH_5E
pld [r0] /* Pre-fetch the start of the buffer */
#endif
mov r2, #0
/* We first have to word-align the buffer. */
@@ -144,7 +142,6 @@ ASENTRY_NP(L_cksumdata)
/* Buffer is now word aligned */
.Lcksumdata_wordaligned:
#ifdef _ARM_ARCH_5E
cmp r1, #0x04 /* Less than 4 bytes left? */
blt .Lcksumdata_endgame /* Yup */
@@ -199,43 +196,10 @@ ASENTRY_NP(L_cksumdata)
adcs r2, r2, r7
adc r2, r2, #0x00
#else /* !_ARM_ARCH_5E */
subs r1, r1, #0x40
blt .Lcksumdata_bigloop_end
.Lcksumdata_bigloop:
ldmia r0!, {r3, r4, r5, r6}
adds r2, r2, r3
adcs r2, r2, r4
adcs r2, r2, r5
ldmia r0!, {r3, r4, r5, r7}
adcs r2, r2, r6
adcs r2, r2, r3
adcs r2, r2, r4
adcs r2, r2, r5
ldmia r0!, {r3, r4, r5, r6}
adcs r2, r2, r7
adcs r2, r2, r3
adcs r2, r2, r4
adcs r2, r2, r5
ldmia r0!, {r3, r4, r5, r7}
adcs r2, r2, r6
adcs r2, r2, r3
adcs r2, r2, r4
adcs r2, r2, r5
adcs r2, r2, r7
adc r2, r2, #0x00
subs r1, r1, #0x40
bge .Lcksumdata_bigloop
.Lcksumdata_bigloop_end:
#endif
adds r1, r1, #0x40
RETeq
cmp r1, #0x20
#ifdef _ARM_ARCH_5E
ldrdge r4, [r0], #0x08 /* Avoid stalling pld and result */
blt .Lcksumdata_less_than_32
pld [r0, #0x18]
@@ -250,19 +214,6 @@ ASENTRY_NP(L_cksumdata)
adcs r2, r2, r5
adcs r2, r2, r6 /* XXX: Unavoidable result stall */
adcs r2, r2, r7
#else
blt .Lcksumdata_less_than_32
ldmia r0!, {r3, r4, r5, r6}
adds r2, r2, r3
adcs r2, r2, r4
adcs r2, r2, r5
ldmia r0!, {r3, r4, r5, r7}
adcs r2, r2, r6
adcs r2, r2, r3
adcs r2, r2, r4
adcs r2, r2, r5
adcs r2, r2, r7
#endif
adc r2, r2, #0x00
subs r1, r1, #0x20
RETeq
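The adds/adcs chains above accumulate the buffer into a 32-bit ones'-complement partial sum; a hedged C sketch of the equivalent word-at-a-time accumulation, assuming a word-aligned buffer (the final fold to 16 bits, done outside this routine in the real code, is included here for completeness).

#include <stddef.h>
#include <stdint.h>

static uint16_t
cksum_words_sketch(const uint32_t *buf, size_t nwords)
{
	uint64_t sum = 0;

	for (size_t i = 0; i < nwords; i++)
		sum += buf[i];                  /* adds/adcs: add with carry */
	while (sum >> 16)                       /* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)sum);
}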

View File

@@ -107,8 +107,8 @@ __FBSDID("$FreeBSD$");
#endif
#ifndef _ARM_ARCH_5E
#error FreeBSD requires ARMv5 or later
#ifndef _ARM_ARCH_6
#error FreeBSD requires ARMv6 or later
#endif
struct pcpu __pcpu[MAXCPU];

View File

@@ -149,17 +149,11 @@ do_memset:
/* We are now word aligned */
.Lmemset_wordaligned:
orr r3, r3, r3, lsl #8 /* Extend value to 16-bits */
#ifdef _ARM_ARCH_5E
tst ip, #0x04 /* Quad-align for armv5e */
#else
cmp r1, #0x10
#endif
orr r3, r3, r3, lsl #16 /* Extend value to 32-bits */
#ifdef _ARM_ARCH_5E
subne r1, r1, #0x04 /* Quad-align if necessary */
strne r3, [ip], #0x04
cmp r1, #0x10
#endif
blt .Lmemset_loop4 /* If less than 16 then use words */
mov r2, r3 /* Duplicate data */
cmp r1, #0x80 /* If < 128 then skip the big loop */
@@ -168,7 +162,6 @@ do_memset:
/* Do 128 bytes at a time */
.Lmemset_loop128:
subs r1, r1, #0x80
#ifdef _ARM_ARCH_5E
strdge r2, [ip], #0x08
strdge r2, [ip], #0x08
strdge r2, [ip], #0x08
@@ -185,24 +178,6 @@ do_memset:
strdge r2, [ip], #0x08
strdge r2, [ip], #0x08
strdge r2, [ip], #0x08
#else
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
#endif
bgt .Lmemset_loop128
RETeq /* Zero length so just exit */
@@ -211,30 +186,18 @@ do_memset:
/* Do 32 bytes at a time */
.Lmemset_loop32:
subs r1, r1, #0x20
#ifdef _ARM_ARCH_5E
strdge r2, [ip], #0x08
strdge r2, [ip], #0x08
strdge r2, [ip], #0x08
strdge r2, [ip], #0x08
#else
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
#endif
bgt .Lmemset_loop32
RETeq /* Zero length so just exit */
adds r1, r1, #0x10 /* Partially adjust for extra sub */
/* Deal with 16 bytes or more */
#ifdef _ARM_ARCH_5E
strdge r2, [ip], #0x08
strdge r2, [ip], #0x08
#else
stmiage ip!, {r2-r3}
stmiage ip!, {r2-r3}
#endif
RETeq /* Zero length so just exit */
addlt r1, r1, #0x10 /* Possibly adjust for extra sub */
@@ -246,14 +209,10 @@ do_memset:
bgt .Lmemset_loop4
RETeq /* Zero length so just exit */
#ifdef _ARM_ARCH_5E
/* Compensate for 64-bit alignment check */
adds r1, r1, #0x04
RETeq
cmp r1, #2
#else
cmp r1, #-2
#endif
strb r3, [ip], #0x01 /* Set 1 byte */
strbge r3, [ip], #0x01 /* Set another byte */
@@ -804,243 +763,6 @@ EENTRY(memmove)
EEND(memmove)
END(bcopy)
#if !defined(_ARM_ARCH_5E)
ENTRY(memcpy)
/* save leaf functions having to store this away */
/* Do not check arm_memcpy if we're running from flash */
#if defined(FLASHADDR) && defined(PHYSADDR)
#if FLASHADDR > PHYSADDR
ldr r3, =FLASHADDR
cmp r3, pc
bls .Lnormal
#else
ldr r3, =FLASHADDR
cmp r3, pc
bhi .Lnormal
#endif
#endif
ldr r3, .L_arm_memcpy
ldr r3, [r3]
cmp r3, #0
beq .Lnormal
ldr r3, .L_min_memcpy_size
ldr r3, [r3]
cmp r2, r3
blt .Lnormal
stmfd sp!, {r0-r2, r4, lr}
mov r3, #0
ldr r4, .L_arm_memcpy
mov lr, pc
ldr pc, [r4]
cmp r0, #0
ldmfd sp!, {r0-r2, r4, lr}
RETeq
.Lnormal:
stmdb sp!, {r0, lr} /* memcpy() returns dest addr */
subs r2, r2, #4
blt .Lmemcpy_l4 /* less than 4 bytes */
ands r12, r0, #3
bne .Lmemcpy_destul /* oh unaligned destination addr */
ands r12, r1, #3
bne .Lmemcpy_srcul /* oh unaligned source addr */
.Lmemcpy_t8:
/* We have aligned source and destination */
subs r2, r2, #8
blt .Lmemcpy_l12 /* less than 12 bytes (4 from above) */
subs r2, r2, #0x14
blt .Lmemcpy_l32 /* less than 32 bytes (12 from above) */
stmdb sp!, {r4} /* borrow r4 */
/* blat 32 bytes at a time */
/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_loop32:
ldmia r1!, {r3, r4, r12, lr}
stmia r0!, {r3, r4, r12, lr}
ldmia r1!, {r3, r4, r12, lr}
stmia r0!, {r3, r4, r12, lr}
subs r2, r2, #0x20
bge .Lmemcpy_loop32
cmn r2, #0x10
ldmiage r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
stmiage r0!, {r3, r4, r12, lr}
subge r2, r2, #0x10
ldmia sp!, {r4} /* return r4 */
.Lmemcpy_l32:
adds r2, r2, #0x14
/* blat 12 bytes at a time */
.Lmemcpy_loop12:
ldmiage r1!, {r3, r12, lr}
stmiage r0!, {r3, r12, lr}
subsge r2, r2, #0x0c
bge .Lmemcpy_loop12
.Lmemcpy_l12:
adds r2, r2, #8
blt .Lmemcpy_l4
subs r2, r2, #4
ldrlt r3, [r1], #4
strlt r3, [r0], #4
ldmiage r1!, {r3, r12}
stmiage r0!, {r3, r12}
subge r2, r2, #4
.Lmemcpy_l4:
/* less than 4 bytes to go */
adds r2, r2, #4
#ifdef __APCS_26_
ldmiaeq sp!, {r0, pc}^ /* done */
#else
ldmiaeq sp!, {r0, pc} /* done */
#endif
/* copy the crud byte at a time */
cmp r2, #2
ldrb r3, [r1], #1
strb r3, [r0], #1
ldrbge r3, [r1], #1
strbge r3, [r0], #1
ldrbgt r3, [r1], #1
strbgt r3, [r0], #1
ldmia sp!, {r0, pc}
/* erg - unaligned destination */
.Lmemcpy_destul:
rsb r12, r12, #4
cmp r12, #2
/* align destination with byte copies */
ldrb r3, [r1], #1
strb r3, [r0], #1
ldrbge r3, [r1], #1
strbge r3, [r0], #1
ldrbgt r3, [r1], #1
strbgt r3, [r0], #1
subs r2, r2, r12
blt .Lmemcpy_l4 /* less the 4 bytes */
ands r12, r1, #3
beq .Lmemcpy_t8 /* we have an aligned source */
/* erg - unaligned source */
/* This is where it gets nasty ... */
.Lmemcpy_srcul:
bic r1, r1, #3
ldr lr, [r1], #4
cmp r12, #2
bgt .Lmemcpy_srcul3
beq .Lmemcpy_srcul2
cmp r2, #0x0c
blt .Lmemcpy_srcul1loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemcpy_srcul1loop16:
mov r3, lr, lsr #8
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsl #24
mov r4, r4, lsr #8
orr r4, r4, r5, lsl #24
mov r5, r5, lsr #8
orr r5, r5, r12, lsl #24
mov r12, r12, lsr #8
orr r12, r12, lr, lsl #24
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge .Lmemcpy_srcul1loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemcpy_srcul1l4
.Lmemcpy_srcul1loop4:
mov r12, lr, lsr #8
ldr lr, [r1], #4
orr r12, r12, lr, lsl #24
str r12, [r0], #4
subs r2, r2, #4
bge .Lmemcpy_srcul1loop4
.Lmemcpy_srcul1l4:
sub r1, r1, #3
b .Lmemcpy_l4
.Lmemcpy_srcul2:
cmp r2, #0x0c
blt .Lmemcpy_srcul2loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemcpy_srcul2loop16:
mov r3, lr, lsr #16
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsl #16
mov r4, r4, lsr #16
orr r4, r4, r5, lsl #16
mov r5, r5, lsr #16
orr r5, r5, r12, lsl #16
mov r12, r12, lsr #16
orr r12, r12, lr, lsl #16
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge .Lmemcpy_srcul2loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemcpy_srcul2l4
.Lmemcpy_srcul2loop4:
mov r12, lr, lsr #16
ldr lr, [r1], #4
orr r12, r12, lr, lsl #16
str r12, [r0], #4
subs r2, r2, #4
bge .Lmemcpy_srcul2loop4
.Lmemcpy_srcul2l4:
sub r1, r1, #2
b .Lmemcpy_l4
.Lmemcpy_srcul3:
cmp r2, #0x0c
blt .Lmemcpy_srcul3loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemcpy_srcul3loop16:
mov r3, lr, lsr #24
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsl #8
mov r4, r4, lsr #24
orr r4, r4, r5, lsl #8
mov r5, r5, lsr #24
orr r5, r5, r12, lsl #8
mov r12, r12, lsr #24
orr r12, r12, lr, lsl #8
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge .Lmemcpy_srcul3loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemcpy_srcul3l4
.Lmemcpy_srcul3loop4:
mov r12, lr, lsr #24
ldr lr, [r1], #4
orr r12, r12, lr, lsl #8
str r12, [r0], #4
subs r2, r2, #4
bge .Lmemcpy_srcul3loop4
.Lmemcpy_srcul3l4:
sub r1, r1, #1
b .Lmemcpy_l4
END(memcpy)
#else
/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
ENTRY(memcpy)
pld [r1]
@@ -2398,23 +2120,3 @@ ENTRY(memcpy)
strb r1, [r0, #0x0b]
RET
END(memcpy)
#endif /* _ARM_ARCH_5E */
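The removed .Lmemcpy_srcul* paths above handle a source that is misaligned with respect to the destination by doing only word-aligned loads and splicing adjacent words together with shifts. A hedged C sketch of the one-byte-offset case (the lsr #8 / lsl #24 pairing), assuming little-endian byte order, a word-aligned destination, and illustrative names:

#include <stddef.h>
#include <stdint.h>

/* Copy nwords 32-bit words to an aligned dst from a src that sits one byte
 * past a word boundary, using only aligned word loads. */
static void
copy_src_off_by_one(uint32_t *dst, const uint8_t *src, size_t nwords)
{
	/* bic r1, r1, #3: round the source pointer down to a word boundary. */
	const uint32_t *s = (const uint32_t *)((uintptr_t)src & ~(uintptr_t)3);
	uint32_t prev = *s++;                   /* ldr lr, [r1], #4 */

	for (size_t i = 0; i < nwords; i++) {
		uint32_t next = *s++;           /* ldr lr, [r1], #4 */
		/* mov r12, lr, lsr #8; orr r12, r12, lr, lsl #24 */
		dst[i] = (prev >> 8) | (next << 24);
		prev = next;
	}
}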
#ifdef GPROF
ENTRY(user)
nop
END(user)
ENTRY(btrap)
nop
END(btrap)
ENTRY(etrap)
nop
END(etrap)
ENTRY(bintr)
nop
END(bintr)
ENTRY(eintr)
nop
END(eintr)
#endif