freebsd-nq/sys/arm/arm/bcopyinout.S
Olivier Houchard 0f7432f516 Do not use __XSCALE__ to detect whether pld/strd/ldrd are available; use
_ARM_ARCH_5E instead.

MFC After:	3 days
2007-10-13 12:05:03 +00:00


/* $NetBSD: bcopyinout.S,v 1.11 2003/10/13 21:22:40 scw Exp $ */
/*-
* Copyright (c) 2002 Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Allen Briggs for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "assym.s"
#include <machine/asm.h>
.L_arm_memcpy:
.word _C_LABEL(_arm_memcpy)
.L_min_memcpy_size:
.word _C_LABEL(_min_memcpy_size)
__FBSDID("$FreeBSD$");
#ifdef _ARM_ARCH_5E
#include <arm/arm/bcopyinout_xscale.S>
#else
.text
.align 0
#ifdef MULTIPROCESSOR
.Lcpu_info:
.word _C_LABEL(cpu_info)
#else
.Lcurpcb:
.word _C_LABEL(__pcpu) + PC_CURPCB
#endif
#define SAVE_REGS stmfd sp!, {r4-r11}
#define RESTORE_REGS ldmfd sp!, {r4-r11}
#if defined(_ARM_ARCH_5E)
#define HELLOCPP #
#define PREFETCH(rx,o) pld [ rx , HELLOCPP (o) ]
#else
#define PREFETCH(rx,o)
#endif
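/*
 * Note on HELLOCPP: inside a function-like macro a bare `#' is the
 * preprocessor's stringizing operator, so the literal `#' needed for
 * the pld immediate offset has to be produced by expanding a second
 * macro. A standard cpp trick, nothing ARM-specific.
 */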
/*
* r0 = user space address
* r1 = kernel space address
* r2 = length
*
* Copies bytes from user space to kernel space
*
* We save/restore r4-r11 because the ABI treats them as callee-saved;
* here they are used as scratch registers.
*/
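/*
 * A C-level sketch of the contract (the real prototype is in
 * sys/systm.h):
 *
 *	int copyin(const void *udaddr, void *kaddr, size_t len);
 *
 * Returns 0 on success, or EFAULT if a fault is taken while reading
 * from udaddr; faults land on .Lcopyfault via the pcb_onfault hook
 * set up below.
 */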
ENTRY(copyin)
/* Quick exit if length is zero */
teq r2, #0
moveq r0, #0
RETeq
ldr r3, .L_arm_memcpy
ldr r3, [r3]
cmp r3, #0
beq .Lnormal
ldr r3, .L_min_memcpy_size
ldr r3, [r3]
cmp r2, r3
blt .Lnormal
stmfd sp!, {r0-r2, r4, lr}
mov r3, r0
mov r0, r1
mov r1, r3
mov r3, #2 /* SRC_IS_USER */
ldr r4, .L_arm_memcpy
mov lr, pc
ldr pc, [r4]
cmp r0, #0
ldmfd sp!, {r0-r2, r4, lr}
moveq r0, #0
RETeq
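/*
 * The fast path above is roughly the following C (a sketch only; the
 * helper's argument order is inferred from the register shuffle
 * above, and SRC_IS_USER == 2 as the inline comment notes):
 *
 *	if (_arm_memcpy != NULL && len >= _min_memcpy_size &&
 *	    _arm_memcpy(kaddr, udaddr, len, SRC_IS_USER) == 0)
 *		return (0);
 *	... otherwise fall through to .Lnormal ...
 */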
.Lnormal:
SAVE_REGS
#ifdef MULTIPROCESSOR
/* XXX Probably not appropriate for non-Hydra SMPs */
stmfd sp!, {r0-r2, r14}
bl _C_LABEL(cpu_number)
ldr r4, .Lcpu_info
ldr r4, [r4, r0, lsl #2]
ldr r4, [r4, #CI_CURPCB]
ldmfd sp!, {r0-r2, r14}
#else
ldr r4, .Lcurpcb
ldr r4, [r4]
#endif
ldr r5, [r4, #PCB_ONFAULT]
adr r3, .Lcopyfault
str r3, [r4, #PCB_ONFAULT]
PREFETCH(r0, 0)
PREFETCH(r1, 0)
/*
* If there are fewer than 8 bytes, take the slow path.
*/
cmp r2, #0x08
blt .Licleanup
/*
* Align destination to word boundary.
*/
and r6, r1, #0x3
ldr pc, [pc, r6, lsl #2]
b .Lialend
.word .Lialend
.word .Lial3
.word .Lial2
.word .Lial1
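/*
 * The "ldr pc, [pc, r6, lsl #2]" above is a computed jump: reading pc
 * on ARM yields the address of the current instruction + 8, which is
 * exactly where the .word table starts (just past "b .Lialend").
 * r6 = (dst & 3) thus selects .Lialend (already aligned) or an entry
 * that copies 3, 2 or 1 byte(s) to word-align the destination.
 */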
.Lial3: ldrbt r6, [r0], #1
sub r2, r2, #1
strb r6, [r1], #1
.Lial2: ldrbt r7, [r0], #1
sub r2, r2, #1
strb r7, [r1], #1
.Lial1: ldrbt r6, [r0], #1
sub r2, r2, #1
strb r6, [r1], #1
.Lialend:
/*
* If only a few bytes are left, finish them on the slow path.
*/
cmp r2, #0x08
blt .Licleanup
/*
* If source is not aligned, finish slow.
*/
ands r3, r0, #0x03
bne .Licleanup
cmp r2, #0x60 /* Must be > 0x5f for unrolled cacheline */
blt .Licleanup8
/*
* Align destination to cacheline boundary.
* If source and destination are nicely aligned, this can be a big
* win. If not, it's still cheaper to copy in groups of 32 even if
* we don't get the nice cacheline alignment.
*/
and r6, r1, #0x1f
ldr pc, [pc, r6]
b .Licaligned
.word .Licaligned
.word .Lical28
.word .Lical24
.word .Lical20
.word .Lical16
.word .Lical12
.word .Lical8
.word .Lical4
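/*
 * No "lsl #2" is needed this time: the destination is already
 * word-aligned, so r6 = (dst & 0x1f) is a multiple of 4 and indexes
 * the table directly. Each entry falls through those below it,
 * copying 4-28 bytes to reach the next 32-byte boundary.
 */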
.Lical28: ldrt r6, [r0], #4
sub r2, r2, #4
str r6, [r1], #4
.Lical24: ldrt r7, [r0], #4
sub r2, r2, #4
str r7, [r1], #4
.Lical20: ldrt r6, [r0], #4
sub r2, r2, #4
str r6, [r1], #4
.Lical16: ldrt r7, [r0], #4
sub r2, r2, #4
str r7, [r1], #4
.Lical12: ldrt r6, [r0], #4
sub r2, r2, #4
str r6, [r1], #4
.Lical8: ldrt r7, [r0], #4
sub r2, r2, #4
str r7, [r1], #4
.Lical4: ldrt r6, [r0], #4
sub r2, r2, #4
str r6, [r1], #4
/*
* We start with > 0x40 bytes to copy (>= 0x60 got us into this
* part of the code, and we may have knocked that down by as much
* as 0x1c getting aligned).
*
* This loop basically works out to:
* do {
* prefetch-next-cacheline(s)
* bytes -= 0x20;
* copy cacheline
* } while (bytes >= 0x40);
* bytes -= 0x20;
* copy cacheline
*/
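/*
 * Note the asymmetry in the loop below: the user side is read with
 * ldrt, which performs each access with user-mode permissions (and so
 * faults exactly as a user access would), while the kernel side can
 * be written in bulk with stmia. ldrt/strt have no multiple-register
 * form, hence the word-at-a-time user accesses.
 */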
.Licaligned:
PREFETCH(r0, 32)
PREFETCH(r1, 32)
sub r2, r2, #0x20
/* Copy a cacheline */
ldrt r10, [r0], #4
ldrt r11, [r0], #4
ldrt r6, [r0], #4
ldrt r7, [r0], #4
ldrt r8, [r0], #4
ldrt r9, [r0], #4
stmia r1!, {r10-r11}
ldrt r10, [r0], #4
ldrt r11, [r0], #4
stmia r1!, {r6-r11}
cmp r2, #0x40
bge .Licaligned
sub r2, r2, #0x20
/* Copy a cacheline */
ldrt r10, [r0], #4
ldrt r11, [r0], #4
ldrt r6, [r0], #4
ldrt r7, [r0], #4
ldrt r8, [r0], #4
ldrt r9, [r0], #4
stmia r1!, {r10-r11}
ldrt r10, [r0], #4
ldrt r11, [r0], #4
stmia r1!, {r6-r11}
cmp r2, #0x08
blt .Liprecleanup
.Licleanup8:
ldrt r8, [r0], #4
ldrt r9, [r0], #4
sub r2, r2, #8
stmia r1!, {r8, r9}
cmp r2, #8
bge .Licleanup8
.Liprecleanup:
/*
* If we're done, bail.
*/
cmp r2, #0
beq .Lout
.Licleanup:
and r6, r2, #0x3
ldr pc, [pc, r6, lsl #2]
b .Licend
.word .Lic4
.word .Lic1
.word .Lic2
.word .Lic3
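/*
 * This table dispatches on (len & 3) so the first pass copies the
 * residue and leaves len a multiple of 4: .Lic1/.Lic2/.Lic3 copy 1, 2
 * or 3 bytes, while .Lic4 (taken when len is a nonzero multiple of 4)
 * falls through all four slots. .Licend loops back until len reaches
 * zero.
 */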
.Lic4: ldrbt r6, [r0], #1
sub r2, r2, #1
strb r6, [r1], #1
.Lic3: ldrbt r7, [r0], #1
sub r2, r2, #1
strb r7, [r1], #1
.Lic2: ldrbt r6, [r0], #1
sub r2, r2, #1
strb r6, [r1], #1
.Lic1: ldrbt r7, [r0], #1
subs r2, r2, #1
strb r7, [r1], #1
.Licend:
bne .Licleanup
.Liout:
mov r0, #0
str r5, [r4, #PCB_ONFAULT]
RESTORE_REGS
RET
.Lcopyfault:
mov r0, #14 /* EFAULT */
str r5, [r4, #PCB_ONFAULT]
RESTORE_REGS
RET
/*
* r0 = kernel space address
* r1 = user space address
* r2 = length
*
* Copies bytes from kernel space to user space
*
* We save/restore r4-r11 because the ABI treats them as callee-saved;
* here they are used as scratch registers.
*/
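/*
 * A C-level sketch of the contract (the real prototype is in
 * sys/systm.h):
 *
 *	int copyout(const void *kaddr, void *udaddr, size_t len);
 *
 * Returns 0 on success, or EFAULT if a fault is taken while writing
 * to udaddr.
 */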
ENTRY(copyout)
/* Quick exit if length is zero */
teq r2, #0
moveq r0, #0
RETeq
ldr r3, .L_arm_memcpy
ldr r3, [r3]
cmp r3, #0
beq .Lnormale
ldr r3, .L_min_memcpy_size
ldr r3, [r3]
cmp r2, r3
blt .Lnormale
stmfd sp!, {r0-r2, r4, lr}
mov r3, r0
mov r0, r1
mov r1, r3
mov r3, #1 /* DST_IS_USER */
ldr r4, .L_arm_memcpy
mov lr, pc
ldr pc, [r4]
cmp r0, #0
ldmfd sp!, {r0-r2, r4, lr}
moveq r0, #0
RETeq
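/*
 * Same _arm_memcpy fast path as in copyin above, except the flag is
 * DST_IS_USER (1), since it is now the destination that lives in user
 * space.
 */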
.Lnormale:
SAVE_REGS
#ifdef MULTIPROCESSOR
/* XXX Probably not appropriate for non-Hydra SMPs */
stmfd sp!, {r0-r2, r14}
bl _C_LABEL(cpu_number)
ldr r4, .Lcpu_info
ldr r4, [r4, r0, lsl #2]
ldr r4, [r4, #CI_CURPCB]
ldmfd sp!, {r0-r2, r14}
#else
ldr r4, .Lcurpcb
ldr r4, [r4]
#endif
ldr r5, [r4, #PCB_ONFAULT]
adr r3, .Lcopyfault
str r3, [r4, #PCB_ONFAULT]
PREFETCH(r0, 0)
PREFETCH(r1, 0)
/*
* If there are fewer than 8 bytes, take the slow path.
*/
cmp r2, #0x08
blt .Lcleanup
/*
* Align destination to word boundary.
*/
and r6, r1, #0x3
ldr pc, [pc, r6, lsl #2]
b .Lalend
.word .Lalend
.word .Lal3
.word .Lal2
.word .Lal1
.Lal3: ldrb r6, [r0], #1
sub r2, r2, #1
strbt r6, [r1], #1
.Lal2: ldrb r7, [r0], #1
sub r2, r2, #1
strbt r7, [r1], #1
.Lal1: ldrb r6, [r0], #1
sub r2, r2, #1
strbt r6, [r1], #1
.Lalend:
/*
* If only a few bytes are left, finish them on the slow path.
*/
cmp r2, #0x08
blt .Lcleanup
/*
* If source is not aligned, finish slow.
*/
ands r3, r0, #0x03
bne .Lcleanup
cmp r2, #0x60 /* Must be > 0x5f for unrolled cacheline */
blt .Lcleanup8
/*
* Align source & destination to cacheline boundary.
*/
and r6, r1, #0x1f
ldr pc, [pc, r6]
b .Lcaligned
.word .Lcaligned
.word .Lcal28
.word .Lcal24
.word .Lcal20
.word .Lcal16
.word .Lcal12
.word .Lcal8
.word .Lcal4
.Lcal28: ldr r6, [r0], #4
sub r2, r2, #4
strt r6, [r1], #4
.Lcal24: ldr r7, [r0], #4
sub r2, r2, #4
strt r7, [r1], #4
.Lcal20: ldr r6, [r0], #4
sub r2, r2, #4
strt r6, [r1], #4
.Lcal16: ldr r7, [r0], #4
sub r2, r2, #4
strt r7, [r1], #4
.Lcal12: ldr r6, [r0], #4
sub r2, r2, #4
strt r6, [r1], #4
.Lcal8: ldr r7, [r0], #4
sub r2, r2, #4
strt r7, [r1], #4
.Lcal4: ldr r6, [r0], #4
sub r2, r2, #4
strt r6, [r1], #4
/*
* We start with > 0x40 bytes to copy (>= 0x60 got us into this
* part of the code, and we may have knocked that down by as much
* as 0x1c getting aligned).
*
* This loop basically works out to:
* do {
* prefetch-next-cacheline(s)
* bytes -= 0x20;
* copy cacheline
* } while (bytes >= 0x40);
* bytes -= 0x20;
* copy cacheline
*/
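/*
 * Mirror image of copyin's cacheline loop: the kernel-space source is
 * read in bulk with ldmia, while every store to user space goes
 * through strt so it is permission-checked a word at a time.
 */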
.Lcaligned:
PREFETCH(r0, 32)
PREFETCH(r1, 32)
sub r2, r2, #0x20
/* Copy a cacheline */
ldmia r0!, {r6-r11}
strt r6, [r1], #4
strt r7, [r1], #4
ldmia r0!, {r6-r7}
strt r8, [r1], #4
strt r9, [r1], #4
strt r10, [r1], #4
strt r11, [r1], #4
strt r6, [r1], #4
strt r7, [r1], #4
cmp r2, #0x40
bge .Lcaligned
sub r2, r2, #0x20
/* Copy a cacheline */
ldmia r0!, {r6-r11}
strt r6, [r1], #4
strt r7, [r1], #4
ldmia r0!, {r6-r7}
strt r8, [r1], #4
strt r9, [r1], #4
strt r10, [r1], #4
strt r11, [r1], #4
strt r6, [r1], #4
strt r7, [r1], #4
cmp r2, #0x08
blt .Lprecleanup
.Lcleanup8:
ldmia r0!, {r8-r9}
sub r2, r2, #8
strt r8, [r1], #4
strt r9, [r1], #4
cmp r2, #8
bge .Lcleanup8
.Lprecleanup:
/*
* If we're done, bail.
*/
cmp r2, #0
beq .Lout
.Lcleanup:
and r6, r2, #0x3
ldr pc, [pc, r6, lsl #2]
b .Lcend
.word .Lc4
.word .Lc1
.word .Lc2
.word .Lc3
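/* Residual-byte dispatch; see the note at copyin's .Licleanup table. */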
.Lc4: ldrb r6, [r0], #1
sub r2, r2, #1
strbt r6, [r1], #1
.Lc3: ldrb r7, [r0], #1
sub r2, r2, #1
strbt r7, [r1], #1
.Lc2: ldrb r6, [r0], #1
sub r2, r2, #1
strbt r6, [r1], #1
.Lc1: ldrb r7, [r0], #1
subs r2, r2, #1
strbt r7, [r1], #1
.Lcend:
bne .Lcleanup
.Lout:
mov r0, #0
str r5, [r4, #PCB_ONFAULT]
RESTORE_REGS
RET
#endif
/*
* int badaddr_read_1(const uint8_t *src, uint8_t *dest)
*
* Copies a single 8-bit value from src to dest, returning 0 on success,
* else EFAULT if a page fault occurred.
*/
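/*
 * All three badaddr_read_* routines use the same pcb_onfault pattern,
 * roughly (a sketch, not the literal control flow):
 *
 *	saved = pcb->pcb_onfault;
 *	pcb->pcb_onfault = &&recover;	// label "1:" below
 *	tmp = *src;			// may fault
 *	*dest = tmp;
 *	r0 = 0;
 * recover:
 *	pcb->pcb_onfault = saved;
 *	return r0;
 *
 * On a fault the data abort handler sees pcb_onfault set and resumes
 * at that address with r0 holding the error, so the routine returns
 * EFAULT instead. The nops around the access appear to be padding to
 * keep the faulting instruction well inside the protected window.
 */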
ENTRY(badaddr_read_1)
#ifdef MULTIPROCESSOR
/* XXX Probably not appropriate for non-Hydra SMPs */
stmfd sp!, {r0-r1, r14}
bl _C_LABEL(cpu_number)
ldr r2, .Lcpu_info
ldr r2, [r2, r0, lsl #2]
ldr r2, [r2, #CI_CURPCB]
ldmfd sp!, {r0-r1, r14}
#else
ldr r2, .Lcurpcb
ldr r2, [r2]
#endif
ldr ip, [r2, #PCB_ONFAULT]
adr r3, 1f
str r3, [r2, #PCB_ONFAULT]
nop
nop
nop
ldrb r3, [r0]
nop
nop
nop
strb r3, [r1]
mov r0, #0 /* No fault */
1: str ip, [r2, #PCB_ONFAULT]
RET
/*
* int badaddr_read_2(const uint16_t *src, uint16_t *dest)
*
* Copies a single 16-bit value from src to dest, returning 0 on success,
* else EFAULT if a page fault occurred.
*/
ENTRY(badaddr_read_2)
#ifdef MULTIPROCESSOR
/* XXX Probably not appropriate for non-Hydra SMPs */
stmfd sp!, {r0-r1, r14}
bl _C_LABEL(cpu_number)
ldr r2, .Lcpu_info
ldr r2, [r2, r0, lsl #2]
ldr r2, [r2, #CI_CURPCB]
ldmfd sp!, {r0-r1, r14}
#else
ldr r2, .Lcurpcb
ldr r2, [r2]
#endif
ldr ip, [r2, #PCB_ONFAULT]
adr r3, 1f
str r3, [r2, #PCB_ONFAULT]
nop
nop
nop
ldrh r3, [r0]
nop
nop
nop
strh r3, [r1]
mov r0, #0 /* No fault */
1: str ip, [r2, #PCB_ONFAULT]
RET
/*
* int badaddr_read_4(const uint32_t *src, uint32_t *dest)
*
* Copies a single 32-bit value from src to dest, returning 0 on success,
* else EFAULT if a page fault occurred.
*/
ENTRY(badaddr_read_4)
#ifdef MULTIPROCESSOR
/* XXX Probably not appropriate for non-Hydra SMPs */
stmfd sp!, {r0-r1, r14}
bl _C_LABEL(cpu_number)
ldr r2, .Lcpu_info
ldr r2, [r2, r0, lsl #2]
ldr r2, [r2, #CI_CURPCB]
ldmfd sp!, {r0-r1, r14}
#else
ldr r2, .Lcurpcb
ldr r2, [r2]
#endif
ldr ip, [r2, #PCB_ONFAULT]
adr r3, 1f
str r3, [r2, #PCB_ONFAULT]
nop
nop
nop
ldr r3, [r0]
nop
nop
nop
str r3, [r1]
mov r0, #0 /* No fault */
1: str ip, [r2, #PCB_ONFAULT]
RET