e16c18650c
For copies shorter than 512 bytes, the data is copied using plain ld/std
instructions. For 512 bytes or more, the copy is done in 3 phases:

Phase 1: copy from the src buffer until it's aligned at a 16-byte boundary
Phase 2: copy as many aligned 64-byte blocks from the src buffer as possible
Phase 3: copy the remaining data, if any

In phase 2, this code uses VSX instructions when available. Otherwise, it
uses ldx/stdx.

Submitted by:	Luis Pires <lffpires_ruabrasil.org> (original version)
Reviewed by:	jhibbits
Differential Revision:	https://reviews.freebsd.org/D15118
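For illustration only, here is a minimal C sketch of the same three-phase
strategy (the helper name copy3 is hypothetical; plain memcpy stands in for
the VSX/ldx-stdx block copy, and the short-copy fast path, the 512-byte
threshold, and the backward path for overlapping buffers are all omitted):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define BLOCK_SIZE	64	/* matches BLOCK_SIZE in the assembly below */

/* Hypothetical reference version; not the routine shipped below. */
static void *
copy3(void *dst, const void *src, size_t len)
{
	uint8_t *d = dst;
	const uint8_t *s = src;

	/* Phase 1: byte-copy until src reaches a 16-byte boundary. */
	while (len > 0 && ((uintptr_t)s & 15) != 0) {
		*d++ = *s++;
		len--;
	}
	/* Phase 2: copy whole 64-byte blocks from the aligned src. */
	while (len >= BLOCK_SIZE) {
		memcpy(d, s, BLOCK_SIZE);	/* VSX or ldx/stdx in the real code */
		d += BLOCK_SIZE;
		s += BLOCK_SIZE;
		len -= BLOCK_SIZE;
	}
	/* Phase 3: copy whatever remains. */
	while (len > 0) {
		*d++ = *s++;
		len--;
	}
	return (dst);
}

int
main(void)
{
	static uint8_t src[1000], dst[1000];

	for (size_t i = 0; i < sizeof(src); i++)
		src[i] = (uint8_t)i;
	copy3(dst, src, sizeof(src));
	assert(memcmp(dst, src, sizeof(src)) == 0);
	return (0);
}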
/*-
 * Copyright (c) 2018 Instituto de Pesquisas Eldorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the names of its contributors may
 *    be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <machine/asm.h>
__FBSDID("$FreeBSD$");

#define BLOCK_SIZE_BITS			6
#define BLOCK_SIZE			(1 << BLOCK_SIZE_BITS)
#define BLOCK_SIZE_MASK			(BLOCK_SIZE - 1)

#define MULTI_PHASE_THRESHOLD		512

#ifndef FN_NAME
#ifdef MEMMOVE
#define FN_NAME	__memmove
WEAK_REFERENCE(__memmove, memmove);
#else
#define FN_NAME	__bcopy
WEAK_REFERENCE(__bcopy, bcopy);
#endif
#endif

/*
 * r3: dst
 * r4: src
 * r5: len
 */

ENTRY(FN_NAME)
	cmpld	%r3, %r4		/* src == dst? nothing to do */
	beqlr-
	cmpdi	%r5, 0			/* len == 0? nothing to do */
	beqlr-

#ifdef MEMMOVE
	std	%r3, -8(%r1)		/* save dst */
#else	/* bcopy: swap src/dst */
	mr	%r0, %r3
	mr	%r3, %r4
	mr	%r4, %r0
#endif

	cmpldi	%r5, MULTI_PHASE_THRESHOLD
	bge	.Lmulti_phase

	/* align src */
	cmpd	%r4, %r3		/* forward or backward copy? */
	blt	.Lbackward_align

	.align 5
.Lalign:
	andi.	%r0, %r4, 15
	beq	.Lsingle_copy
	lbz	%r0, 0(%r4)
	addi	%r4, %r4, 1
	stb	%r0, 0(%r3)
	addi	%r3, %r3, 1
	addi	%r5, %r5, -1
	cmpdi	%r5, 0
	beq-	.Ldone
	b	.Lalign

.Lbackward_align:
	/* advance src and dst to end (past last byte) */
	add	%r3, %r3, %r5
	add	%r4, %r4, %r5
	.align 5
.Lbackward_align_loop:
	andi.	%r0, %r4, 15
	beq	.Lbackward_single_copy
	lbzu	%r0, -1(%r4)
	addi	%r5, %r5, -1
	stbu	%r0, -1(%r3)
	cmpdi	%r5, 0
	beq-	.Ldone
	b	.Lbackward_align_loop

.Lsingle_copy:
	/* forward copy */
	li	%r0, 1
	li	%r8, 16
	li	%r9, 0
	b	.Lsingle_phase

.Lbackward_single_copy:
	/* backward copy */
	li	%r0, -1
	li	%r8, -16
	li	%r9, -15
	/* point src and dst to last byte */
	addi	%r3, %r3, -1
	addi	%r4, %r4, -1

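	/*
	 * Single-phase parameters at this point:
	 *   r0 = byte step (+1 forward, -1 backward)
	 *   r8 = 16-byte step (+16 forward, -16 backward)
	 *   r9 = pre/post adjustment so the ld/std pair below always
	 *        addresses a chunk at offsets 0 and 8 (0 forward; -15
	 *        backward, since src/dst point at the chunk's last byte)
	 */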
.Lsingle_phase:
	srdi.	%r6, %r5, 4		/* number of 16-byte chunks */
	beq	.Lsingle_1

	/* pre-adjustment */
	add	%r3, %r3, %r9
	add	%r4, %r4, %r9

	mtctr	%r6
	.align 5
.Lsingle_16_loop:
	ld	%r6, 0(%r4)
	ld	%r7, 8(%r4)
	add	%r4, %r4, %r8
	std	%r6, 0(%r3)
	std	%r7, 8(%r3)
	add	%r3, %r3, %r8
	bdnz	.Lsingle_16_loop

	/* post-adjustment */
	sub	%r3, %r3, %r9
	sub	%r4, %r4, %r9

.Lsingle_1:
	andi.	%r6, %r5, 0x0f		/* number of leftover bytes */
	beq	.Ldone			/* none? done */

	mtctr	%r6
	.align 5
.Lsingle_1_loop:
	lbz	%r6, 0(%r4)
	add	%r4, %r4, %r0		/* increment */
	stb	%r6, 0(%r3)
	add	%r3, %r3, %r0		/* increment */
	bdnz	.Lsingle_1_loop

.Ldone:
#ifdef MEMMOVE
	ld	%r3, -8(%r1)		/* restore dst */
#endif
	blr

.Lmulti_phase:
	/* set up multi-phase copy parameters */

	/* r7 = bytes before the aligned section of the buffer */
	andi.	%r6, %r4, 15
	subfic	%r7, %r6, 16
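	/* note: if src is already 16-byte aligned, r7 is 16, not 0 */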
	/* r8 = bytes in and after the aligned section of the buffer */
	sub	%r8, %r5, %r7
	/* r9 = bytes after the aligned section of the buffer */
	andi.	%r9, %r8, BLOCK_SIZE_MASK
	/* r10 = BLOCKS in the aligned section of the buffer */
	srdi	%r10, %r8, BLOCK_SIZE_BITS

	/* forward or backward copy? */
	cmpd	%r4, %r3
	blt	.Lbackward_multi_copy

	/* set up forward copy parameters */
	std	%r7, -32(%r1)		/* bytes to copy in phase 1 */
	std	%r10, -40(%r1)		/* BLOCKS to copy in phase 2 */
	std	%r9, -48(%r1)		/* bytes to copy in phase 3 */

	li	%r0, 1			/* increment for phases 1 and 3 */
	li	%r5, BLOCK_SIZE		/* increment for phase 2 */

	/* op offsets for phase 2 */
	li	%r7, 0
	li	%r8, 16
	li	%r9, 32
	li	%r10, 48

	std	%r8, -16(%r1)		/* 16-byte increment (16) */
	std	%r7, -24(%r1)		/* 16-byte pre/post adjustment (0) */

	b	.Lphase1

.Lbackward_multi_copy:
	/* set up backward copy parameters */
	std	%r9, -32(%r1)		/* bytes to copy in phase 1 */
	std	%r10, -40(%r1)		/* BLOCKS to copy in phase 2 */
	std	%r7, -48(%r1)		/* bytes to copy in phase 3 */

	li	%r0, -1			/* increment for phases 1 and 3 */
	add	%r6, %r5, %r0		/* r6 = len - 1 */
	li	%r5, -BLOCK_SIZE	/* increment for phase 2 */
	/* advance src and dst to the last position */
	add	%r3, %r3, %r6
	add	%r4, %r4, %r6

	/* op offsets for phase 2 */
	li	%r7, -15
	li	%r8, -31
	li	%r9, -47
	li	%r10, -63

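	/*
	 * With src/dst pointing at the last byte of a 64-byte block,
	 * offsets -15, -31, -47, and -63 address the first byte of each
	 * of the block's four 16-byte quarters, last quarter first.
	 */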
	add	%r6, %r7, %r0		/* r6 = -16 */
	std	%r6, -16(%r1)		/* 16-byte increment (-16) */
	std	%r7, -24(%r1)		/* 16-byte pre/post adjustment (-15) */

.Lphase1:
	ld	%r6, -32(%r1)		/* bytes to copy in phase 1 */
	cmpldi	%r6, 0			/* r6 == 0? skip phase 1 */
	beq+	.Lphase2

	mtctr	%r6
	.align 5
.Lphase1_loop:
	lbz	%r6, 0(%r4)
	add	%r4, %r4, %r0		/* phase 1 increment */
	stb	%r6, 0(%r3)
	add	%r3, %r3, %r0		/* phase 1 increment */
	bdnz	.Lphase1_loop

.Lphase2:
	ld	%r6, -40(%r1)		/* BLOCKS to copy in phase 2 */
	cmpldi	%r6, 0			/* r6 == 0? skip phase 2 */
	beq	.Lphase3

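	/*
	 * An including file may define FN_PHASE2 to supply its own
	 * 64-byte block copy (per the commit message, a VSX version
	 * when available); the fallback below uses paired ldx/stdx.
	 */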
#ifdef FN_PHASE2
	FN_PHASE2
#else
	/* save registers */
	std	%r14, -56(%r1)
	std	%r15, -64(%r1)
	std	%r16, -72(%r1)
	std	%r17, -80(%r1)
	std	%r18, -88(%r1)
	std	%r19, -96(%r1)
	std	%r20, -104(%r1)
	std	%r21, -112(%r1)

	addi	%r18, %r7, 8
	addi	%r19, %r8, 8
	addi	%r20, %r9, 8
	addi	%r21, %r10, 8

	mtctr	%r6
	.align 5
.Lphase2_loop:
	ldx	%r14, %r7, %r4
	ldx	%r15, %r18, %r4
	ldx	%r16, %r8, %r4
	ldx	%r17, %r19, %r4
	stdx	%r14, %r7, %r3
	stdx	%r15, %r18, %r3
	stdx	%r16, %r8, %r3
	stdx	%r17, %r19, %r3

	ldx	%r14, %r9, %r4
	ldx	%r15, %r20, %r4
	ldx	%r16, %r10, %r4
	ldx	%r17, %r21, %r4
	stdx	%r14, %r9, %r3
	stdx	%r15, %r20, %r3
	stdx	%r16, %r10, %r3
	stdx	%r17, %r21, %r3

	add	%r4, %r4, %r5		/* phase 2 increment */
	add	%r3, %r3, %r5		/* phase 2 increment */

	bdnz	.Lphase2_loop

	/* restore registers */
	ld	%r14, -56(%r1)
	ld	%r15, -64(%r1)
	ld	%r16, -72(%r1)
	ld	%r17, -80(%r1)
	ld	%r18, -88(%r1)
	ld	%r19, -96(%r1)
	ld	%r20, -104(%r1)
	ld	%r21, -112(%r1)
#endif

.Lphase3:
	/* load registers for transitioning into the single-phase logic */
	ld	%r5, -48(%r1)		/* bytes to copy in phase 3 */
	ld	%r8, -16(%r1)		/* 16-byte increment */
	ld	%r9, -24(%r1)		/* 16-byte pre/post adjustment */
	b	.Lsingle_phase

END(FN_NAME)

	.section .note.GNU-stack,"",%progbits