Merge "Remove arm assembler not referenced from any makefile."
commit ad01c98319
3 changed files with 0 additions and 1617 deletions
@@ -1,686 +0,0 @@
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <private/bionic_asm.h>

#if defined(__ARM_NEON__) && !defined(ARCH_ARM_USE_NON_NEON_MEMCPY)

        .text
        .fpu    neon

#ifdef HAVE_32_BYTE_CACHE_LINE
/* a prefetch distance of 2 cache-lines */
#define CACHE_LINE_SIZE 32
#else
/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE 64
#endif

ENTRY(memcpy)
        .save   {r0, lr}
        /* start preloading as early as possible */
        pld     [r1, #(CACHE_LINE_SIZE * 0)]
        stmfd   sp!, {r0, lr}
        pld     [r1, #(CACHE_LINE_SIZE * 1)]

/* If Neon supports unaligned access then remove the align code,
 * unless a size limit has been specified.
 */
#ifndef NEON_UNALIGNED_ACCESS
        /* do we have at least 16 bytes to copy (needed for the alignment below)? */
        cmp     r2, #16
        blo     5f

        /* check if the buffers are aligned with each other; if so, run the ARM-only version */
        eor     r3, r0, r1
        ands    r3, r3, #0x3
        beq     11f

        /* align destination to a cache line for the write-buffer */
        rsb     r3, r0, #0
        ands    r3, r3, #0xF
        beq     2f

        /* copy up to 15 bytes (count in r3) */
        sub     r2, r2, r3
        movs    ip, r3, lsl #31
        ldrmib  lr, [r1], #1
        strmib  lr, [r0], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1
        movs    ip, r3, lsl #29
        bge     1f
        // copies 4 bytes, destination 32-bits aligned
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc     2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!
2:
        /* preload immediately the next cache line, which we may need */
        pld     [r1, #(CACHE_LINE_SIZE * 0)]
        pld     [r1, #(CACHE_LINE_SIZE * 1)]

#ifdef HAVE_32_BYTE_CACHE_LINE
        /* make sure we have at least 32 bytes to copy */
        subs    r2, r2, #32
        blo     4f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE;
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld     [r1, #(PREFETCH_DISTANCE)]

1:      /* The main loop copies 32 bytes at a time */
        vld1.8  {d0 - d3}, [r1]!
        pld     [r1, #(PREFETCH_DISTANCE)]
        subs    r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
        bhs     1b
#else
        /* make sure we have at least 64 bytes to copy */
        subs    r2, r2, #64
        blo     2f

        /* preload all the cache lines we need. */
        pld     [r1, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 3)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8  {d0 - d3}, [r1]!
        vld1.8  {d4 - d7}, [r1]!
#ifdef HAVE_32_BYTE_CACHE_LINE
        pld     [r1, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 3)]
#else
        pld     [r1, #(CACHE_LINE_SIZE * 3)]
#endif
        subs    r2, r2, #64
        vst1.8  {d0 - d3}, [r0, :128]!
        vst1.8  {d4 - d7}, [r0, :128]!
        bhs     1b

2:      /* fix up the remaining count and make sure we have >= 32 bytes left */
        add     r2, r2, #64
        subs    r2, r2, #32
        blo     4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8  {d0 - d3}, [r1]!
        subs    r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
        bhs     3b
#endif
4:      /* less than 32 left */
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // copies 16 bytes, 128-bits aligned
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0, :128]!
5:      /* copy up to 15 bytes (count in r2) */
        movs    ip, r2, lsl #29
        bcc     1f
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs    ip, r2, lsl #31
        ldrmib  r3, [r1], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strmib  r3, [r0], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1

        ldmfd   sp!, {r0, lr}
        bx      lr

#else /* NEON_UNALIGNED_ACCESS */

        // Check that count is at least 16 bytes, needed for the alignment code.
        cmp     r2, #16
        blo     5f

#ifdef NEON_MEMCPY_ALIGNMENT_DIVIDER
        /* Check the upper size limit for Neon unaligned memory access in memcpy */
#if NEON_MEMCPY_ALIGNMENT_DIVIDER >= 16
        cmp     r2, #NEON_MEMCPY_ALIGNMENT_DIVIDER
        blo     3f
#endif
        /* check if the buffers are aligned with each other; if so, run the ARM-only version */
        eor     r3, r0, r1
        ands    r3, r3, #0x3
        beq     11f

        /* align destination to 16 bytes for the write-buffer */
        rsb     r3, r0, #0
        ands    r3, r3, #0xF
        beq     3f

        /* copy up to 15 bytes (count in r3) */
        sub     r2, r2, r3
        movs    ip, r3, lsl #31
        ldrmib  lr, [r1], #1
        strmib  lr, [r0], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1
        movs    ip, r3, lsl #29
        bge     1f
        // copies 4 bytes, destination 32-bits aligned
        vld1.32 {d0[0]}, [r1]!
        vst1.32 {d0[0]}, [r0, :32]!
1:      bcc     2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!
2:
        /* preload immediately the next cache line, which we may need */
        pld     [r1, #(CACHE_LINE_SIZE * 0)]
        pld     [r1, #(CACHE_LINE_SIZE * 1)]
3:
#endif
        /* make sure we have at least 64 bytes to copy */
        subs    r2, r2, #64
        blo     2f

        /* preload all the cache lines we need */
        pld     [r1, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 3)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8  {d0 - d3}, [r1]!
        vld1.8  {d4 - d7}, [r1]!
#ifdef HAVE_32_BYTE_CACHE_LINE
        pld     [r1, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 3)]
#else
        pld     [r1, #(CACHE_LINE_SIZE * 3)]
#endif
        subs    r2, r2, #64
        vst1.8  {d0 - d3}, [r0]!
        vst1.8  {d4 - d7}, [r0]!
        bhs     1b

2:      /* fix up the remaining count and make sure we have >= 32 bytes left */
        add     r2, r2, #64
        subs    r2, r2, #32
        blo     4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8  {d0 - d3}, [r1]!
        subs    r2, r2, #32
        vst1.8  {d0 - d3}, [r0]!
        bhs     3b

4:      /* less than 32 left */
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // copies 16 bytes, 128-bits aligned
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0]!
5:      /* copy up to 15 bytes (count in r2) */
        movs    ip, r2, lsl #29
        bcc     1f
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld1.32 {d0[0]}, [r1]!
        vst1.32 {d0[0]}, [r0]!
2:      movs    ip, r2, lsl #31
        ldrmib  r3, [r1], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strmib  r3, [r0], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1

        ldmfd   sp!, {r0, lr}
        bx      lr
#endif /* NEON_UNALIGNED_ACCESS */
11:
        /* Simple ARM-only copy loop to handle aligned copy operations */
        stmfd   sp!, {r4, r5, r6, r7, r8}
        pld     [r1, #(CACHE_LINE_SIZE * 2)]

        /* Check alignment */
        rsb     r3, r1, #0
        ands    r3, #3
        beq     2f

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs    r12, r3, lsl #31
        sub     r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib  r3, [r1], #1
        ldrcsb  r4, [r1], #1
        ldrcsb  r5, [r1], #1
        strmib  r3, [r0], #1
        strcsb  r4, [r0], #1
        strcsb  r5, [r0], #1
2:
        subs    r2, #32
        blt     5f
        pld     [r1, #(CACHE_LINE_SIZE * 3)]
3:      /* Main copy loop, copying 32 bytes at a time */
        pld     [r1, #(CACHE_LINE_SIZE * 4)]
        ldmia   r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs    r2, r2, #32
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        bge     3b
5:      /* Handle any remaining bytes */
        adds    r2, #32
        beq     6f

        movs    r12, r2, lsl #28
        ldmcsia r1!, {r3, r4, r5, r6}   /* 16 bytes */
        ldmmiia r1!, {r7, r8}           /* 8 bytes */
        stmcsia r0!, {r3, r4, r5, r6}
        stmmiia r0!, {r7, r8}
        movs    r12, r2, lsl #30
        ldrcs   r3, [r1], #4            /* 4 bytes */
        ldrmih  r4, [r1], #2            /* 2 bytes */
        strcs   r3, [r0], #4
        strmih  r4, [r0], #2
        tst     r2, #0x1
        ldrneb  r3, [r1]                /* last byte */
        strneb  r3, [r0]
6:
        ldmfd   sp!, {r4, r5, r6, r7, r8}
        ldmfd   sp!, {r0, pc}
END(memcpy)


#else /* __ARM_ARCH__ < 7 */


/*
 * Optimized memcpy() for ARM.
 *
 * Note that memcpy() always returns the destination pointer,
 * so we have to preserve R0.
 */

ENTRY(memcpy)
        /* The stack must always be 64-bits aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4,
         * which we can use for better pipelining of the reads below.
         */
        .save   {r0, r4, lr}
        stmfd   sp!, {r0, r4, lr}
        /* Make room for r5-r11, which will be spilled later */
        .pad    #28
        sub     sp, sp, #28

        // Preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        pld     [r0, #0]
        pld     [r1, #0]
        pld     [r1, #32]

        /* it simplifies things to take care of len < 4 early */
        cmp     r2, #4
        blo     copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
        rsb     r3, r1, #0
        ands    r3, r3, #3
        beq     src_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs    r12, r3, lsl #31
        sub     r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib  r3, [r1], #1
        ldrcsb  r4, [r1], #1
        ldrcsb  r12, [r1], #1
        strmib  r3, [r0], #1
        strcsb  r4, [r0], #1
        strcsb  r12, [r0], #1

src_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor     r12, r0, r1
        tst     r12, #3
        bne     non_congruent

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb     r3, r0, #0
        ands    r3, r3, #0x1C
        beq     congruent_aligned32
        cmp     r3, r2
        andhi   r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
        movs    r12, r3, lsl #28
        ldmcsia r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia r1!, {r8, r9}           /* 8 bytes */
        stmcsia r0!, {r4, r5, r6, r7}
        stmmiia r0!, {r8, r9}
        tst     r3, #0x4
        ldrne   r10, [r1], #4           /* 4 bytes */
        strne   r10, [r0], #4
        sub     r2, r2, r3

congruent_aligned32:
        /*
         * here source is aligned to 32 bytes.
         */

cached_aligned32:
        subs    r2, r2, #32
        blo     less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the writebuffer will
         * start dumping its content into memory.
         *
         * While all this is going on, we then load a full cache line into
         * 8 registers; this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
        bic     r12, r1, #0x1F
        add     r12, r12, #64

1:      ldmia   r1!, { r4-r11 }
        pld     [r12, #64]
        subs    r2, r2, #32

        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
        // for ARM9 preload will not be safely guarded by the preceding subs.
        // When it is safely guarded, the only possibility of a SIGSEGV here
        // is because the caller overstates the length.
        ldrhi   r3, [r12], #32          /* cheap ARM9 preload */
        stmia   r0!, { r4-r11 }
        bhs     1b

        add     r2, r2, #32

less_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (if not executed, the code below takes
         * about 16 cycles)
         */
        tst     r2, #0x1F
        beq     1f

        /* conditionally copies 0 to 31 bytes */
        movs    r12, r2, lsl #28
        ldmcsia r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia r1!, {r8, r9}           /* 8 bytes */
        stmcsia r0!, {r4, r5, r6, r7}
        stmmiia r0!, {r8, r9}
        movs    r12, r2, lsl #30
        ldrcs   r3, [r1], #4            /* 4 bytes */
        ldrmih  r4, [r1], #2            /* 2 bytes */
        strcs   r3, [r0], #4
        strmih  r4, [r0], #2
        tst     r2, #0x1
        ldrneb  r3, [r1]                /* last byte */
        strneb  r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd   sp!, {r5-r11}
        ldmfd   sp!, {r0, r4, lr}
        bx      lr

/********************************************************************/

non_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp     r2, #4
        blo     copy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb     r5, r0, #0
        and     r5, r5, #3              /* r5 = # bytes in partial words */
        mov     r12, r5, lsl #3         /* r12 = right */
        rsb     lr, r12, #32            /* lr = left */

        /* read the first word */
        ldr     r3, [r1], #4
        sub     r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that the destination
         * becomes aligned to 32 bits (r5 = number of bytes to copy for alignment)
         */
        movs    r5, r5, lsl #31
        strmib  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8

        cmp     r2, #4
        blo     partial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst     r0, #0x1c
        beq     2f
        ldr     r5, [r1], #4
        sub     r2, r2, #4
        orr     r4, r3, r5, lsl lr
        mov     r3, r5, lsr r12
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b
        blo     partial_word_tail

        /* copy 32 bytes at a time */
2:      subs    r2, r2, #32
        blo     less_than_thirtytwo

        /* Use immediate mode for the shifts, because there is an extra cycle
         * for register shifts, which could account for up to 50% of the
         * performance hit.
         */

        cmp     r12, #24
        beq     loop24
        cmp     r12, #8
        beq     loop8

loop16:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, { r5,r6,r7, r8,r9,r10,r11}
        pld     [r1, #64]
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #16
        mov     r4, r4, lsr #16
        orr     r4, r4, r5, lsl #16
        mov     r5, r5, lsr #16
        orr     r5, r5, r6, lsl #16
        mov     r6, r6, lsr #16
        orr     r6, r6, r7, lsl #16
        mov     r7, r7, lsr #16
        orr     r7, r7, r8, lsl #16
        mov     r8, r8, lsr #16
        orr     r8, r8, r9, lsl #16
        mov     r9, r9, lsr #16
        orr     r9, r9, r10, lsl #16
        mov     r10, r10, lsr #16
        orr     r10, r10, r11, lsl #16
        stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov     r3, r11, lsr #16
        bhs     1b
        b       less_than_thirtytwo

loop8:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, { r5,r6,r7, r8,r9,r10,r11}
        pld     [r1, #64]
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #24
        mov     r4, r4, lsr #8
        orr     r4, r4, r5, lsl #24
        mov     r5, r5, lsr #8
        orr     r5, r5, r6, lsl #24
        mov     r6, r6, lsr #8
        orr     r6, r6, r7, lsl #24
        mov     r7, r7, lsr #8
        orr     r7, r7, r8, lsl #24
        mov     r8, r8, lsr #8
        orr     r8, r8, r9, lsl #24
        mov     r9, r9, lsr #8
        orr     r9, r9, r10, lsl #24
        mov     r10, r10, lsr #8
        orr     r10, r10, r11, lsl #24
        stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov     r3, r11, lsr #8
        bhs     1b
        b       less_than_thirtytwo

loop24:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, { r5,r6,r7, r8,r9,r10,r11}
        pld     [r1, #64]
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #8
        mov     r4, r4, lsr #24
        orr     r4, r4, r5, lsl #8
        mov     r5, r5, lsr #24
        orr     r5, r5, r6, lsl #8
        mov     r6, r6, lsr #24
        orr     r6, r6, r7, lsl #8
        mov     r7, r7, lsr #24
        orr     r7, r7, r8, lsl #8
        mov     r8, r8, lsr #24
        orr     r8, r8, r9, lsl #8
        mov     r9, r9, lsr #24
        orr     r9, r9, r10, lsl #8
        mov     r10, r10, lsr #24
        orr     r10, r10, r11, lsl #8
        stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov     r3, r11, lsr #24
        bhs     1b

less_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb     r12, lr, #32            /* we corrupted r12, recompute it */
        add     r2, r2, #32
        cmp     r2, #4
        blo     partial_word_tail

1:      ldr     r5, [r1], #4
        sub     r2, r2, #4
        orr     r4, r3, r5, lsl lr
        mov     r3, r5, lsr r12
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b

partial_word_tail:
        /* we have a partial word in the input buffer */
        movs    r5, lr, lsl #(31-3)
        strmib  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strcsb  r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd   sp, {r5-r11}

copy_last_3_and_return:
        movs    r2, r2, lsl #31         /* copy remaining 0, 1, 2 or 3 bytes */
        ldrmib  r2, [r1], #1
        ldrcsb  r3, [r1], #1
        ldrcsb  r12, [r1]
        strmib  r2, [r0], #1
        strcsb  r3, [r0], #1
        strcsb  r12, [r0]

        /* we're done! restore sp and spilled registers and return */
        add     sp, sp, #28
        ldmfd   sp!, {r0, r4, lr}
        bx      lr
END(memcpy)


#endif /* __ARM_ARCH__ < 7 */
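Note on the non_congruent path above: when source and destination are misaligned relative to each other, the loop reads aligned words from the source and rebuilds the destination byte stream by merging each word's leftover bytes with the next word's bytes (the "shift queue" the comments mention). A minimal little-endian C sketch of that technique follows; the helper name and signature are invented for illustration and are not part of the original code.

    #include <stdint.h>
    #include <stddef.h>

    /* dst is word-aligned; the source byte stream starts right/8 bytes into
     * the first aligned word of src, and src supplies nwords + 1 aligned
     * words.  This mirrors "orr r4, r3, r5, lsl lr" / "mov r3, r5, lsr r12"
     * in the loop16/loop8/loop24 bodies. */
    static void copy_shifted(uint32_t *dst, const uint32_t *src,
                             size_t nwords, unsigned right /* 8, 16 or 24 */) {
        unsigned left = 32 - right;
        uint32_t carry = *src++ >> right;  /* partial word in the shift queue */
        while (nwords--) {
            uint32_t w = *src++;
            *dst++ = carry | (w << left);
            carry = w >> right;
        }
        /* a partial word remains in 'carry'; the assembly flushes it
         * byte-by-byte in partial_word_tail */
    }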
@@ -1,614 +0,0 @@
/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses
    LDRD/STRD support unaligned word accesses
*/

#include <machine/cpu-features.h>
#include <private/bionic_asm.h>

        .syntax unified
        /* This implementation requires ARM state. */
        .arm

#ifdef __ARM_NEON__

        .fpu    neon
        .arch   armv7-a
# define FRAME_SIZE     4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

        .arch   armv6
        .fpu    vfpv2
# define FRAME_SIZE     32
# define USE_VFP

#else
        .arch   armv6
# define FRAME_SIZE     32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics. */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET       8       /* PC pipeline compensation. */
#define INSN_SIZE       4

/* Call parameters. */
#define dstin   r0
#define src     r1
#define count   r2

/* Locals. */
#define tmp1    r3
#define dst     ip
#define tmp2    r10

#ifndef USE_NEON
/* For bulk copies using GP registers. */
#define A_l     r2      /* Call-clobbered. */
#define A_h     r3      /* Call-clobbered. */
#define B_l     r4
#define B_h     r5
#define C_l     r6
#define C_h     r7
#define D_l     r8
#define D_h     r9
#endif

/* Number of lines ahead to pre-fetch data. If you change this the code
   below will need adjustment to compensate. */

#define prefetch_lines  5

#ifdef USE_VFP
        .macro cpy_line_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm

        .macro cpy_tail_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm
#endif

        .p2align 6
ENTRY(memcpy)

        mov     dst, dstin      /* Preserve dstin, we need to return it. */
        cmp     count, #64
        bge     .Lcpy_not_short
        /* Deal with small copies quickly by dropping straight into the
           exit block. */

.Ltail63unaligned:
#ifdef USE_NEON
        and     tmp1, count, #0x38
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        vld1.8  {d0}, [src]!    /* 14 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 12 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 10 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 8 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 6 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 4 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 2 words to go. */
        vst1.8  {d0}, [dst]!

        tst     count, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
#else
        /* Copy up to 15 full words of data. May not be aligned. */
        /* Cannot use VFP for unaligned data. */
        and     tmp1, count, #0x3c
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
        /* Jump directly into the sequence below at the correct offset. */
        add     pc, pc, tmp1, lsl #1

        ldr     tmp1, [src, #-60]       /* 15 words to go. */
        str     tmp1, [dst, #-60]

        ldr     tmp1, [src, #-56]       /* 14 words to go. */
        str     tmp1, [dst, #-56]
        ldr     tmp1, [src, #-52]
        str     tmp1, [dst, #-52]

        ldr     tmp1, [src, #-48]       /* 12 words to go. */
        str     tmp1, [dst, #-48]
        ldr     tmp1, [src, #-44]
        str     tmp1, [dst, #-44]

        ldr     tmp1, [src, #-40]       /* 10 words to go. */
        str     tmp1, [dst, #-40]
        ldr     tmp1, [src, #-36]
        str     tmp1, [dst, #-36]

        ldr     tmp1, [src, #-32]       /* 8 words to go. */
        str     tmp1, [dst, #-32]
        ldr     tmp1, [src, #-28]
        str     tmp1, [dst, #-28]

        ldr     tmp1, [src, #-24]       /* 6 words to go. */
        str     tmp1, [dst, #-24]
        ldr     tmp1, [src, #-20]
        str     tmp1, [dst, #-20]

        ldr     tmp1, [src, #-16]       /* 4 words to go. */
        str     tmp1, [dst, #-16]
        ldr     tmp1, [src, #-12]
        str     tmp1, [dst, #-12]

        ldr     tmp1, [src, #-8]        /* 2 words to go. */
        str     tmp1, [dst, #-8]
        ldr     tmp1, [src, #-4]
        str     tmp1, [dst, #-4]
#endif

        lsls    count, count, #31
        ldrhcs  tmp1, [src], #2
        ldrbne  src, [src]      /* Src is dead, use as a scratch. */
        strhcs  tmp1, [dst], #2
        strbne  src, [dst]
        bx      lr

.Lcpy_not_short:
        /* At least 64 bytes to copy, but don't know the alignment yet. */
        str     tmp2, [sp, #-FRAME_SIZE]!
        and     tmp2, src, #7
        and     tmp1, dst, #7
        cmp     tmp1, tmp2
        bne     .Lcpy_notaligned

#ifdef USE_VFP
        /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
           that the FP pipeline is much better at streaming loads and
           stores. This is outside the critical loop. */
        vmov.f32 s0, s0
#endif

        /* SRC and DST have the same mutual 32-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring DST into full 64-bit alignment. */
        lsls    tmp2, dst, #29
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src], #1
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst], #1

1:
        subs    tmp2, count, #64        /* Use tmp2 for count. */
        blt     .Ltail63aligned

        cmp     tmp2, #512
        bge     .Lcpy_body_long

.Lcpy_body_medium:      /* Count in tmp2. */
#ifdef USE_VFP
1:
        vldr    d0, [src, #0]
        subs    tmp2, tmp2, #64
        vldr    d1, [src, #8]
        vstr    d0, [dst, #0]
        vldr    d0, [src, #16]
        vstr    d1, [dst, #8]
        vldr    d1, [src, #24]
        vstr    d0, [dst, #16]
        vldr    d0, [src, #32]
        vstr    d1, [dst, #24]
        vldr    d1, [src, #40]
        vstr    d0, [dst, #32]
        vldr    d0, [src, #48]
        vstr    d1, [dst, #40]
        vldr    d1, [src, #56]
        vstr    d0, [dst, #48]
        add     src, src, #64
        vstr    d1, [dst, #56]
        add     dst, dst, #64
        bge     1b
        tst     tmp2, #0x3f
        beq     .Ldone

.Ltail63aligned:        /* Count in tmp2. */
        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1

        vldr    d0, [src, #-56]         /* 14 words to go. */
        vstr    d0, [dst, #-56]
        vldr    d0, [src, #-48]         /* 12 words to go. */
        vstr    d0, [dst, #-48]
        vldr    d0, [src, #-40]         /* 10 words to go. */
        vstr    d0, [dst, #-40]
        vldr    d0, [src, #-32]         /* 8 words to go. */
        vstr    d0, [dst, #-32]
        vldr    d0, [src, #-24]         /* 6 words to go. */
        vstr    d0, [dst, #-24]
        vldr    d0, [src, #-16]         /* 4 words to go. */
        vstr    d0, [dst, #-16]
        vldr    d0, [src, #-8]          /* 2 words to go. */
        vstr    d0, [dst, #-8]
#else
        sub     src, src, #8
        sub     dst, dst, #8
1:
        ldrd    A_l, A_h, [src, #8]
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #16]
        strd    A_l, A_h, [dst, #16]
        ldrd    A_l, A_h, [src, #24]
        strd    A_l, A_h, [dst, #24]
        ldrd    A_l, A_h, [src, #32]
        strd    A_l, A_h, [dst, #32]
        ldrd    A_l, A_h, [src, #40]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #48]
        strd    A_l, A_h, [dst, #48]
        ldrd    A_l, A_h, [src, #56]
        strd    A_l, A_h, [dst, #56]
        ldrd    A_l, A_h, [src, #64]!
        strd    A_l, A_h, [dst, #64]!
        subs    tmp2, tmp2, #64
        bge     1b
        tst     tmp2, #0x3f
        bne     1f
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
1:
        add     src, src, #8
        add     dst, dst, #8

.Ltail63aligned:        /* Count in tmp2. */
        /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
           we know that the src and dest are 32-bit aligned so we can use
           LDRD/STRD to improve efficiency. */
        /* TMP2 is now negative, but we don't care about that. The bottom
           six bits still tell us how many bytes are left to copy. */

        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        ldrd    A_l, A_h, [src, #-56]   /* 14 words to go. */
        strd    A_l, A_h, [dst, #-56]
        ldrd    A_l, A_h, [src, #-48]   /* 12 words to go. */
        strd    A_l, A_h, [dst, #-48]
        ldrd    A_l, A_h, [src, #-40]   /* 10 words to go. */
        strd    A_l, A_h, [dst, #-40]
        ldrd    A_l, A_h, [src, #-32]   /* 8 words to go. */
        strd    A_l, A_h, [dst, #-32]
        ldrd    A_l, A_h, [src, #-24]   /* 6 words to go. */
        strd    A_l, A_h, [dst, #-24]
        ldrd    A_l, A_h, [src, #-16]   /* 4 words to go. */
        strd    A_l, A_h, [dst, #-16]
        ldrd    A_l, A_h, [src, #-8]    /* 2 words to go. */
        strd    A_l, A_h, [dst, #-8]

#endif
        tst     tmp2, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
        lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead. */
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src]
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst]

.Ldone:
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr

.Lcpy_body_long:        /* Count in tmp2. */

        /* Long copy. We know that there's at least (prefetch_lines * 64)
           bytes to go. */
#ifdef USE_VFP
        /* Don't use PLD. Instead, read some data in advance of the current
           copy position into a register. This should act like a PLD
           operation but we won't have to repeat the transfer. */

        vldr    d3, [src, #0]
        vldr    d4, [src, #64]
        vldr    d5, [src, #128]
        vldr    d6, [src, #192]
        vldr    d7, [src, #256]

        vldr    d0, [src, #8]
        vldr    d1, [src, #16]
        vldr    d2, [src, #24]
        add     src, src, #32

        subs    tmp2, tmp2, #prefetch_lines * 64 * 2
        blt     2f
1:
        cpy_line_vfp    d3, 0
        cpy_line_vfp    d4, 64
        cpy_line_vfp    d5, 128
        add     dst, dst, #3 * 64
        add     src, src, #3 * 64
        cpy_line_vfp    d6, 0
        cpy_line_vfp    d7, 64
        add     dst, dst, #2 * 64
        add     src, src, #2 * 64
        subs    tmp2, tmp2, #prefetch_lines * 64
        bge     1b

2:
        cpy_tail_vfp    d3, 0
        cpy_tail_vfp    d4, 64
        cpy_tail_vfp    d5, 128
        add     src, src, #3 * 64
        add     dst, dst, #3 * 64
        cpy_tail_vfp    d6, 0
        vstr    d7, [dst, #64]
        vldr    d7, [src, #64]
        vstr    d0, [dst, #64 + 8]
        vldr    d0, [src, #64 + 8]
        vstr    d1, [dst, #64 + 16]
        vldr    d1, [src, #64 + 16]
        vstr    d2, [dst, #64 + 24]
        vldr    d2, [src, #64 + 24]
        vstr    d7, [dst, #64 + 32]
        add     src, src, #96
        vstr    d0, [dst, #64 + 40]
        vstr    d1, [dst, #64 + 48]
        vstr    d2, [dst, #64 + 56]
        add     dst, dst, #128
        add     tmp2, tmp2, #prefetch_lines * 64
        b       .Lcpy_body_medium
#else
        /* Long copy. Use an SMS style loop to maximize the I/O
           bandwidth of the core. We don't have enough spare registers
           to synthesise prefetching, so use PLD operations. */
        /* Pre-bias src and dst. */
        sub     src, src, #8
        sub     dst, dst, #8
        pld     [src, #8]
        pld     [src, #72]
        subs    tmp2, tmp2, #64
        pld     [src, #136]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        pld     [src, #200]
        ldrd    D_l, D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #232]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldrd    D_l, D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldrd    D_l, D_h, [src, #32]
        bcs     2b
        /* Save the remaining bytes and restore the callee-saved regs. */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #40
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        tst     tmp2, #0x3f
        bne     .Ltail63aligned
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
#endif

.Lcpy_notaligned:
        pld     [src]
        pld     [src, #64]
        /* There's at least 64 bytes to copy, but there is no mutual
           alignment. */
        /* Bring DST to 64-bit alignment. */
        lsls    tmp2, dst, #29
        pld     [src, #(2 * 64)]
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrbne  tmp1, [src], #1
        ldrhcs  tmp2, [src], #2
        strbne  tmp1, [dst], #1
        strhcs  tmp2, [dst], #2
1:
        pld     [src, #(3 * 64)]
        subs    count, count, #64
        ldrmi   tmp2, [sp], #FRAME_SIZE
        bmi     .Ltail63unaligned
        pld     [src, #(4 * 64)]

#ifdef USE_NEON
        vld1.8  {d0-d3}, [src]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bmi     2f
1:
        pld     [src, #(4 * 64)]
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vld1.8  {d0-d3}, [src]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bpl     1b
2:
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        ands    count, count, #0x3f
#else
        /* Use an SMS style loop to maximize the I/O bandwidth. */
        sub     src, src, #4
        sub     dst, dst, #8
        subs    tmp2, count, #64        /* Use tmp2 for count. */
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #(5 * 64) - (32 - 4)]
        strd    A_l, A_h, [dst, #40]
        ldr     A_l, [src, #36]
        ldr     A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldr     B_l, [src, #44]
        ldr     B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldr     C_l, [src, #52]
        ldr     C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldr     D_l, [src, #60]
        ldr     D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]
        bcs     2b

        /* Save the remaining bytes and restore the callee-saved regs. */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #36
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        ands    count, tmp2, #0x3f
#endif
        ldr     tmp2, [sp], #FRAME_SIZE
        bne     .Ltail63unaligned
        bx      lr
END(memcpy)
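Note on the .Ltail63unaligned / .Ltail63aligned blocks above: "add pc, pc, tmp1" jumps into the middle of an unrolled load/store sequence, so a tail of 0..63 bytes is dispatched once rather than looped over, the assembly counterpart of Duff's device. A rough C sketch of the same dispatch structure follows (the helper name is invented; memcpy stands in for the unaligned 8-byte moves):

    #include <string.h>

    static void copy_tail63(unsigned char *dst, const unsigned char *src,
                            unsigned count /* < 64 */) {
        switch (count & 0x38) {  /* enter past the chunks we don't need */
        case 0x38: memcpy(dst, src, 8); dst += 8; src += 8; /* fall through */
        case 0x30: memcpy(dst, src, 8); dst += 8; src += 8; /* fall through */
        case 0x28: memcpy(dst, src, 8); dst += 8; src += 8; /* fall through */
        case 0x20: memcpy(dst, src, 8); dst += 8; src += 8; /* fall through */
        case 0x18: memcpy(dst, src, 8); dst += 8; src += 8; /* fall through */
        case 0x10: memcpy(dst, src, 8); dst += 8; src += 8; /* fall through */
        case 0x08: memcpy(dst, src, 8); dst += 8; src += 8; /* fall through */
        default:   break;
        }
        if (count & 4) { memcpy(dst, src, 4); dst += 4; src += 4; }
        if (count & 2) { memcpy(dst, src, 2); dst += 2; src += 2; }
        if (count & 1) *dst = *src;
    }

The rsb against (56 - PC_OFFSET + INSN_SIZE) in the assembly compensates for the ARM pipeline reading PC 8 bytes ahead, a detail the switch statement hides in C.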
@@ -1,317 +0,0 @@
/*
 * Copyright (c) 2011 The Android Open Source Project
 * Copyright (c) 2008 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <private/bionic_asm.h>

        .text

#ifdef __ARMEB__
#define SHFT2LSB        lsl
#define SHFT2LSBEQ      lsleq
#define SHFT2MSB        lsr
#define SHFT2MSBEQ      lsreq
#define MSB             0x000000ff
#define LSB             0xff000000
#else
#define SHFT2LSB        lsr
#define SHFT2LSBEQ      lsreq
#define SHFT2MSB        lsl
#define SHFT2MSBEQ      lsleq
#define MSB             0xff000000
#define LSB             0x000000ff
#endif

#define magic1(REG) REG
#define magic2(REG) REG, lsl #7

ENTRY(strcmp)
        pld     [r0, #0]
        pld     [r1, #0]
        eor     r2, r0, r1
        tst     r2, #3

        /* Strings not at same byte offset from a word boundary. */
        bne     .Lstrcmp_unaligned
        ands    r2, r0, #3
        bic     r0, r0, #3
        bic     r1, r1, #3
        ldr     ip, [r0], #4
        it      eq
        ldreq   r3, [r1], #4
        beq     1f

        /* Although s1 and s2 have identical initial alignment, they are
         * not currently word aligned. Rather than comparing bytes,
         * make sure that any bytes fetched from before the addressed
         * bytes are forced to 0xff. Then they will always compare
         * equal.
         */
        eor     r2, r2, #3
        lsl     r2, r2, #3
        mvn     r3, #MSB
        SHFT2LSB r2, r3, r2
        ldr     r3, [r1], #4
        orr     ip, ip, r2
        orr     r3, r3, r2
1:
        /* Load the 'magic' constant 0x01010101. */
        str     r4, [sp, #-4]!
        mov     r4, #1
        orr     r4, r4, r4, lsl #8
        orr     r4, r4, r4, lsl #16
        .p2align 2
4:
        pld     [r0, #8]
        pld     [r1, #8]
        sub     r2, ip, magic1(r4)
        cmp     ip, r3
        itttt   eq

        /* check for any zero bytes in first word */
        biceq   r2, r2, ip
        tsteq   r2, magic2(r4)
        ldreq   ip, [r0], #4
        ldreq   r3, [r1], #4
        beq     4b
2:
        /* There's a zero or a different byte in the word */
        SHFT2MSB r0, ip, #24
        SHFT2LSB ip, ip, #8
        cmp     r0, #1
        it      cs
        cmpcs   r0, r3, SHFT2MSB #24
        it      eq
        SHFT2LSBEQ r3, r3, #8
        beq     2b
        /* On a big-endian machine, r0 contains the desired byte in bits
         * 0-7; on a little-endian machine they are in bits 24-31. In
         * both cases the other bits in r0 are all zero. For r3 the
         * interesting byte is at the other end of the word, but the
         * other bits are not necessarily zero. We need a signed result
         * representing the difference in the unsigned bytes, so for the
         * little-endian case we can't just shift the interesting bits up.
         */
#ifdef __ARMEB__
        sub     r0, r0, r3, lsr #24
#else
        and     r3, r3, #255
/* No RSB instruction in Thumb2 */
#ifdef __thumb2__
        lsr     r0, r0, #24
        sub     r0, r0, r3
#else
        rsb     r0, r3, r0, lsr #24
#endif
#endif
        ldr     r4, [sp], #4
        bx      lr

.Lstrcmp_unaligned:
        wp1 .req r0
        wp2 .req r1
        b1  .req r2
        w1  .req r4
        w2  .req r5
        t1  .req ip
        @ r3 is scratch

        /* First of all, compare bytes until wp1 (s1) is word-aligned. */
1:
        tst     wp1, #3
        beq     2f
        ldrb    r2, [wp1], #1
        ldrb    r3, [wp2], #1
        cmp     r2, #1
        it      cs
        cmpcs   r2, r3
        beq     1b
        sub     r0, r2, r3
        bx      lr

2:
        str     r5, [sp, #-4]!
        str     r4, [sp, #-4]!
        mov     b1, #1
        orr     b1, b1, b1, lsl #8
        orr     b1, b1, b1, lsl #16

        and     t1, wp2, #3
        bic     wp2, wp2, #3
        ldr     w1, [wp1], #4
        ldr     w2, [wp2], #4
        cmp     t1, #2
        beq     2f
        bhi     3f

        /* Critical inner loop: block with 3 bytes initial overlap */
        .p2align 2
1:
        bic     t1, w1, #MSB
        cmp     t1, w2, SHFT2LSB #8
        sub     r3, w1, b1
        bic     r3, r3, w1
        bne     4f
        ands    r3, r3, b1, lsl #7
        it      eq
        ldreq   w2, [wp2], #4
        bne     5f
        eor     t1, t1, w1
        cmp     t1, w2, SHFT2MSB #24
        bne     6f
        ldr     w1, [wp1], #4
        b       1b
4:
        SHFT2LSB w2, w2, #8
        b       8f

5:
#ifdef __ARMEB__
        /* The syndrome value may contain false ones if the string ends
         * with the bytes 0x01 0x00
         */
        tst     w1, #0xff000000
        itt     ne
        tstne   w1, #0x00ff0000
        tstne   w1, #0x0000ff00
        beq     7f
#else
        bics    r3, r3, #0xff000000
        bne     7f
#endif
        ldrb    w2, [wp2]
        SHFT2LSB t1, w1, #24
#ifdef __ARMEB__
        lsl     w2, w2, #24
#endif
        b       8f

6:
        SHFT2LSB t1, w1, #24
        and     w2, w2, #LSB
        b       8f

        /* Critical inner loop: block with 2 bytes initial overlap */
        .p2align 2
2:
        SHFT2MSB t1, w1, #16
        sub     r3, w1, b1
        SHFT2LSB t1, t1, #16
        bic     r3, r3, w1
        cmp     t1, w2, SHFT2LSB #16
        bne     4f
        ands    r3, r3, b1, lsl #7
        it      eq
        ldreq   w2, [wp2], #4
        bne     5f
        eor     t1, t1, w1
        cmp     t1, w2, SHFT2MSB #16
        bne     6f
        ldr     w1, [wp1], #4
        b       2b

5:
#ifdef __ARMEB__
        /* The syndrome value may contain false ones if the string ends
         * with the bytes 0x01 0x00
         */
        tst     w1, #0xff000000
        it      ne
        tstne   w1, #0x00ff0000
        beq     7f
#else
        lsls    r3, r3, #16
        bne     7f
#endif
        ldrh    w2, [wp2]
        SHFT2LSB t1, w1, #16
#ifdef __ARMEB__
        lsl     w2, w2, #16
#endif
        b       8f

6:
        SHFT2MSB w2, w2, #16
        SHFT2LSB t1, w1, #16
4:
        SHFT2LSB w2, w2, #16
        b       8f

        /* Critical inner loop: block with 1 byte initial overlap */
        .p2align 2
3:
        and     t1, w1, #LSB
        cmp     t1, w2, SHFT2LSB #24
        sub     r3, w1, b1
        bic     r3, r3, w1
        bne     4f
        ands    r3, r3, b1, lsl #7
        it      eq
        ldreq   w2, [wp2], #4
        bne     5f
        eor     t1, t1, w1
        cmp     t1, w2, SHFT2MSB #8
        bne     6f
        ldr     w1, [wp1], #4
        b       3b
4:
        SHFT2LSB w2, w2, #24
        b       8f
5:
        /* The syndrome value may contain false ones if the string ends
         * with the bytes 0x01 0x00
         */
        tst     w1, #LSB
        beq     7f
        ldr     w2, [wp2], #4
6:
        SHFT2LSB t1, w1, #8
        bic     w2, w2, #MSB
        b       8f
7:
        mov     r0, #0
        ldr     r4, [sp], #4
        ldr     r5, [sp], #4
        bx      lr

8:
        and     r2, t1, #LSB
        and     r0, w2, #LSB
        cmp     r0, #1
        it      cs
        cmpcs   r0, r2
        itt     eq
        SHFT2LSBEQ t1, t1, #8
        SHFT2LSBEQ w2, w2, #8
        beq     8b
        sub     r0, r2, r0
        ldr     r4, [sp], #4
        ldr     r5, [sp], #4
        bx      lr
END(strcmp)
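Note on the 'magic' constant 0x01010101 used above: the sequence "sub r2, ip, magic1(r4)" / "bic r2, r2, ip" / "tst r2, magic2(r4)" (where magic2 shifts the constant left by 7, giving 0x80808080) sets flags exactly when some byte of the word is zero, which lets the main loop scan for the NUL terminator four bytes at a time. The same test in C, as a sketch (the helper name is invented):

    #include <stdint.h>

    /* Nonzero iff some byte of v is 0x00: (v - 0x01010101) borrows into a
     * byte's top bit only when that byte underflows past zero, and "& ~v"
     * filters out bytes whose top bit was already set in v. */
    static int has_zero_byte(uint32_t v) {
        return ((v - 0x01010101u) & ~v & 0x80808080u) != 0;
    }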