Merge "Remove arm assembler not referenced from any makefile."

Elliott Hughes 2014-12-15 18:46:16 +00:00 committed by Gerrit Code Review
commit ad01c98319
3 changed files with 0 additions and 1617 deletions


@@ -1,686 +0,0 @@
/*
* Copyright (C) 2008 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <machine/cpu-features.h>
#include <private/bionic_asm.h>
#if defined(__ARM_NEON__) && !defined(ARCH_ARM_USE_NON_NEON_MEMCPY)
.text
.fpu neon
#ifdef HAVE_32_BYTE_CACHE_LINE
/* a prefetch distance of 2 cache-lines */
#define CACHE_LINE_SIZE 32
#else
/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE 64
#endif
ENTRY(memcpy)
.save {r0, lr}
/* start preloading as early as possible */
pld [r1, #(CACHE_LINE_SIZE * 0)]
stmfd sp!, {r0, lr}
pld [r1, #(CACHE_LINE_SIZE * 1)]
/* If Neon supports unaligned access then remove the align code,
* unless a size limit has been specified.
*/
#ifndef NEON_UNALIGNED_ACCESS
/* do we have at least 16-bytes to copy (needed for alignment below) */
cmp r2, #16
blo 5f
/* check if buffers are aligned. If so, run arm-only version */
eor r3, r0, r1
ands r3, r3, #0x3
beq 11f
/* align destination to cache-line for the write-buffer */
rsb r3, r0, #0
ands r3, r3, #0xF
beq 2f
/* copy up to 15-bytes (count in r3) */
sub r2, r2, r3
movs ip, r3, lsl #31
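/* note: lsl #31 moves bit 0 of r3 into N and bit 1 into C, so the MI pair
 * below copies one byte and the CS pairs copy two more */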
ldrmib lr, [r1], #1
strmib lr, [r0], #1
ldrcsb ip, [r1], #1
ldrcsb lr, [r1], #1
strcsb ip, [r0], #1
strcsb lr, [r0], #1
movs ip, r3, lsl #29
bge 1f
// copies 4 bytes, destination 32-bits aligned
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1: bcc 2f
// copies 8 bytes, destination 64-bits aligned
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0, :64]!
2:
/* preload immediately the next cache line, which we may need */
pld [r1, #(CACHE_LINE_SIZE * 0)]
pld [r1, #(CACHE_LINE_SIZE * 1)]
#ifdef HAVE_32_BYTE_CACHE_LINE
/* make sure we have at least 32 bytes to copy */
subs r2, r2, #32
blo 4f
/* preload all the cache lines we need.
* NOTE: the number of pld below depends on PREFETCH_DISTANCE,
* ideally we would increase the distance in the main loop to
* avoid the goofy code below. In practice this doesn't seem to make
* a big difference.
*/
pld [r1, #(PREFETCH_DISTANCE)]
1: /* The main loop copies 32 bytes at a time */
vld1.8 {d0 - d3}, [r1]!
pld [r1, #(PREFETCH_DISTANCE)]
subs r2, r2, #32
vst1.8 {d0 - d3}, [r0, :128]!
bhs 1b
#else
/* make sure we have at least 64 bytes to copy */
subs r2, r2, #64
blo 2f
/* preload all the cache lines we need. */
pld [r1, #(CACHE_LINE_SIZE * 2)]
pld [r1, #(CACHE_LINE_SIZE * 3)]
1: /* The main loop copies 64 bytes at a time */
vld1.8 {d0 - d3}, [r1]!
vld1.8 {d4 - d7}, [r1]!
#ifdef HAVE_32_BYTE_CACHE_LINE
pld [r1, #(CACHE_LINE_SIZE * 2)]
pld [r1, #(CACHE_LINE_SIZE * 3)]
#else
pld [r1, #(CACHE_LINE_SIZE * 3)]
#endif
subs r2, r2, #64
vst1.8 {d0 - d3}, [r0, :128]!
vst1.8 {d4 - d7}, [r0, :128]!
bhs 1b
2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
add r2, r2, #64
subs r2, r2, #32
blo 4f
3: /* 32 bytes at a time. These cache lines were already preloaded */
vld1.8 {d0 - d3}, [r1]!
subs r2, r2, #32
vst1.8 {d0 - d3}, [r0, :128]!
bhs 3b
#endif
4: /* less than 32 left */
add r2, r2, #32
tst r2, #0x10
beq 5f
// copies 16 bytes, 128-bits aligned
vld1.8 {d0, d1}, [r1]!
vst1.8 {d0, d1}, [r0, :128]!
5: /* copy up to 15-bytes (count in r2) */
movs ip, r2, lsl #29
bcc 1f
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0]!
1: bge 2f
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2: movs ip, r2, lsl #31
ldrmib r3, [r1], #1
ldrcsb ip, [r1], #1
ldrcsb lr, [r1], #1
strmib r3, [r0], #1
strcsb ip, [r0], #1
strcsb lr, [r0], #1
ldmfd sp!, {r0, lr}
bx lr
#else /* NEON_UNALIGNED_ACCESS */
// Check that we have at least 16 bytes to copy, needed for the alignment code.
cmp r2, #16
blo 5f
#ifdef NEON_MEMCPY_ALIGNMENT_DIVIDER
/* Check the upper size limit for Neon unaligned memory access in memcpy */
#if NEON_MEMCPY_ALIGNMENT_DIVIDER >= 16
cmp r2, #NEON_MEMCPY_ALIGNMENT_DIVIDER
blo 3f
#endif
/* check if buffers are aligned. If so, run arm-only version */
eor r3, r0, r1
ands r3, r3, #0x3
beq 11f
/* align destination to 16 bytes for the write-buffer */
rsb r3, r0, #0
ands r3, r3, #0xF
beq 3f
/* copy up to 15-bytes (count in r3) */
sub r2, r2, r3
movs ip, r3, lsl #31
ldrmib lr, [r1], #1
strmib lr, [r0], #1
ldrcsb ip, [r1], #1
ldrcsb lr, [r1], #1
strcsb ip, [r0], #1
strcsb lr, [r0], #1
movs ip, r3, lsl #29
bge 1f
// copies 4 bytes, destination 32-bits aligned
vld1.32 {d0[0]}, [r1]!
vst1.32 {d0[0]}, [r0, :32]!
1: bcc 2f
// copies 8 bytes, destination 64-bits aligned
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0, :64]!
2:
/* preload immediately the next cache line, which we may need */
pld [r1, #(CACHE_LINE_SIZE * 0)]
pld [r1, #(CACHE_LINE_SIZE * 1)]
3:
#endif
/* make sure we have at least 64 bytes to copy */
subs r2, r2, #64
blo 2f
/* preload all the cache lines we need */
pld [r1, #(CACHE_LINE_SIZE * 2)]
pld [r1, #(CACHE_LINE_SIZE * 3)]
1: /* The main loop copies 64 bytes at a time */
vld1.8 {d0 - d3}, [r1]!
vld1.8 {d4 - d7}, [r1]!
#ifdef HAVE_32_BYTE_CACHE_LINE
pld [r1, #(CACHE_LINE_SIZE * 2)]
pld [r1, #(CACHE_LINE_SIZE * 3)]
#else
pld [r1, #(CACHE_LINE_SIZE * 3)]
#endif
subs r2, r2, #64
vst1.8 {d0 - d3}, [r0]!
vst1.8 {d4 - d7}, [r0]!
bhs 1b
2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
add r2, r2, #64
subs r2, r2, #32
blo 4f
3: /* 32 bytes at a time. These cache lines were already preloaded */
vld1.8 {d0 - d3}, [r1]!
subs r2, r2, #32
vst1.8 {d0 - d3}, [r0]!
bhs 3b
4: /* less than 32 left */
add r2, r2, #32
tst r2, #0x10
beq 5f
// copies 16 bytes, 128-bits aligned
vld1.8 {d0, d1}, [r1]!
vst1.8 {d0, d1}, [r0]!
5: /* copy up to 15-bytes (count in r2) */
movs ip, r2, lsl #29
bcc 1f
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0]!
1: bge 2f
vld1.32 {d0[0]}, [r1]!
vst1.32 {d0[0]}, [r0]!
2: movs ip, r2, lsl #31
ldrmib r3, [r1], #1
ldrcsb ip, [r1], #1
ldrcsb lr, [r1], #1
strmib r3, [r0], #1
strcsb ip, [r0], #1
strcsb lr, [r0], #1
ldmfd sp!, {r0, lr}
bx lr
#endif /* NEON_UNALIGNED_ACCESS */
11:
/* Simple arm-only copy loop to handle aligned copy operations */
stmfd sp!, {r4, r5, r6, r7, r8}
pld [r1, #(CACHE_LINE_SIZE * 2)]
/* Check alignment */
rsb r3, r1, #0
ands r3, #3
beq 2f
/* align source to 32 bits. We need to insert 2 instructions between
* a ldr[b|h] and str[b|h] because byte and half-word instructions
* stall 2 cycles.
*/
movs r12, r3, lsl #31
sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */
ldrmib r3, [r1], #1
ldrcsb r4, [r1], #1
ldrcsb r5, [r1], #1
strmib r3, [r0], #1
strcsb r4, [r0], #1
strcsb r5, [r0], #1
2:
subs r2, #32
blt 5f
pld [r1, #(CACHE_LINE_SIZE * 3)]
3: /* Main copy loop, copying 32 bytes at a time */
pld [r1, #(CACHE_LINE_SIZE * 4)]
ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
subs r2, r2, #32
stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
bge 3b
5: /* Handle any remaining bytes */
adds r2, #32
beq 6f
movs r12, r2, lsl #28
ldmcsia r1!, {r3, r4, r5, r6} /* 16 bytes */
ldmmiia r1!, {r7, r8} /* 8 bytes */
stmcsia r0!, {r3, r4, r5, r6}
stmmiia r0!, {r7, r8}
movs r12, r2, lsl #30
ldrcs r3, [r1], #4 /* 4 bytes */
ldrmih r4, [r1], #2 /* 2 bytes */
strcs r3, [r0], #4
strmih r4, [r0], #2
tst r2, #0x1
ldrneb r3, [r1] /* last byte */
strneb r3, [r0]
6:
ldmfd sp!, {r4, r5, r6, r7, r8}
ldmfd sp!, {r0, pc}
END(memcpy)
#else /* __ARM_ARCH__ < 7 */
/*
* Optimized memcpy() for ARM.
*
* note that memcpy() always returns the destination pointer,
* so we have to preserve R0.
*/
ENTRY(memcpy)
/* The stack must always be 64-bits aligned to be compliant with the
* ARM ABI. Since we have to save R0, we might as well save R4
* which we can use for better pipelining of the reads below
*/
.save {r0, r4, lr}
stmfd sp!, {r0, r4, lr}
/* Making room for r5-r11 which will be spilled later */
.pad #28
sub sp, sp, #28
// preload the destination because we'll align it to a cache line
// with small writes. Also start the source "pump".
pld [r0, #0]
pld [r1, #0]
pld [r1, #32]
/* it simplifies things to take care of len<4 early */
cmp r2, #4
blo copy_last_3_and_return
/* compute the offset to align the source
* offset = (4-(src&3))&3 = -src & 3
*/
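/* e.g. src & 3 == 1 gives -src & 3 == 3: three bytes bring us to the next
 * word boundary */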
rsb r3, r1, #0
ands r3, r3, #3
beq src_aligned
/* align source to 32 bits. We need to insert 2 instructions between
* a ldr[b|h] and str[b|h] because byte and half-word instructions
* stall 2 cycles.
*/
movs r12, r3, lsl #31
sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */
ldrmib r3, [r1], #1
ldrcsb r4, [r1], #1
ldrcsb r12,[r1], #1
strmib r3, [r0], #1
strcsb r4, [r0], #1
strcsb r12,[r0], #1
src_aligned:
/* see if src and dst are aligned together (congruent) */
eor r12, r0, r1
tst r12, #3
bne non_congruent
/* Use post-increment mode for stm to spill r5-r11 to reserved stack
* frame. Don't update sp.
*/
stmea sp, {r5-r11}
/* align the destination to a cache-line */
rsb r3, r0, #0
ands r3, r3, #0x1C
beq congruent_aligned32
cmp r3, r2
andhi r3, r2, #0x1C
/* conditionally copies 0 to 7 words (length in r3) */
movs r12, r3, lsl #28
ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */
ldmmiia r1!, {r8, r9} /* 8 bytes */
stmcsia r0!, {r4, r5, r6, r7}
stmmiia r0!, {r8, r9}
tst r3, #0x4
ldrne r10,[r1], #4 /* 4 bytes */
strne r10,[r0], #4
sub r2, r2, r3
congruent_aligned32:
/*
* here source is aligned to 32 bytes.
*/
cached_aligned32:
subs r2, r2, #32
blo less_than_32_left
/*
* We preload a cache-line up to 64 bytes ahead. On the 926, this will
* stall only until the requested word is fetched, but the linefill
* continues in the background.
* While the linefill is going, we write our previous cache-line
* into the write-buffer (which should have some free space).
* When the linefill is done, the writebuffer will
* start dumping its content into memory
*
* While all this is going, we then load a full cache line into
* 8 registers, this cache line should be in the cache by now
* (or partly in the cache).
*
* This code should work well regardless of the source/dest alignment.
*
*/
// Align the preload register to a cache-line because the cpu does
// "critical word first" (the first word requested is loaded first).
bic r12, r1, #0x1F
add r12, r12, #64
1: ldmia r1!, { r4-r11 }
pld [r12, #64]
subs r2, r2, #32
// NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
// for ARM9 preload will not be safely guarded by the preceding subs.
// When it is safely guarded the only possibility to have SIGSEGV here
// is because the caller overstates the length.
ldrhi r3, [r12], #32 /* cheap ARM9 preload */
stmia r0!, { r4-r11 }
bhs 1b
add r2, r2, #32
less_than_32_left:
/*
* less than 32 bytes left at this point (length in r2)
*/
/* skip all this if there is nothing to do, which should
* be a common case (if not executed the code below takes
* about 16 cycles)
*/
tst r2, #0x1F
beq 1f
/* conditionally copies 0 to 31 bytes */
movs r12, r2, lsl #28
ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */
ldmmiia r1!, {r8, r9} /* 8 bytes */
stmcsia r0!, {r4, r5, r6, r7}
stmmiia r0!, {r8, r9}
movs r12, r2, lsl #30
ldrcs r3, [r1], #4 /* 4 bytes */
ldrmih r4, [r1], #2 /* 2 bytes */
strcs r3, [r0], #4
strmih r4, [r0], #2
tst r2, #0x1
ldrneb r3, [r1] /* last byte */
strneb r3, [r0]
/* we're done! restore everything and return */
1: ldmfd sp!, {r5-r11}
ldmfd sp!, {r0, r4, lr}
bx lr
/********************************************************************/
non_congruent:
/*
* here source is aligned to 4 bytes
* but destination is not.
*
* in the code below r2 is the number of bytes read
* (the number of bytes written is always smaller, because we have
* partial words in the shift queue)
*/
cmp r2, #4
blo copy_last_3_and_return
/* Use post-increment mode for stm to spill r5-r11 to reserved stack
* frame. Don't update sp.
*/
stmea sp, {r5-r11}
/* compute shifts needed to align src to dest */
rsb r5, r0, #0
and r5, r5, #3 /* r5 = # bytes in partial words */
mov r12, r5, lsl #3 /* r12 = right */
rsb lr, r12, #32 /* lr = left */
/* read the first word */
ldr r3, [r1], #4
sub r2, r2, #4
/* write a partial word (0 to 3 bytes), such that destination
* becomes aligned to 32 bits (r5 = nb of words to copy for alignment)
*/
movs r5, r5, lsl #31
strmib r3, [r0], #1
movmi r3, r3, lsr #8
strcsb r3, [r0], #1
movcs r3, r3, lsr #8
strcsb r3, [r0], #1
movcs r3, r3, lsr #8
cmp r2, #4
blo partial_word_tail
/* Align destination to 32 bytes (cache line boundary) */
1: tst r0, #0x1c
beq 2f
ldr r5, [r1], #4
sub r2, r2, #4
orr r4, r3, r5, lsl lr
mov r3, r5, lsr r12
str r4, [r0], #4
cmp r2, #4
bhs 1b
blo partial_word_tail
/* copy 32 bytes at a time */
2: subs r2, r2, #32
blo less_than_thirtytwo
/* Use immediate mode for the shifts, because there is an extra cycle
* for register shifts, which could account for up to 50% of
* performance hit.
*/
cmp r12, #24
beq loop24
cmp r12, #8
beq loop8
loop16:
ldr r12, [r1], #4
1: mov r4, r12
ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
pld [r1, #64]
subs r2, r2, #32
ldrhs r12, [r1], #4
orr r3, r3, r4, lsl #16
mov r4, r4, lsr #16
orr r4, r4, r5, lsl #16
mov r5, r5, lsr #16
orr r5, r5, r6, lsl #16
mov r6, r6, lsr #16
orr r6, r6, r7, lsl #16
mov r7, r7, lsr #16
orr r7, r7, r8, lsl #16
mov r8, r8, lsr #16
orr r8, r8, r9, lsl #16
mov r9, r9, lsr #16
orr r9, r9, r10, lsl #16
mov r10, r10, lsr #16
orr r10, r10, r11, lsl #16
stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
mov r3, r11, lsr #16
bhs 1b
b less_than_thirtytwo
loop8:
ldr r12, [r1], #4
1: mov r4, r12
ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
pld [r1, #64]
subs r2, r2, #32
ldrhs r12, [r1], #4
orr r3, r3, r4, lsl #24
mov r4, r4, lsr #8
orr r4, r4, r5, lsl #24
mov r5, r5, lsr #8
orr r5, r5, r6, lsl #24
mov r6, r6, lsr #8
orr r6, r6, r7, lsl #24
mov r7, r7, lsr #8
orr r7, r7, r8, lsl #24
mov r8, r8, lsr #8
orr r8, r8, r9, lsl #24
mov r9, r9, lsr #8
orr r9, r9, r10, lsl #24
mov r10, r10, lsr #8
orr r10, r10, r11, lsl #24
stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
mov r3, r11, lsr #8
bhs 1b
b less_than_thirtytwo
loop24:
ldr r12, [r1], #4
1: mov r4, r12
ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
pld [r1, #64]
subs r2, r2, #32
ldrhs r12, [r1], #4
orr r3, r3, r4, lsl #8
mov r4, r4, lsr #24
orr r4, r4, r5, lsl #8
mov r5, r5, lsr #24
orr r5, r5, r6, lsl #8
mov r6, r6, lsr #24
orr r6, r6, r7, lsl #8
mov r7, r7, lsr #24
orr r7, r7, r8, lsl #8
mov r8, r8, lsr #24
orr r8, r8, r9, lsl #8
mov r9, r9, lsr #24
orr r9, r9, r10, lsl #8
mov r10, r10, lsr #24
orr r10, r10, r11, lsl #8
stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
mov r3, r11, lsr #24
bhs 1b
less_than_thirtytwo:
/* copy the last 0 to 31 bytes of the source */
rsb r12, lr, #32 /* we corrupted r12, recompute it */
add r2, r2, #32
cmp r2, #4
blo partial_word_tail
1: ldr r5, [r1], #4
sub r2, r2, #4
orr r4, r3, r5, lsl lr
mov r3, r5, lsr r12
str r4, [r0], #4
cmp r2, #4
bhs 1b
partial_word_tail:
/* we have a partial word in the input buffer */
movs r5, lr, lsl #(31-3)
strmib r3, [r0], #1
movmi r3, r3, lsr #8
strcsb r3, [r0], #1
movcs r3, r3, lsr #8
strcsb r3, [r0], #1
/* Refill spilled registers from the stack. Don't update sp. */
ldmfd sp, {r5-r11}
copy_last_3_and_return:
movs r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */
ldrmib r2, [r1], #1
ldrcsb r3, [r1], #1
ldrcsb r12,[r1]
strmib r2, [r0], #1
strcsb r3, [r0], #1
strcsb r12,[r0]
/* we're done! restore sp and spilled registers and return */
add sp, sp, #28
ldmfd sp!, {r0, r4, lr}
bx lr
END(memcpy)
#endif /* __ARM_ARCH__ < 7 */
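A minimal little-endian C sketch of the shift-and-merge step used by the non_congruent path of this removed memcpy (the `orr r4, r3, r5, lsl lr` / `mov r3, r5, lsr r12` loop bodies); the function name and parameters are illustrative, not part of the file:

    #include <stddef.h>
    #include <stdint.h>

    /* Assemble each aligned destination word from the unconsumed bytes of
     * one aligned source word plus the leading bytes of the next.  `ws`
     * points at the aligned word holding the first byte of the stream,
     * `off` (1..3) is how many bytes of that word are already consumed,
     * and `nwords` aligned words are written to `dst`. */
    static void merge_copy_words(uint32_t *dst, const uint32_t *ws,
                                 unsigned off, size_t nwords)
    {
        unsigned right = off * 8;            /* like r12 in the assembly */
        unsigned left  = 32 - right;         /* like lr */
        uint32_t carry = *ws++ >> right;     /* leftover bytes of word 0 */

        while (nwords--) {
            uint32_t next = *ws++;
            *dst++ = carry | (next << left); /* one aligned store per word */
            carry  = next >> right;          /* low bytes of the next store */
        }
    }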


@@ -1,614 +0,0 @@
/* Copyright (c) 2013, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Linaro Limited nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
This memcpy routine is optimised for Cortex-A15 cores and takes advantage
of VFP or NEON when built with the appropriate flags.
Assumptions:
ARMv6 (ARMv7-a if using Neon)
ARM state
Unaligned accesses
LDRD/STRD support unaligned word accesses
*/
#include <machine/cpu-features.h>
#include <private/bionic_asm.h>
.syntax unified
/* This implementation requires ARM state. */
.arm
#ifdef __ARM_NEON__
.fpu neon
.arch armv7-a
# define FRAME_SIZE 4
# define USE_VFP
# define USE_NEON
#elif !defined (__SOFTFP__)
.arch armv6
.fpu vfpv2
# define FRAME_SIZE 32
# define USE_VFP
#else
.arch armv6
# define FRAME_SIZE 32
#endif
/* Old versions of GAS incorrectly implement the NEON align semantics. */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif
#define PC_OFFSET 8 /* PC pipeline compensation. */
#define INSN_SIZE 4
/* Call parameters. */
#define dstin r0
#define src r1
#define count r2
/* Locals. */
#define tmp1 r3
#define dst ip
#define tmp2 r10
#ifndef USE_NEON
/* For bulk copies using GP registers. */
#define A_l r2 /* Call-clobbered. */
#define A_h r3 /* Call-clobbered. */
#define B_l r4
#define B_h r5
#define C_l r6
#define C_h r7
#define D_l r8
#define D_h r9
#endif
/* Number of lines ahead to pre-fetch data. If you change this the code
below will need adjustment to compensate. */
#define prefetch_lines 5
#ifdef USE_VFP
.macro cpy_line_vfp vreg, base
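/* Each expansion copies one 64-byte line with VFP doubleword stores and
 * reloads; the reload of \vreg from prefetch_lines*64 - 32 bytes ahead
 * doubles as the register-based prefetch described at .Lcpy_body_long. */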
vstr \vreg, [dst, #\base]
vldr \vreg, [src, #\base]
vstr d0, [dst, #\base + 8]
vldr d0, [src, #\base + 8]
vstr d1, [dst, #\base + 16]
vldr d1, [src, #\base + 16]
vstr d2, [dst, #\base + 24]
vldr d2, [src, #\base + 24]
vstr \vreg, [dst, #\base + 32]
vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
vstr d0, [dst, #\base + 40]
vldr d0, [src, #\base + 40]
vstr d1, [dst, #\base + 48]
vldr d1, [src, #\base + 48]
vstr d2, [dst, #\base + 56]
vldr d2, [src, #\base + 56]
.endm
.macro cpy_tail_vfp vreg, base
vstr \vreg, [dst, #\base]
vldr \vreg, [src, #\base]
vstr d0, [dst, #\base + 8]
vldr d0, [src, #\base + 8]
vstr d1, [dst, #\base + 16]
vldr d1, [src, #\base + 16]
vstr d2, [dst, #\base + 24]
vldr d2, [src, #\base + 24]
vstr \vreg, [dst, #\base + 32]
vstr d0, [dst, #\base + 40]
vldr d0, [src, #\base + 40]
vstr d1, [dst, #\base + 48]
vldr d1, [src, #\base + 48]
vstr d2, [dst, #\base + 56]
vldr d2, [src, #\base + 56]
.endm
#endif
.p2align 6
ENTRY(memcpy)
mov dst, dstin /* Preserve dstin, we need to return it. */
cmp count, #64
bge .Lcpy_not_short
/* Deal with small copies quickly by dropping straight into the
exit block. */
.Ltail63unaligned:
#ifdef USE_NEON
and tmp1, count, #0x38
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
add pc, pc, tmp1
vld1.8 {d0}, [src]! /* 14 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 12 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 10 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 8 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 6 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 4 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 2 words to go. */
vst1.8 {d0}, [dst]!
tst count, #4
ldrne tmp1, [src], #4
strne tmp1, [dst], #4
#else
/* Copy up to 15 full words of data. May not be aligned. */
/* Cannot use VFP for unaligned data. */
and tmp1, count, #0x3c
add dst, dst, tmp1
add src, src, tmp1
rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
/* Jump directly into the sequence below at the correct offset. */
add pc, pc, tmp1, lsl #1
ldr tmp1, [src, #-60] /* 15 words to go. */
str tmp1, [dst, #-60]
ldr tmp1, [src, #-56] /* 14 words to go. */
str tmp1, [dst, #-56]
ldr tmp1, [src, #-52]
str tmp1, [dst, #-52]
ldr tmp1, [src, #-48] /* 12 words to go. */
str tmp1, [dst, #-48]
ldr tmp1, [src, #-44]
str tmp1, [dst, #-44]
ldr tmp1, [src, #-40] /* 10 words to go. */
str tmp1, [dst, #-40]
ldr tmp1, [src, #-36]
str tmp1, [dst, #-36]
ldr tmp1, [src, #-32] /* 8 words to go. */
str tmp1, [dst, #-32]
ldr tmp1, [src, #-28]
str tmp1, [dst, #-28]
ldr tmp1, [src, #-24] /* 6 words to go. */
str tmp1, [dst, #-24]
ldr tmp1, [src, #-20]
str tmp1, [dst, #-20]
ldr tmp1, [src, #-16] /* 4 words to go. */
str tmp1, [dst, #-16]
ldr tmp1, [src, #-12]
str tmp1, [dst, #-12]
ldr tmp1, [src, #-8] /* 2 words to go. */
str tmp1, [dst, #-8]
ldr tmp1, [src, #-4]
str tmp1, [dst, #-4]
#endif
lsls count, count, #31
ldrhcs tmp1, [src], #2
ldrbne src, [src] /* Src is dead, use as a scratch. */
strhcs tmp1, [dst], #2
strbne src, [dst]
bx lr
.Lcpy_not_short:
/* At least 64 bytes to copy, but don't know the alignment yet. */
str tmp2, [sp, #-FRAME_SIZE]!
and tmp2, src, #7
and tmp1, dst, #7
cmp tmp1, tmp2
bne .Lcpy_notaligned
#ifdef USE_VFP
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
that the FP pipeline is much better at streaming loads and
stores. This is outside the critical loop. */
vmov.f32 s0, s0
#endif
/* SRC and DST have the same mutual 32-bit alignment, but we may
still need to pre-copy some bytes to get to natural alignment.
We bring DST into full 64-bit alignment. */
lsls tmp2, dst, #29
beq 1f
rsbs tmp2, tmp2, #0
sub count, count, tmp2, lsr #29
ldrmi tmp1, [src], #4
strmi tmp1, [dst], #4
lsls tmp2, tmp2, #2
ldrhcs tmp1, [src], #2
ldrbne tmp2, [src], #1
strhcs tmp1, [dst], #2
strbne tmp2, [dst], #1
1:
subs tmp2, count, #64 /* Use tmp2 for count. */
blt .Ltail63aligned
cmp tmp2, #512
bge .Lcpy_body_long
.Lcpy_body_medium: /* Count in tmp2. */
#ifdef USE_VFP
1:
vldr d0, [src, #0]
subs tmp2, tmp2, #64
vldr d1, [src, #8]
vstr d0, [dst, #0]
vldr d0, [src, #16]
vstr d1, [dst, #8]
vldr d1, [src, #24]
vstr d0, [dst, #16]
vldr d0, [src, #32]
vstr d1, [dst, #24]
vldr d1, [src, #40]
vstr d0, [dst, #32]
vldr d0, [src, #48]
vstr d1, [dst, #40]
vldr d1, [src, #56]
vstr d0, [dst, #48]
add src, src, #64
vstr d1, [dst, #56]
add dst, dst, #64
bge 1b
tst tmp2, #0x3f
beq .Ldone
.Ltail63aligned: /* Count in tmp2. */
and tmp1, tmp2, #0x38
add dst, dst, tmp1
add src, src, tmp1
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
add pc, pc, tmp1
vldr d0, [src, #-56] /* 14 words to go. */
vstr d0, [dst, #-56]
vldr d0, [src, #-48] /* 12 words to go. */
vstr d0, [dst, #-48]
vldr d0, [src, #-40] /* 10 words to go. */
vstr d0, [dst, #-40]
vldr d0, [src, #-32] /* 8 words to go. */
vstr d0, [dst, #-32]
vldr d0, [src, #-24] /* 6 words to go. */
vstr d0, [dst, #-24]
vldr d0, [src, #-16] /* 4 words to go. */
vstr d0, [dst, #-16]
vldr d0, [src, #-8] /* 2 words to go. */
vstr d0, [dst, #-8]
#else
sub src, src, #8
sub dst, dst, #8
1:
ldrd A_l, A_h, [src, #8]
strd A_l, A_h, [dst, #8]
ldrd A_l, A_h, [src, #16]
strd A_l, A_h, [dst, #16]
ldrd A_l, A_h, [src, #24]
strd A_l, A_h, [dst, #24]
ldrd A_l, A_h, [src, #32]
strd A_l, A_h, [dst, #32]
ldrd A_l, A_h, [src, #40]
strd A_l, A_h, [dst, #40]
ldrd A_l, A_h, [src, #48]
strd A_l, A_h, [dst, #48]
ldrd A_l, A_h, [src, #56]
strd A_l, A_h, [dst, #56]
ldrd A_l, A_h, [src, #64]!
strd A_l, A_h, [dst, #64]!
subs tmp2, tmp2, #64
bge 1b
tst tmp2, #0x3f
bne 1f
ldr tmp2,[sp], #FRAME_SIZE
bx lr
1:
add src, src, #8
add dst, dst, #8
.Ltail63aligned: /* Count in tmp2. */
/* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
we know that the src and dest are 32-bit aligned so we can use
LDRD/STRD to improve efficiency. */
/* TMP2 is now negative, but we don't care about that. The bottom
six bits still tell us how many bytes are left to copy. */
and tmp1, tmp2, #0x38
add dst, dst, tmp1
add src, src, tmp1
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
add pc, pc, tmp1
ldrd A_l, A_h, [src, #-56] /* 14 words to go. */
strd A_l, A_h, [dst, #-56]
ldrd A_l, A_h, [src, #-48] /* 12 words to go. */
strd A_l, A_h, [dst, #-48]
ldrd A_l, A_h, [src, #-40] /* 10 words to go. */
strd A_l, A_h, [dst, #-40]
ldrd A_l, A_h, [src, #-32] /* 8 words to go. */
strd A_l, A_h, [dst, #-32]
ldrd A_l, A_h, [src, #-24] /* 6 words to go. */
strd A_l, A_h, [dst, #-24]
ldrd A_l, A_h, [src, #-16] /* 4 words to go. */
strd A_l, A_h, [dst, #-16]
ldrd A_l, A_h, [src, #-8] /* 2 words to go. */
strd A_l, A_h, [dst, #-8]
#endif
tst tmp2, #4
ldrne tmp1, [src], #4
strne tmp1, [dst], #4
lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
ldrhcs tmp1, [src], #2
ldrbne tmp2, [src]
strhcs tmp1, [dst], #2
strbne tmp2, [dst]
.Ldone:
ldr tmp2, [sp], #FRAME_SIZE
bx lr
.Lcpy_body_long: /* Count in tmp2. */
/* Long copy. We know that there's at least (prefetch_lines * 64)
bytes to go. */
#ifdef USE_VFP
/* Don't use PLD. Instead, read some data in advance of the current
copy position into a register. This should act like a PLD
operation but we won't have to repeat the transfer. */
vldr d3, [src, #0]
vldr d4, [src, #64]
vldr d5, [src, #128]
vldr d6, [src, #192]
vldr d7, [src, #256]
vldr d0, [src, #8]
vldr d1, [src, #16]
vldr d2, [src, #24]
add src, src, #32
subs tmp2, tmp2, #prefetch_lines * 64 * 2
blt 2f
1:
cpy_line_vfp d3, 0
cpy_line_vfp d4, 64
cpy_line_vfp d5, 128
add dst, dst, #3 * 64
add src, src, #3 * 64
cpy_line_vfp d6, 0
cpy_line_vfp d7, 64
add dst, dst, #2 * 64
add src, src, #2 * 64
subs tmp2, tmp2, #prefetch_lines * 64
bge 1b
2:
cpy_tail_vfp d3, 0
cpy_tail_vfp d4, 64
cpy_tail_vfp d5, 128
add src, src, #3 * 64
add dst, dst, #3 * 64
cpy_tail_vfp d6, 0
vstr d7, [dst, #64]
vldr d7, [src, #64]
vstr d0, [dst, #64 + 8]
vldr d0, [src, #64 + 8]
vstr d1, [dst, #64 + 16]
vldr d1, [src, #64 + 16]
vstr d2, [dst, #64 + 24]
vldr d2, [src, #64 + 24]
vstr d7, [dst, #64 + 32]
add src, src, #96
vstr d0, [dst, #64 + 40]
vstr d1, [dst, #64 + 48]
vstr d2, [dst, #64 + 56]
add dst, dst, #128
add tmp2, tmp2, #prefetch_lines * 64
b .Lcpy_body_medium
#else
/* Long copy. Use an SMS style loop to maximize the I/O
bandwidth of the core. We don't have enough spare registers
to synthesise prefetching, so use PLD operations. */
/* Pre-bias src and dst. */
sub src, src, #8
sub dst, dst, #8
pld [src, #8]
pld [src, #72]
subs tmp2, tmp2, #64
pld [src, #136]
ldrd A_l, A_h, [src, #8]
strd B_l, B_h, [sp, #8]
ldrd B_l, B_h, [src, #16]
strd C_l, C_h, [sp, #16]
ldrd C_l, C_h, [src, #24]
strd D_l, D_h, [sp, #24]
pld [src, #200]
ldrd D_l, D_h, [src, #32]!
b 1f
.p2align 6
2:
pld [src, #232]
strd A_l, A_h, [dst, #40]
ldrd A_l, A_h, [src, #40]
strd B_l, B_h, [dst, #48]
ldrd B_l, B_h, [src, #48]
strd C_l, C_h, [dst, #56]
ldrd C_l, C_h, [src, #56]
strd D_l, D_h, [dst, #64]!
ldrd D_l, D_h, [src, #64]!
subs tmp2, tmp2, #64
1:
strd A_l, A_h, [dst, #8]
ldrd A_l, A_h, [src, #8]
strd B_l, B_h, [dst, #16]
ldrd B_l, B_h, [src, #16]
strd C_l, C_h, [dst, #24]
ldrd C_l, C_h, [src, #24]
strd D_l, D_h, [dst, #32]
ldrd D_l, D_h, [src, #32]
bcs 2b
/* Save the remaining bytes and restore the callee-saved regs. */
strd A_l, A_h, [dst, #40]
add src, src, #40
strd B_l, B_h, [dst, #48]
ldrd B_l, B_h, [sp, #8]
strd C_l, C_h, [dst, #56]
ldrd C_l, C_h, [sp, #16]
strd D_l, D_h, [dst, #64]
ldrd D_l, D_h, [sp, #24]
add dst, dst, #72
tst tmp2, #0x3f
bne .Ltail63aligned
ldr tmp2, [sp], #FRAME_SIZE
bx lr
#endif
.Lcpy_notaligned:
pld [src]
pld [src, #64]
/* There's at least 64 bytes to copy, but there is no mutual
alignment. */
/* Bring DST to 64-bit alignment. */
lsls tmp2, dst, #29
pld [src, #(2 * 64)]
beq 1f
rsbs tmp2, tmp2, #0
sub count, count, tmp2, lsr #29
ldrmi tmp1, [src], #4
strmi tmp1, [dst], #4
lsls tmp2, tmp2, #2
ldrbne tmp1, [src], #1
ldrhcs tmp2, [src], #2
strbne tmp1, [dst], #1
strhcs tmp2, [dst], #2
1:
pld [src, #(3 * 64)]
subs count, count, #64
ldrmi tmp2, [sp], #FRAME_SIZE
bmi .Ltail63unaligned
pld [src, #(4 * 64)]
#ifdef USE_NEON
vld1.8 {d0-d3}, [src]!
vld1.8 {d4-d7}, [src]!
subs count, count, #64
bmi 2f
1:
pld [src, #(4 * 64)]
vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
vld1.8 {d0-d3}, [src]!
vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
vld1.8 {d4-d7}, [src]!
subs count, count, #64
bpl 1b
2:
vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
ands count, count, #0x3f
#else
/* Use an SMS style loop to maximize the I/O bandwidth. */
sub src, src, #4
sub dst, dst, #8
subs tmp2, count, #64 /* Use tmp2 for count. */
ldr A_l, [src, #4]
ldr A_h, [src, #8]
strd B_l, B_h, [sp, #8]
ldr B_l, [src, #12]
ldr B_h, [src, #16]
strd C_l, C_h, [sp, #16]
ldr C_l, [src, #20]
ldr C_h, [src, #24]
strd D_l, D_h, [sp, #24]
ldr D_l, [src, #28]
ldr D_h, [src, #32]!
b 1f
.p2align 6
2:
pld [src, #(5 * 64) - (32 - 4)]
strd A_l, A_h, [dst, #40]
ldr A_l, [src, #36]
ldr A_h, [src, #40]
strd B_l, B_h, [dst, #48]
ldr B_l, [src, #44]
ldr B_h, [src, #48]
strd C_l, C_h, [dst, #56]
ldr C_l, [src, #52]
ldr C_h, [src, #56]
strd D_l, D_h, [dst, #64]!
ldr D_l, [src, #60]
ldr D_h, [src, #64]!
subs tmp2, tmp2, #64
1:
strd A_l, A_h, [dst, #8]
ldr A_l, [src, #4]
ldr A_h, [src, #8]
strd B_l, B_h, [dst, #16]
ldr B_l, [src, #12]
ldr B_h, [src, #16]
strd C_l, C_h, [dst, #24]
ldr C_l, [src, #20]
ldr C_h, [src, #24]
strd D_l, D_h, [dst, #32]
ldr D_l, [src, #28]
ldr D_h, [src, #32]
bcs 2b
/* Save the remaining bytes and restore the callee-saved regs. */
strd A_l, A_h, [dst, #40]
add src, src, #36
strd B_l, B_h, [dst, #48]
ldrd B_l, B_h, [sp, #8]
strd C_l, C_h, [dst, #56]
ldrd C_l, C_h, [sp, #16]
strd D_l, D_h, [dst, #64]
ldrd D_l, D_h, [sp, #24]
add dst, dst, #72
ands count, tmp2, #0x3f
#endif
ldr tmp2, [sp], #FRAME_SIZE
bne .Ltail63unaligned
bx lr
END(memcpy)
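The tail blocks of this removed memcpy (.Ltail63aligned, .Ltail63unaligned) dispatch into an unrolled sequence with `add pc, pc, tmp1` instead of looping. A rough C sketch of that idea for the doubleword case, with switch fall-through standing in for the computed jump; the name, signature and alignment assumptions are illustrative only:

    #include <stddef.h>
    #include <stdint.h>

    /* Copy the 0..7 whole doublewords of a sub-64-byte tail by biasing the
     * pointers and entering an unrolled sequence at the right point; the
     * leftover 0..7 bytes are handled elsewhere, as in the assembly. */
    static void tail63_aligned(uint64_t *dst, const uint64_t *src, size_t count)
    {
        size_t dwords = (count & 0x38) >> 3;   /* 0..7 whole doublewords */
        dst += dwords;                         /* bias, as the assembly does */
        src += dwords;

        switch (dwords) {                      /* deliberate fall-through */
        case 7: dst[-7] = src[-7];             /* FALLTHROUGH */
        case 6: dst[-6] = src[-6];             /* FALLTHROUGH */
        case 5: dst[-5] = src[-5];             /* FALLTHROUGH */
        case 4: dst[-4] = src[-4];             /* FALLTHROUGH */
        case 3: dst[-3] = src[-3];             /* FALLTHROUGH */
        case 2: dst[-2] = src[-2];             /* FALLTHROUGH */
        case 1: dst[-1] = src[-1];             /* FALLTHROUGH */
        default: break;
        }
    }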


@@ -1,317 +0,0 @@
/*
* Copyright (c) 2011 The Android Open Source Project
* Copyright (c) 2008 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <machine/cpu-features.h>
#include <private/bionic_asm.h>
.text
#ifdef __ARMEB__
#define SHFT2LSB lsl
#define SHFT2LSBEQ lsleq
#define SHFT2MSB lsr
#define SHFT2MSBEQ lsreq
#define MSB 0x000000ff
#define LSB 0xff000000
#else
#define SHFT2LSB lsr
#define SHFT2LSBEQ lsreq
#define SHFT2MSB lsl
#define SHFT2MSBEQ lsleq
#define MSB 0xff000000
#define LSB 0x000000ff
#endif
#define magic1(REG) REG
#define magic2(REG) REG, lsl #7
ENTRY(strcmp)
pld [r0, #0]
pld [r1, #0]
eor r2, r0, r1
tst r2, #3
/* Strings not at same byte offset from a word boundary. */
bne .Lstrcmp_unaligned
ands r2, r0, #3
bic r0, r0, #3
bic r1, r1, #3
ldr ip, [r0], #4
it eq
ldreq r3, [r1], #4
beq 1f
/* Although s1 and s2 have identical initial alignment, they are
* not currently word aligned. Rather than comparing bytes,
* make sure that any bytes fetched from before the addressed
* bytes are forced to 0xff. Then they will always compare
* equal.
*/
eor r2, r2, #3
lsl r2, r2, #3
mvn r3, #MSB
SHFT2LSB r2, r3, r2
ldr r3, [r1], #4
orr ip, ip, r2
orr r3, r3, r2
1:
/* Load the 'magic' constant 0x01010101. */
str r4, [sp, #-4]!
mov r4, #1
orr r4, r4, r4, lsl #8
orr r4, r4, r4, lsl #16
.p2align 2
4:
pld [r0, #8]
pld [r1, #8]
sub r2, ip, magic1(r4)
cmp ip, r3
itttt eq
/* check for any zero bytes in first word */
biceq r2, r2, ip
tsteq r2, magic2(r4)
ldreq ip, [r0], #4
ldreq r3, [r1], #4
beq 4b
2:
/* There's a zero or a different byte in the word */
SHFT2MSB r0, ip, #24
SHFT2LSB ip, ip, #8
cmp r0, #1
it cs
cmpcs r0, r3, SHFT2MSB #24
it eq
SHFT2LSBEQ r3, r3, #8
beq 2b
/* On a big-endian machine, r0 contains the desired byte in bits
* 0-7; on a little-endian machine they are in bits 24-31. In
* both cases the other bits in r0 are all zero. For r3 the
* interesting byte is at the other end of the word, but the
* other bits are not necessarily zero. We need a signed result
* representing the difference in the unsigned bytes, so for the
* little-endian case we can't just shift the interesting bits up.
*/
#ifdef __ARMEB__
sub r0, r0, r3, lsr #24
#else
and r3, r3, #255
/* No RSB instruction in Thumb2 */
#ifdef __thumb2__
lsr r0, r0, #24
sub r0, r0, r3
#else
rsb r0, r3, r0, lsr #24
#endif
#endif
ldr r4, [sp], #4
bx lr
.Lstrcmp_unaligned:
wp1 .req r0
wp2 .req r1
b1 .req r2
w1 .req r4
w2 .req r5
t1 .req ip
@ r3 is scratch
/* First of all, compare bytes until wp1(sp1) is word-aligned. */
1:
tst wp1, #3
beq 2f
ldrb r2, [wp1], #1
ldrb r3, [wp2], #1
cmp r2, #1
it cs
cmpcs r2, r3
beq 1b
sub r0, r2, r3
bx lr
2:
str r5, [sp, #-4]!
str r4, [sp, #-4]!
mov b1, #1
orr b1, b1, b1, lsl #8
orr b1, b1, b1, lsl #16
and t1, wp2, #3
bic wp2, wp2, #3
ldr w1, [wp1], #4
ldr w2, [wp2], #4
cmp t1, #2
beq 2f
bhi 3f
/* Critical inner Loop: Block with 3 bytes initial overlap */
.p2align 2
1:
bic t1, w1, #MSB
cmp t1, w2, SHFT2LSB #8
sub r3, w1, b1
bic r3, r3, w1
bne 4f
ands r3, r3, b1, lsl #7
it eq
ldreq w2, [wp2], #4
bne 5f
eor t1, t1, w1
cmp t1, w2, SHFT2MSB #24
bne 6f
ldr w1, [wp1], #4
b 1b
4:
SHFT2LSB w2, w2, #8
b 8f
5:
#ifdef __ARMEB__
/* The syndrome value may contain false ones if the string ends
* with the bytes 0x01 0x00
*/
tst w1, #0xff000000
itt ne
tstne w1, #0x00ff0000
tstne w1, #0x0000ff00
beq 7f
#else
bics r3, r3, #0xff000000
bne 7f
#endif
ldrb w2, [wp2]
SHFT2LSB t1, w1, #24
#ifdef __ARMEB__
lsl w2, w2, #24
#endif
b 8f
6:
SHFT2LSB t1, w1, #24
and w2, w2, #LSB
b 8f
/* Critical inner Loop: Block with 2 bytes initial overlap */
.p2align 2
2:
SHFT2MSB t1, w1, #16
sub r3, w1, b1
SHFT2LSB t1, t1, #16
bic r3, r3, w1
cmp t1, w2, SHFT2LSB #16
bne 4f
ands r3, r3, b1, lsl #7
it eq
ldreq w2, [wp2], #4
bne 5f
eor t1, t1, w1
cmp t1, w2, SHFT2MSB #16
bne 6f
ldr w1, [wp1], #4
b 2b
5:
#ifdef __ARMEB__
/* The syndrome value may contain false ones if the string ends
* with the bytes 0x01 0x00
*/
tst w1, #0xff000000
it ne
tstne w1, #0x00ff0000
beq 7f
#else
lsls r3, r3, #16
bne 7f
#endif
ldrh w2, [wp2]
SHFT2LSB t1, w1, #16
#ifdef __ARMEB__
lsl w2, w2, #16
#endif
b 8f
6:
SHFT2MSB w2, w2, #16
SHFT2LSB t1, w1, #16
4:
SHFT2LSB w2, w2, #16
b 8f
/* Critical inner Loop: Block with 1 byte initial overlap */
.p2align 2
3:
and t1, w1, #LSB
cmp t1, w2, SHFT2LSB #24
sub r3, w1, b1
bic r3, r3, w1
bne 4f
ands r3, r3, b1, lsl #7
it eq
ldreq w2, [wp2], #4
bne 5f
eor t1, t1, w1
cmp t1, w2, SHFT2MSB #8
bne 6f
ldr w1, [wp1], #4
b 3b
4:
SHFT2LSB w2, w2, #24
b 8f
5:
/* The syndrome value may contain false ones if the string ends
* with the bytes 0x01 0x00
*/
tst w1, #LSB
beq 7f
ldr w2, [wp2], #4
6:
SHFT2LSB t1, w1, #8
bic w2, w2, #MSB
b 8f
7:
mov r0, #0
ldr r4, [sp], #4
ldr r5, [sp], #4
bx lr
8:
and r2, t1, #LSB
and r0, w2, #LSB
cmp r0, #1
it cs
cmpcs r0, r2
itt eq
SHFT2LSBEQ t1, t1, #8
SHFT2LSBEQ w2, w2, #8
beq 8b
sub r0, r2, r0
ldr r4, [sp], #4
ldr r5, [sp], #4
bx lr
END(strcmp)
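The aligned loop of this removed strcmp checks four characters at a time using its 'magic' constant (magic1/magic2 above). A small C sketch of just that zero-byte test; the function name is illustrative only:

    #include <stdbool.h>
    #include <stdint.h>

    /* With ones = 0x01010101, (w - ones) & ~w & (ones << 7) is nonzero
     * exactly when some byte of w is zero; ones << 7 is the 0x80808080
     * mask the assembly forms with "magic2(r4)". */
    static bool word_has_zero_byte(uint32_t w)
    {
        const uint32_t ones = 0x01010101u;
        return ((w - ones) & ~w & (ones << 7)) != 0;
    }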