5b5d6e7045
Add 32-bit bionic implementation for denver. Use denver version of memcpy/ memset. Use Cortex-A15 version of strlen/strcat/strcpy/strcmp. Change-Id: I4c6b675f20cf41a29cadf70a11d1635d7df5b30a
234 lines
7.7 KiB
ArmAsm
234 lines
7.7 KiB
ArmAsm
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 * Copyright (c) 2013-2014, NVIDIA Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/* Cache line size, in bytes, used to space the source prefetches. */
#define CACHE_LINE_SIZE     (64)
/*
 * Offset, in bytes, ahead of the current source pointer that the main
 * copy loops prefetch: six cache lines (384 bytes).
 */
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*6)

/*
 * MEMCPY_BASE: forward memory copy built on NEON loads and stores.
 *
 * ENTRY_PRIVATE, END and the MEMCPY_BASE symbol name are macros that are
 * defined outside this file.
 *
 * On entry:
 *   r0         = destination address
 *   r1         = source address
 *   r2         = number of bytes to copy
 *   [sp]       = saved r0
 *   [sp, #4]   = saved lr
 *
 * The routine never pushes anything itself: the CFI directives below
 * describe r0 and lr as already saved at the top of the stack, and every
 * exit path is "pop {r0, pc}".  The code that enters here must therefore
 * have pushed {r0, lr} first.
 *
 * On exit:
 *   r0         = the value popped from the stack (presumably the original
 *                destination pointer saved by the caller)
 *
 * Clobbers: r1, r2, r3, ip, condition flags, q0, q1 (d0-d3).
 *
 * The copy always runs from low to high addresses.  The only overlap
 * related check is the early exit when source and destination are equal.
 *
 * Size dispatch idiom used throughout: "movs ip, rX, lsl #n" moves two
 * adjacent bits of a byte count into the N and C flags.  The loads and
 * stores that follow do not modify the flags, so one shift selects two
 * copy sizes.
 */
ENTRY_PRIVATE(MEMCPY_BASE)
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* nothing to do for a zero length or when src == dst */
        cmp         r2, #0
        beq         .L_memcpy_done
        cmp         r0, r1
        beq         .L_memcpy_done

        /* preload next cache line */
        pld         [r1, #CACHE_LINE_SIZE*1]

        /* Deal with very small blocks (< 32bytes) asap */
        cmp         r2, #32
        blo         .L_memcpy_lt_32bytes
        /* no need to align if len < 128 bytes */
        cmp         r2, #128
        blo         .L_memcpy_lt_128bytes

        /* large copy, align dest to 64 byte boundary */
        pld         [r1, #CACHE_LINE_SIZE*2]
        /* r3 = (-dst) & 63 = bytes needed to reach the next 64 byte boundary */
        rsb         r3, r0, #0
        ands        r3, r3, #0x3F
        pld         [r1, #CACHE_LINE_SIZE*3]
        beq         .L_memcpy_dispatch
        /* account for the alignment bytes copied below */
        sub         r2, r2, r3
        /* copy 1 byte */
        movs        ip, r3, lsl #31         /* N = bit 0 of r3, C = bit 1 of r3 */
        itt         mi
        ldrbmi      ip, [r1], #1
        strbmi      ip, [r0], #1
        /* copy 2 bytes */
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* copy 4 bytes */
        movs        ip, r3, lsl #29         /* N = bit 2 of r3, C = bit 3 of r3 */
        itt         mi
        ldrmi       ip, [r1], #4
        strmi       ip, [r0], #4
        /* copy 8 bytes */
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
1:      /* copy 16 bytes */
        movs        ip, r3, lsl #27         /* N = bit 4 of r3, C = bit 5 of r3 */
        bpl         1f
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0, :128]!
1:      /* copy 32 bytes */
        bcc         .L_memcpy_dispatch
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!

.L_memcpy_dispatch:
        // pre-decrement by 128 to detect nearly-done condition easily, but
        // also need to check if we have less than 128 bytes left at this
        // point due to alignment code above
        subs        r2, r2, #128
        blo         .L_memcpy_lt_128presub

        // Denver does better if both source and dest are aligned so
        // we'll special-case that even though the code is virtually identical
        tst         r1, #0xF
        bne         .L_memcpy_neon_unalign_src_pld

        // DRAM memcpy should be throttled slightly to get full bandwidth
        //
        // r2 has already been reduced by 128 here; counts above 32768 take
        // the loop that carries no source alignment hints.
        cmp         r2, #32768
        bhi         .L_memcpy_neon_unalign_src_pld
        .align      4
1:
        /* copy 128 bytes in each loop */
        /* the flags set here are consumed by the bhs at the bottom */
        subs        r2, r2, #128

        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!

        bhs         1b
        /* undo the pre-decrement: r2 = bytes still to copy (0..127) */
        adds        r2, r2, #128
        bne         .L_memcpy_lt_128bytes_align
        pop         {r0, pc}

        .align      4
.L_memcpy_neon_unalign_src_pld:
1:
        /* copy 128 bytes in each loop */
        /* the flags set here are consumed by the bhs at the bottom */
        subs        r2, r2, #128

        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!

        bhs         1b
        /* undo the pre-decrement: r2 = bytes still to copy (0..127) */
        adds        r2, r2, #128
        bne         .L_memcpy_lt_128bytes_align
        pop         {r0, pc}

.L_memcpy_lt_128presub:
        /* fewer than 128 bytes were left at dispatch: undo the pre-decrement */
        add         r2, r2, #128
.L_memcpy_lt_128bytes_align:
        /*
         * Tail copy of r2 (< 128) bytes.  Every path that reaches this
         * label has a 64 byte aligned destination, so the stores keep
         * their alignment qualifiers.
         */
        /* copy 64 bytes */
        movs        ip, r2, lsl #26         /* N = bit 5 of r2, C = bit 6 of r2 */
        bcc         1f
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
1:      /* copy 32 bytes */
        bpl         1f
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
1:      /* copy 16 bytes */
        movs        ip, r2, lsl #28         /* N = bit 3 of r2, C = bit 4 of r2 */
        bcc         1f
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0, :128]!
1:      /* copy 8 bytes */
        bpl         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
1:      /* copy 4 bytes */
        tst         r2, #4
        itt         ne
        ldrne       ip, [r1], #4
        strne       ip, [r0], #4
        /* copy 2 bytes */
        movs        ip, r2, lsl #31         /* N = bit 0 of r2, C = bit 1 of r2 */
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* copy 1 byte */
        /* last access: no pointer write-back, nothing follows it */
        itt         mi
        ldrbmi      ip, [r1]
        strbmi      ip, [r0]

        pop         {r0, pc}

.L_memcpy_lt_128bytes:
        /*
         * Copy of 32..127 bytes with no destination alignment step, so
         * the stores carry no alignment qualifiers.
         */
        /* copy 64 bytes */
        movs        ip, r2, lsl #26         /* N = bit 5 of r2, C = bit 6 of r2 */
        bcc         1f
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
1:      /* copy 32 bytes */
        bpl         .L_memcpy_lt_32bytes
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
.L_memcpy_lt_32bytes:
        /* copy 16 bytes */
        movs        ip, r2, lsl #28         /* N = bit 3 of r2, C = bit 4 of r2 */
        bcc         1f
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0]!
1:      /* copy 8 bytes */
        bpl         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      /* copy 4 bytes */
        tst         r2, #4
        itt         ne
        ldrne       ip, [r1], #4
        strne       ip, [r0], #4
        /* copy 2 bytes */
        movs        ip, r2, lsl #31         /* N = bit 0 of r2, C = bit 1 of r2 */
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* copy 1 byte */
        /* last access: no pointer write-back, nothing follows it */
        itt         mi
        ldrbmi      ip, [r1]
        strbmi      ip, [r0]

.L_memcpy_done:
        /* reload the saved r0 and return through the saved lr */
        pop         {r0, pc}
END(MEMCPY_BASE)