denver: optimize memmove
Optimize the 32-bit Denver memmove by adding a reversed (backward-copying) memcpy path for overlapping buffers. Change-Id: Iaad0a9475248cdd7e4f50d58bea9db1b767abc88
This commit is contained in:
parent
5265ad6273
commit
6c80ccdeed
8 changed files with 355 additions and 4 deletions
|
@ -13,7 +13,6 @@ libc_bionic_src_files_arm := \
|
|||
libc_common_src_files_arm += \
|
||||
bionic/index.cpp \
|
||||
bionic/memchr.c \
|
||||
bionic/memmove.c.arm \
|
||||
bionic/memrchr.c \
|
||||
bionic/strchr.cpp \
|
||||
bionic/strnlen.c \
|
||||
|
|
|
@ -7,3 +7,4 @@ libc_bionic_src_files_arm += \
|
|||
arch-arm/cortex-a15/bionic/strlen.S \
|
||||
arch-arm/cortex-a15/bionic/__strcat_chk.S \
|
||||
arch-arm/cortex-a15/bionic/__strcpy_chk.S \
|
||||
bionic/memmove.c \
|
||||
|
|
|
@ -7,3 +7,4 @@ libc_bionic_src_files_arm += \
|
|||
arch-arm/cortex-a9/bionic/strlen.S \
|
||||
arch-arm/cortex-a9/bionic/__strcat_chk.S \
|
||||
arch-arm/cortex-a9/bionic/__strcpy_chk.S \
|
||||
bionic/memmove.c \
|
||||
|
|
281
libc/arch-arm/denver/bionic/memmove.S
Normal file
281
libc/arch-arm/denver/bionic/memmove.S
Normal file
|
@ -0,0 +1,281 @@
|
|||
/*
|
||||
* Copyright (C) 2013 The Android Open Source Project
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013-2014 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
||||
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
||||
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <private/bionic_asm.h>
|
||||
#include <private/libc_events.h>
|
||||
|
||||
.text
|
||||
.syntax unified
|
||||
.fpu neon
|
||||
|
||||
/* Size in bytes of one cache line; used as the unit for all pld offsets
 * below and matching the 64-byte (0x3f mask) destination alignment. */
#define CACHE_LINE_SIZE (64)
|
||||
/* Remaining byte counts below this value use the "near" prefetch loop. */
#define MEMCPY_BLOCK_SIZE_SMALL (32768)
|
||||
/* Remaining byte counts below this value (and not below the small
 * threshold) use the "mid" prefetch loop; anything larger uses "far". */
#define MEMCPY_BLOCK_SIZE_MID (1048576)
|
||||
/* Base prefetch distances, in bytes, for the near/mid/far loops. NEAR and
 * MID have the same value; the two loops differ in the extra offset they
 * add to it in their pld instructions. */
#define PREFETCH_DISTANCE_NEAR (CACHE_LINE_SIZE*4)
|
||||
#define PREFETCH_DISTANCE_MID (CACHE_LINE_SIZE*4)
|
||||
#define PREFETCH_DISTANCE_FAR (CACHE_LINE_SIZE*16)
|
||||
|
||||
/*
 * memmove for 32-bit ARM using NEON.
 *
 * Register use as seen in the code: r0 = destination, r1 = source,
 * r2 = byte count. r0 is returned unchanged (the reversed path saves it on
 * the stack and pops it back on every exit).
 *
 *  - count == 0 or dst == src: return immediately.
 *  - dst below src (unsigned compare), or count <= dst - src: branch to
 *    memcpy. NOTE(review): the dst < src overlapping case depends on memcpy
 *    copying in ascending address order -- confirm against the memcpy this
 *    is linked with.
 *  - otherwise (dst above src and the ranges overlap): copy backwards,
 *    starting from the end of both buffers, so every source byte is read
 *    before it can be overwritten.
 *
 * In the reversed path r0/r1 hold "one past the last byte still to copy"
 * at every section boundary and r2 holds the number of bytes still to copy.
 */
ENTRY(memmove)
|
||||
/* Nothing to do when the count is zero or both pointers are equal. */
cmp r2, #0
|
||||
cmpne r0, r1
|
||||
bxeq lr
|
||||
/* r3 = dst - src; "ls" (borrow or zero) means dst is not above src. */
subs r3, r0, r1
|
||||
bls .L_jump_to_memcpy
|
||||
/* dst is above src: the ranges overlap only if count > dst - src. */
cmp r2, r3
|
||||
bhi .L_reversed_memcpy
|
||||
|
||||
.L_jump_to_memcpy:
|
||||
/* Tail call: memcpy returns straight to our caller. */
b memcpy
|
||||
|
||||
.L_reversed_memcpy:
|
||||
/* Save the original destination (the return value) and the return address. */
push {r0, lr}
|
||||
.cfi_def_cfa_offset 8
|
||||
.cfi_rel_offset r0, 0
|
||||
.cfi_rel_offset lr, 4
|
||||
|
||||
/* Move both pointers to one past the end of their buffers. */
add r0, r0, r2
|
||||
add r1, r1, r2
|
||||
|
||||
/* preload next cache line */
/* (the copy runs downward, so these are the two lines below the source end) */
|
||||
pld [r1, #-CACHE_LINE_SIZE]
|
||||
pld [r1, #-CACHE_LINE_SIZE*2]
|
||||
|
||||
.L_reversed_memcpy_align_dest:
|
||||
/* Deal with very small blocks (< 32bytes) asap */
|
||||
cmp r2, #32
|
||||
blo .L_reversed_memcpy_lt_32bytes
|
||||
/* no need to align if len < 128 bytes */
|
||||
cmp r2, #128
|
||||
blo .L_reversed_memcpy_lt_128bytes
|
||||
/* align destination to 64 bytes (1 cache line) */
/* r3 = low 6 bits of the destination end pointer = number of bytes to copy
 * so that r0 becomes 64-byte aligned. Each step below handles one bit of
 * r3, lowest first, so every step clears one more low bit of r0; that is
 * what makes the :32/:64/:128/:256 alignment qualifiers on the stores valid. */
|
||||
ands r3, r0, #0x3f
|
||||
beq .L_reversed_memcpy_dispatch
|
||||
sub r2, r2, r3
|
||||
0: /* copy 1 byte */
|
||||
/* Shift sets N = bit 0 of r3 and C = bit 1 of r3; ip is only a scratch
 * destination. The conditional loads/stores do not alter the flags. */
movs ip, r3, lsl #31
|
||||
ldrbmi ip, [r1, #-1]!
|
||||
strbmi ip, [r0, #-1]!
|
||||
1: /* copy 2 bytes */
|
||||
ldrbcs ip, [r1, #-1]!
|
||||
strbcs ip, [r0, #-1]!
|
||||
ldrbcs ip, [r1, #-1]!
|
||||
strbcs ip, [r0, #-1]!
|
||||
2: /* copy 4 bytes */
|
||||
/* N = bit 2 of r3 (4 bytes), C = bit 3 of r3 (8 bytes). */
movs ip, r3, lsl #29
|
||||
bpl 3f
|
||||
sub r1, r1, #4
|
||||
sub r0, r0, #4
|
||||
/* Four single-byte lanes: a 4-byte copy with no source alignment requirement. */
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]
|
||||
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]
|
||||
3: /* copy 8 bytes */
|
||||
/* Still testing C from the lsl #29 above; sub/vld/vst leave flags alone. */
bcc 4f
|
||||
sub r1, r1, #8
|
||||
sub r0, r0, #8
|
||||
vld1.8 {d0}, [r1]
|
||||
vst1.8 {d0}, [r0, :64]
|
||||
4: /* copy 16 bytes */
|
||||
/* N = bit 4 of r3 (16 bytes), C = bit 5 of r3 (32 bytes). */
movs ip, r3, lsl #27
|
||||
bpl 5f
|
||||
sub r1, r1, #16
|
||||
sub r0, r0, #16
|
||||
vld1.8 {q0}, [r1]
|
||||
vst1.8 {q0}, [r0, :128]
|
||||
5: /* copy 32 bytes */
|
||||
bcc .L_reversed_memcpy_dispatch
|
||||
sub r1, r1, #32
|
||||
sub r0, r0, #32
|
||||
vld1.8 {q0, q1}, [r1]
|
||||
vst1.8 {q0, q1}, [r0, :256]
|
||||
|
||||
.L_reversed_memcpy_dispatch:
|
||||
/* preload more cache lines */
|
||||
pld [r1, #-CACHE_LINE_SIZE*3]
|
||||
pld [r1, #-CACHE_LINE_SIZE*4]
|
||||
|
||||
/* Pick a bulk loop by remaining size; r0 is 64-byte aligned here. */
cmp r2, #MEMCPY_BLOCK_SIZE_SMALL
|
||||
blo .L_reversed_memcpy_neon_pld_near
|
||||
cmp r2, #MEMCPY_BLOCK_SIZE_MID
|
||||
blo .L_reversed_memcpy_neon_pld_mid
|
||||
b .L_reversed_memcpy_neon_pld_far
|
||||
|
||||
.L_reversed_memcpy_neon_pld_near:
|
||||
/* less than 128 bytes? */
/* (possible here: the alignment step may have consumed up to 63 bytes) */
|
||||
subs r2, r2, #128
|
||||
blo 1f
|
||||
/* Point r0/r1 at the start of the topmost 32-byte chunk; r3 = -32 is the
 * post-index step that walks both pointers downward. */
sub r1, r1, #32
|
||||
sub r0, r0, #32
|
||||
mov r3, #-32
|
||||
.align 4
|
||||
0:
|
||||
/* copy 128 bytes in each loop */
/* r2 is kept biased by -128; the flags from this subs are consumed by the
 * bhs at the bottom (nothing in between modifies them). */
|
||||
subs r2, r2, #128
|
||||
|
||||
/* preload to cache */
|
||||
pld [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
|
||||
/* copy a cache line */
|
||||
vld1.8 {q0, q1}, [r1], r3
|
||||
vst1.8 {q0, q1}, [r0, :256], r3
|
||||
vld1.8 {q0, q1}, [r1], r3
|
||||
vst1.8 {q0, q1}, [r0, :256], r3
|
||||
|
||||
/* preload to cache */
|
||||
pld [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
|
||||
/* copy a cache line */
|
||||
vld1.8 {q0, q1}, [r1], r3
|
||||
vst1.8 {q0, q1}, [r0, :256], r3
|
||||
vld1.8 {q0, q1}, [r1], r3
|
||||
vst1.8 {q0, q1}, [r0, :256], r3
|
||||
|
||||
bhs 0b
|
||||
/* Undo the -32 bias so r0/r1 are end pointers again. */
add r1, r1, #32
|
||||
add r0, r0, #32
|
||||
1:
|
||||
/* Undo the -128 bias on r2; a non-zero remainder (< 128) goes to the tail. */
adds r2, r2, #128
|
||||
bne .L_reversed_memcpy_lt_128bytes
|
||||
pop {r0, pc}
|
||||
|
||||
.L_reversed_memcpy_neon_pld_mid:
|
||||
/* Same structure as the near loop with a different prefetch offset. There
 * is no early-out branch here: dispatch only reaches this path with
 * r2 >= MEMCPY_BLOCK_SIZE_SMALL, so at least one iteration always runs. */
subs r2, r2, #128
|
||||
sub r1, r1, #32
|
||||
sub r0, r0, #32
|
||||
mov r3, #-32
|
||||
.align 4
|
||||
0:
|
||||
/* copy 128 bytes in each loop */
|
||||
subs r2, r2, #128
|
||||
|
||||
/* preload to cache */
|
||||
pld [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
|
||||
/* copy a cache line */
|
||||
vld1.8 {q0, q1}, [r1], r3
|
||||
vst1.8 {q0, q1}, [r0, :256], r3
|
||||
vld1.8 {q0, q1}, [r1], r3
|
||||
vst1.8 {q0, q1}, [r0, :256], r3
|
||||
|
||||
/* preload to cache */
|
||||
pld [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
|
||||
/* copy a cache line */
|
||||
vld1.8 {q0, q1}, [r1], r3
|
||||
vst1.8 {q0, q1}, [r0, :256], r3
|
||||
vld1.8 {q0, q1}, [r1], r3
|
||||
vst1.8 {q0, q1}, [r0, :256], r3
|
||||
|
||||
bhs 0b
|
||||
add r1, r1, #32
|
||||
add r0, r0, #32
|
||||
1:
|
||||
adds r2, r2, #128
|
||||
bne .L_reversed_memcpy_lt_128bytes
|
||||
pop {r0, pc}
|
||||
|
||||
.L_reversed_memcpy_neon_pld_far:
|
||||
/* Large copies: r2 is biased by -128 and r0/r1 point at the start of the
 * topmost 128-byte block. Each iteration loads the whole block into
 * registers before storing any of it, walking forward inside the block and
 * then stepping back 256 bytes to the start of the next lower block. */
sub r2, r2, #128
|
||||
sub r0, r0, #128
|
||||
sub r1, r1, #128
|
||||
.align 4
|
||||
0:
|
||||
/* copy 128 bytes in each loop */
|
||||
subs r2, r2, #128
|
||||
|
||||
/* preload to cache */
|
||||
pld [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE*2)+128]
|
||||
pld [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE)+128]
|
||||
/* read */
/* q4-q7 are skipped -- presumably because d8-d15 are callee-saved under
 * the AAPCS; confirm if changing the register set. */
|
||||
vld1.8 {q0, q1}, [r1]!
|
||||
vld1.8 {q2, q3}, [r1]!
|
||||
vld1.8 {q8, q9}, [r1]!
|
||||
vld1.8 {q10, q11}, [r1]!
|
||||
/* write */
|
||||
vst1.8 {q0, q1}, [r0, :256]!
|
||||
vst1.8 {q2, q3}, [r0, :256]!
|
||||
vst1.8 {q8, q9}, [r0, :256]!
|
||||
vst1.8 {q10, q11}, [r0, :256]!
|
||||
|
||||
/* Plain sub (not subs): the flags from the subs at the loop top must
 * survive until the bhs below. */
sub r0, r0, #256
|
||||
sub r1, r1, #256
|
||||
bhs 0b
|
||||
/* Undo the -128 bias so r0/r1 are end pointers again. */
add r0, r0, #128
|
||||
add r1, r1, #128
|
||||
1:
|
||||
adds r2, r2, #128
|
||||
bne .L_reversed_memcpy_lt_128bytes
|
||||
pop {r0, pc}
|
||||
|
||||
.L_reversed_memcpy_lt_128bytes:
|
||||
/* Tail: r2 < 128. Each set bit of r2 selects one power-of-two sized copy,
 * largest first. The stores carry no alignment qualifier because this
 * path is also entered directly with an unaligned destination. */
6: /* copy 64 bytes */
|
||||
/* C = bit 6 of r2 (64 bytes), N = bit 5 of r2 (32 bytes). */
movs ip, r2, lsl #26
|
||||
bcc 5f
|
||||
sub r1, r1, #32
|
||||
sub r0, r0, #32
|
||||
vld1.8 {q0, q1}, [r1]
|
||||
vst1.8 {q0, q1}, [r0]
|
||||
sub r1, r1, #32
|
||||
sub r0, r0, #32
|
||||
vld1.8 {q0, q1}, [r1]
|
||||
vst1.8 {q0, q1}, [r0]
|
||||
5: /* copy 32 bytes */
|
||||
bpl 4f
|
||||
sub r1, r1, #32
|
||||
sub r0, r0, #32
|
||||
vld1.8 {q0, q1}, [r1]
|
||||
vst1.8 {q0, q1}, [r0]
|
||||
.L_reversed_memcpy_lt_32bytes:
|
||||
4: /* copy 16 bytes */
|
||||
/* C = bit 4 of r2 (16 bytes), N = bit 3 of r2 (8 bytes). */
movs ip, r2, lsl #28
|
||||
bcc 3f
|
||||
sub r1, r1, #16
|
||||
sub r0, r0, #16
|
||||
vld1.8 {q0}, [r1]
|
||||
vst1.8 {q0}, [r0]
|
||||
3: /* copy 8 bytes */
|
||||
bpl 2f
|
||||
sub r1, r1, #8
|
||||
sub r0, r0, #8
|
||||
vld1.8 {d0}, [r1]
|
||||
vst1.8 {d0}, [r0]
|
||||
2: /* copy 4 bytes */
|
||||
ands ip, r2, #0x4
|
||||
beq 1f
|
||||
sub r1, r1, #4
|
||||
sub r0, r0, #4
|
||||
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]
|
||||
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
|
||||
1: /* copy 2 bytes */
|
||||
/* C = bit 1 of r2 (2 bytes), N = bit 0 of r2 (1 byte). */
movs ip, r2, lsl #31
|
||||
ldrbcs ip, [r1, #-1]!
|
||||
strbcs ip, [r0, #-1]!
|
||||
ldrbcs ip, [r1, #-1]!
|
||||
strbcs ip, [r0, #-1]!
|
||||
0: /* copy 1 byte */
|
||||
ldrbmi ip, [r1, #-1]!
|
||||
strbmi ip, [r0, #-1]!
|
||||
|
||||
/* Restore the original destination as the return value and return. */
pop {r0, pc}
|
||||
|
||||
END(memmove)
|
|
@ -1,12 +1,13 @@
|
|||
libc_bionic_src_files_arm += \
|
||||
arch-arm/denver/bionic/memcpy.S \
|
||||
arch-arm/denver/bionic/memmove.S \
|
||||
arch-arm/denver/bionic/memset.S \
|
||||
arch-arm/denver/bionic/__strcat_chk.S \
|
||||
arch-arm/denver/bionic/__strcpy_chk.S
|
||||
arch-arm/denver/bionic/__strcpy_chk.S \
|
||||
|
||||
# Use cortex-a15 versions of strcat/strcpy/strlen.
|
||||
libc_bionic_src_files_arm += \
|
||||
arch-arm/cortex-a15/bionic/strcat.S \
|
||||
arch-arm/cortex-a15/bionic/strcpy.S \
|
||||
arch-arm/cortex-a15/bionic/strlen.S \
|
||||
arch-arm/cortex-a15/bionic/strcmp.S
|
||||
arch-arm/cortex-a15/bionic/strcmp.S \
|
||||
|
|
|
@ -4,6 +4,7 @@ libc_bionic_src_files_arm += \
|
|||
arch-arm/generic/bionic/strcmp.S \
|
||||
arch-arm/generic/bionic/strcpy.S \
|
||||
arch-arm/generic/bionic/strlen.c \
|
||||
bionic/memmove.c \
|
||||
bionic/__strcat_chk.cpp \
|
||||
bionic/__strcpy_chk.cpp \
|
||||
upstream-openbsd/lib/libc/string/strcat.c \
|
||||
|
|
|
@ -5,8 +5,9 @@ libc_bionic_src_files_arm += \
|
|||
arch-arm/krait/bionic/__strcat_chk.S \
|
||||
arch-arm/krait/bionic/__strcpy_chk.S \
|
||||
|
||||
# Use cortex-a15 versions of strcat/strcpy/strlen.
|
||||
# Use cortex-a15 versions of strcat/strcpy/strlen and standard memmove
|
||||
libc_bionic_src_files_arm += \
|
||||
arch-arm/cortex-a15/bionic/strcat.S \
|
||||
arch-arm/cortex-a15/bionic/strcpy.S \
|
||||
arch-arm/cortex-a15/bionic/strlen.S \
|
||||
bionic/memmove.c \
|
||||
|
|
|
@ -909,6 +909,56 @@ TEST(string, memmove) {
|
|||
}
|
||||
}
|
||||
|
||||
// Runs one memmove() check: zeroes dst, refills src with the first size
// bytes of src_copy, then expects memmove(dst, src, size) to return dst and
// dst to compare equal to src_copy.
// The memset must stay before the memcpy: memmove_check passes a dst that
// overlaps src, so the memcpy has to be the last writer of the shared bytes.
static void verify_memmove(char* src_copy, char* dst, char* src, size_t size) {
|
||||
memset(dst, 0, size);
|
||||
memcpy(src, src_copy, size);
|
||||
ASSERT_EQ(dst, memmove(dst, src, size));
|
||||
// Compare against the pristine copy; src itself may have been overwritten.
ASSERT_EQ(0, memcmp(dst, src_copy, size));
|
||||
}
|
||||
|
||||
#define MEMMOVE_DATA_SIZE (1024*1024*3)
|
||||
|
||||
// Exercises memmove() on overlapping regions (dst = src + 256 + offset, with
// sizes far larger than that gap) for a small, a medium and a large copy.
// The first loop varies the destination misalignment; the second varies the
// number of leftover bytes beyond each base size.
TEST(string, memmove_check) {
  char* buffer = reinterpret_cast<char*>(malloc(MEMMOVE_DATA_SIZE));
  ASSERT_TRUE(buffer != NULL);

  char* src_data = reinterpret_cast<char*>(malloc(MEMMOVE_DATA_SIZE));
  ASSERT_TRUE(src_data != NULL);
  // Initialize to a known pattern to copy into src for each test and
  // to compare dst against.
  for (size_t i = 0; i < MEMMOVE_DATA_SIZE; i++) {
    src_data[i] = (i + 1) % 255;
  }

  // Check all different dst offsets between 0 and 127 inclusive.
  char* src = buffer;
  for (size_t i = 0; i <= 127; i++) {
    char* dst = buffer + 256 + i;
    // Small copy.
    verify_memmove(src_data, dst, src, 1024);

    // Medium copy.
    verify_memmove(src_data, dst, src, 64 * 1024);

    // Large copy.
    verify_memmove(src_data, dst, src, 1024 * 1024 + 128 * 1024);
  }

  // Check all leftover size offsets between 1 and 127 inclusive. The
  // leftover is added to each base size so the copy does not end on a
  // 128-byte multiple. The largest request (256 + 1024 * 1024 + 128 * 1024
  // + 127 bytes from the start of buffer) stays inside MEMMOVE_DATA_SIZE.
  char* dst = buffer + 256;
  src = buffer;
  for (size_t size = 1; size <= 127; size++) {
    // Small copy.
    verify_memmove(src_data, dst, src, 1024 + size);

    // Medium copy.
    verify_memmove(src_data, dst, src, 64 * 1024 + size);

    // Large copy.
    verify_memmove(src_data, dst, src, 1024 * 1024 + 128 * 1024 + size);
  }

  // Release the scratch buffers allocated above.
  free(src_data);
  free(buffer);
}
|
||||
|
||||
TEST(string, bcopy) {
|
||||
StringTestState<char> state(LARGE);
|
||||
for (size_t i = 0; i < state.n; i++) {
|
||||
|
@ -964,6 +1014,22 @@ TEST(string, memcpy_overread) {
|
|||
RunSrcDstBufferOverreadTest(DoMemcpyTest);
|
||||
}
|
||||
|
||||
// Callback handed to RunSrcDstBufferAlignTest and RunSrcDstBufferOverreadTest
// in this file. Fills src with the non-zero byte value (len % 255) + 1,
// zeroes dst, then expects memmove() to return dst and dst to match src.
static void DoMemmoveTest(uint8_t* src, uint8_t* dst, size_t len) {
|
||||
memset(src, (len % 255) + 1, len);
|
||||
memset(dst, 0, len);
|
||||
|
||||
ASSERT_EQ(dst, memmove(dst, src, len));
|
||||
ASSERT_TRUE(memcmp(src, dst, len) == 0);
|
||||
}
|
||||
|
||||
// Drives DoMemmoveTest through RunSrcDstBufferAlignTest with the LARGE limit.
// NOTE(review): the helper is defined outside this view; presumably it
// iterates over src/dst alignment combinations -- confirm.
TEST(string, memmove_align) {
|
||||
RunSrcDstBufferAlignTest(LARGE, DoMemmoveTest);
|
||||
}
|
||||
|
||||
// Drives DoMemmoveTest through RunSrcDstBufferOverreadTest.
// NOTE(review): the helper is defined outside this view; presumably it
// detects reads past the end of the buffers -- confirm.
TEST(string, memmove_overread) {
|
||||
RunSrcDstBufferOverreadTest(DoMemmoveTest);
|
||||
}
|
||||
|
||||
static void DoMemsetTest(uint8_t* buf, size_t len) {
|
||||
for (size_t i = 0; i < len; i++) {
|
||||
buf[i] = 0;
|
||||
|
|
Loading…
Reference in a new issue