Use cortex-a53/bionic/memmove.S by default for arm64
cortex-a53/bionic/memmove.S looks like a more optimized version and should be used in most cases. It delegates small (<= 96 byte) moves to memcpy. The only exception is denver64, which uses its own memcpy that does not allow overlap for copies shorter than 96 bytes; only for that variant do we need generic/bionic/memmove.S. The benchmark results look pretty close, though (on marlin):

Before: using generic/bionic/memmove.S
-------------------------------------------------------------------
Benchmark                      Time        CPU   Iterations
-------------------------------------------------------------------
BM_string_memcpy/8/0/0           6 ns       6 ns  108872005  1.15787GB/s
BM_string_memcpy/64/0/0          7 ns       7 ns  107387438  9.14365GB/s
BM_string_memcpy/512/0/0        21 ns      20 ns   34165353  23.2734GB/s
BM_string_memcpy/1024/0/0       40 ns      39 ns   17766657  24.2346GB/s
BM_string_memcpy/8192/0/0      311 ns     310 ns    2259904  24.6339GB/s
BM_string_memcpy/16384/0/0     616 ns     613 ns    1143027  24.8852GB/s
BM_string_memcpy/32768/0/0    1322 ns    1316 ns     530799  23.1835GB/s
BM_string_memcpy/65536/0/0    2672 ns    2661 ns     229638  22.937GB/s
BM_string_memcpy/131072/0/0   5379 ns    5357 ns     128316  22.788GB/s

After: using cortex-a53/bionic/memmove.S
-------------------------------------------------------------------
Benchmark                      Time        CPU   Iterations
-------------------------------------------------------------------
BM_string_memcpy/8/0/0           6 ns       6 ns  116610749  1.24646GB/s
BM_string_memcpy/64/0/0          6 ns       6 ns  115634093  9.84708GB/s
BM_string_memcpy/512/0/0        21 ns      21 ns   34167322  22.8938GB/s
BM_string_memcpy/1024/0/0       39 ns      39 ns   17859445  24.3312GB/s
BM_string_memcpy/8192/0/0      311 ns     310 ns    2260192  24.6325GB/s
BM_string_memcpy/16384/0/0     610 ns     608 ns    1151889  25.0987GB/s
BM_string_memcpy/32768/0/0    1488 ns    1482 ns     532508  20.5988GB/s
BM_string_memcpy/65536/0/0    2421 ns    2411 ns     290502  25.3146GB/s
BM_string_memcpy/131072/0/0   5278 ns    5256 ns     132710  23.2234GB/s

Test: Build and benchmark on marlin
Bug: http://b/63992911
Change-Id: Id85961aca18ba841bcbcfe0d8b162843eab30584
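For reference, the dispatch at the top of cortex-a53/bionic/memmove.S (the sub/cmp/ccmp/b.hs sequence visible in the diff below) is roughly equivalent to the following C sketch. The function name and the byte-wise fallback loop are illustrative only, and the small-size path assumes a memcpy that tolerates overlap below 96 bytes, which is exactly what the denver64 memcpy does not guarantee:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Illustrative sketch, not the shipped implementation. */
    void *memmove_sketch(void *dst, const void *src, size_t count) {
        uintptr_t diff = (uintptr_t)dst - (uintptr_t)src;
        if (count <= 96 || diff >= count) {
            /* Small moves, and moves with no harmful forward overlap, are
             * delegated to memcpy (the assembly's "b.hs memcpy"); this
             * relies on the paired memcpy handling such overlaps. */
            return memcpy(dst, src, count);
        }
        if (diff == 0) {
            return dst;  /* mirrors the assembly's "cbz tmp1, 3f" */
        }
        /* Large overlapping forward move: copy backwards from the tail,
         * which is what the unrolled 64-byte loop in the assembly does. */
        unsigned char *d = (unsigned char *)dst + count;
        const unsigned char *s = (const unsigned char *)src + count;
        while (count--) *--d = *--s;
        return dst;
    }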
parent 8aa6d67f2d
commit ece43e14c9

4 changed files with 432 additions and 462 deletions
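One more piece of context before the diff: memmove must produce the original source bytes even when the regions overlap, which is the property the denver64 memcpy does not provide for copies under 96 bytes. A minimal, hypothetical sanity check (the buffer contents and test structure are illustrative, not part of this change):

    #include <assert.h>
    #include <string.h>

    /* Hypothetical check: an overlapping 16-byte shift that memmove must
     * handle correctly, but that an overlap-unaware memcpy may corrupt. */
    int main(void) {
        char buf[32] = "abcdefghijklmnop";
        memmove(buf + 1, buf, 16);  /* dst overlaps src, dst > src */
        assert(memcmp(buf + 1, "abcdefghijklmnop", 16) == 0);
        return 0;
    }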
@@ -1042,45 +1042,15 @@ cc_library_static {
            denver64: {
                srcs: [
                    "arch-arm64/denver64/bionic/memcpy.S",
                    "arch-arm64/denver64/bionic/memmove.S",
                    "arch-arm64/denver64/bionic/memset.S",
                ],
                exclude_srcs: [
                    "arch-arm64/generic/bionic/memcpy.S",
                    "arch-arm64/generic/bionic/memmove.S",
                    "arch-arm64/generic/bionic/memset.S",
                ],
            },
            cortex_a53: {
                srcs: [
                    "arch-arm64/cortex-a53/bionic/memmove.S",
                ],
                exclude_srcs: [
                    "arch-arm64/generic/bionic/memmove.S",
                ],
            },
            cortex_a55: {
                srcs: [
                    "arch-arm64/cortex-a53/bionic/memmove.S",
                ],
                exclude_srcs: [
                    "arch-arm64/generic/bionic/memmove.S",
                ],
            },
            cortex_a73: {
                srcs: [
                    "arch-arm64/cortex-a53/bionic/memmove.S",
                ],
                exclude_srcs: [
                    "arch-arm64/generic/bionic/memmove.S",
                ],
            },
            cortex_a75: {
                srcs: [
                    "arch-arm64/cortex-a53/bionic/memmove.S",
                ],
                exclude_srcs: [
                    "arch-arm64/generic/bionic/memmove.S",
                ],
            },
        },

        mips: {
@@ -1,153 +0,0 @@
/* Copyright (c) 2013, Linaro Limited
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Linaro nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
* Copyright (c) 2015 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
*
* ARMv8-a, AArch64, unaligned accesses, wchar_t is 4 bytes
*/

#include <private/bionic_asm.h>

/* Parameters and result. */
#define dstin x0
#define src x1
#define count x2
#define srcend x3
#define dstend x4
#define tmp1 x5
#define A_l x6
#define A_h x7
#define B_l x8
#define B_h x9
#define C_l x10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l count
#define E_h tmp1

/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
Larger backwards copies are also handled by memcpy. The only remaining
case is forward large copies. The destination is aligned, and an
unrolled loop processes 64 bytes per iteration.
*/

#if defined(WMEMMOVE)
ENTRY(wmemmove)
lsl count, count, #2
#else
ENTRY(memmove)
#endif
sub tmp1, dstin, src
cmp count, 96
ccmp tmp1, count, 2, hi
b.hs memcpy

cbz tmp1, 3f
add dstend, dstin, count
add srcend, src, count

/* Align dstend to 16 byte alignment so that we don't cross cache line
boundaries on both loads and stores. There are at least 96 bytes
to copy, so copy 16 bytes unaligned and then align. The loop
copies 64 bytes per iteration and prefetches one iteration ahead. */

and tmp1, dstend, 15
ldp D_l, D_h, [srcend, -16]
sub srcend, srcend, tmp1
sub count, count, tmp1
ldp A_l, A_h, [srcend, -16]
stp D_l, D_h, [dstend, -16]
ldp B_l, B_h, [srcend, -32]
ldp C_l, C_h, [srcend, -48]
ldp D_l, D_h, [srcend, -64]!
sub dstend, dstend, tmp1
subs count, count, 128
b.ls 2f
nop
1:
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [srcend, -16]
stp B_l, B_h, [dstend, -32]
ldp B_l, B_h, [srcend, -32]
stp C_l, C_h, [dstend, -48]
ldp C_l, C_h, [srcend, -48]
stp D_l, D_h, [dstend, -64]!
ldp D_l, D_h, [srcend, -64]!
subs count, count, 64
b.hi 1b

/* Write the last full set of 64 bytes. The remainder is at most 64
bytes, so it is safe to always copy 64 bytes from the start even if
there is just 1 byte left. */
2:
ldp E_l, E_h, [src, 48]
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [src, 32]
stp B_l, B_h, [dstend, -32]
ldp B_l, B_h, [src, 16]
stp C_l, C_h, [dstend, -48]
ldp C_l, C_h, [src]
stp D_l, D_h, [dstend, -64]
stp E_l, E_h, [dstin, 48]
stp A_l, A_h, [dstin, 32]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
3: ret

#if defined(WMEMMOVE)
END(wmemmove)
#else
END(memmove)
#endif
libc/arch-arm64/denver64/bionic/memmove.S (new file, 329 lines)
@@ -0,0 +1,329 @@
/* Copyright (c) 2014, Linaro Limited
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Linaro nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
*
* ARMv8-a, AArch64
* Unaligned accesses
* wchar_t is 4 bytes
*/

#include <private/bionic_asm.h>

/* Parameters and result. */
#define dstin x0
#define src x1
#define count x2
#define tmp1 x3
#define tmp1w w3
#define tmp2 x4
#define tmp2w w4
#define tmp3 x5
#define tmp3w w5
#define dst x6

#define A_l x7
#define A_h x8
#define B_l x9
#define B_h x10
#define C_l x11
#define C_h x12
#define D_l x13
#define D_h x14

#if defined(WMEMMOVE)
ENTRY(wmemmove)
lsl count, count, #2
#else
ENTRY(memmove)
#endif
cmp dstin, src
b.lo .Ldownwards
add tmp1, src, count
cmp dstin, tmp1
b.hs memcpy /* No overlap. */

/* Upwards move with potential overlap.
* Need to move from the tail backwards. SRC and DST point one
* byte beyond the remaining data to move. */
add dst, dstin, count
add src, src, count
cmp count, #64
b.ge .Lmov_not_short_up

/* Deal with small moves quickly by dropping straight into the
* exit block. */
.Ltail63up:
/* Move up to 48 bytes of data. At this point we only need the
* bottom 6 bits of count to be accurate. */
ands tmp1, count, #0x30
b.eq .Ltail15up
sub dst, dst, tmp1
sub src, src, tmp1
cmp tmp1w, #0x20
b.eq 1f
b.lt 2f
ldp A_l, A_h, [src, #32]
stp A_l, A_h, [dst, #32]
1:
ldp A_l, A_h, [src, #16]
stp A_l, A_h, [dst, #16]
2:
ldp A_l, A_h, [src]
stp A_l, A_h, [dst]
.Ltail15up:
/* Move up to 15 bytes of data. Does not assume additional data
* being moved. */
tbz count, #3, 1f
ldr tmp1, [src, #-8]!
str tmp1, [dst, #-8]!
1:
tbz count, #2, 1f
ldr tmp1w, [src, #-4]!
str tmp1w, [dst, #-4]!
1:
tbz count, #1, 1f
ldrh tmp1w, [src, #-2]!
strh tmp1w, [dst, #-2]!
1:
tbz count, #0, 1f
ldrb tmp1w, [src, #-1]
strb tmp1w, [dst, #-1]
1:
ret

.Lmov_not_short_up:
/* We don't much care about the alignment of DST, but we want SRC
* to be 128-bit (16 byte) aligned so that we don't cross cache line
* boundaries on both loads and stores. */
ands tmp2, src, #15 /* Bytes to reach alignment. */
b.eq 2f
sub count, count, tmp2
/* Move enough data to reach alignment; unlike memcpy, we have to
* be aware of the overlap, which means we can't move data twice. */
tbz tmp2, #3, 1f
ldr tmp1, [src, #-8]!
str tmp1, [dst, #-8]!
1:
tbz tmp2, #2, 1f
ldr tmp1w, [src, #-4]!
str tmp1w, [dst, #-4]!
1:
tbz tmp2, #1, 1f
ldrh tmp1w, [src, #-2]!
strh tmp1w, [dst, #-2]!
1:
tbz tmp2, #0, 1f
ldrb tmp1w, [src, #-1]!
strb tmp1w, [dst, #-1]!
1:

/* There may be less than 63 bytes to go now. */
cmp count, #63
b.le .Ltail63up
2:
subs count, count, #128
b.ge .Lmov_body_large_up
/* Less than 128 bytes to move, so handle 64 here and then jump
* to the tail. */
ldp A_l, A_h, [src, #-64]!
ldp B_l, B_h, [src, #16]
ldp C_l, C_h, [src, #32]
ldp D_l, D_h, [src, #48]
stp A_l, A_h, [dst, #-64]!
stp B_l, B_h, [dst, #16]
stp C_l, C_h, [dst, #32]
stp D_l, D_h, [dst, #48]
tst count, #0x3f
b.ne .Ltail63up
ret

/* Critical loop. Start at a new Icache line boundary. Assuming
* 64 bytes per line this ensures the entire loop is in one line. */
.p2align 6
.Lmov_body_large_up:
/* There are at least 128 bytes to move. */
ldp A_l, A_h, [src, #-16]
ldp B_l, B_h, [src, #-32]
ldp C_l, C_h, [src, #-48]
ldp D_l, D_h, [src, #-64]!
1:
stp A_l, A_h, [dst, #-16]
ldp A_l, A_h, [src, #-16]
stp B_l, B_h, [dst, #-32]
ldp B_l, B_h, [src, #-32]
stp C_l, C_h, [dst, #-48]
ldp C_l, C_h, [src, #-48]
stp D_l, D_h, [dst, #-64]!
ldp D_l, D_h, [src, #-64]!
subs count, count, #64
b.ge 1b
stp A_l, A_h, [dst, #-16]
stp B_l, B_h, [dst, #-32]
stp C_l, C_h, [dst, #-48]
stp D_l, D_h, [dst, #-64]!
tst count, #0x3f
b.ne .Ltail63up
ret


.Ldownwards:
/* For a downwards move we can safely use memcpy provided that
* DST is more than 16 bytes away from SRC. */
sub tmp1, src, #16
cmp dstin, tmp1
b.ls memcpy /* May overlap, but not critically. */

mov dst, dstin /* Preserve DSTIN for return value. */
cmp count, #64
b.ge .Lmov_not_short_down

/* Deal with small moves quickly by dropping straight into the
* exit block. */
.Ltail63down:
/* Move up to 48 bytes of data. At this point we only need the
* bottom 6 bits of count to be accurate. */
ands tmp1, count, #0x30
b.eq .Ltail15down
add dst, dst, tmp1
add src, src, tmp1
cmp tmp1w, #0x20
b.eq 1f
b.lt 2f
ldp A_l, A_h, [src, #-48]
stp A_l, A_h, [dst, #-48]
1:
ldp A_l, A_h, [src, #-32]
stp A_l, A_h, [dst, #-32]
2:
ldp A_l, A_h, [src, #-16]
stp A_l, A_h, [dst, #-16]
.Ltail15down:
/* Move up to 15 bytes of data. Does not assume additional data
being moved. */
tbz count, #3, 1f
ldr tmp1, [src], #8
str tmp1, [dst], #8
1:
tbz count, #2, 1f
ldr tmp1w, [src], #4
str tmp1w, [dst], #4
1:
tbz count, #1, 1f
ldrh tmp1w, [src], #2
strh tmp1w, [dst], #2
1:
tbz count, #0, 1f
ldrb tmp1w, [src]
strb tmp1w, [dst]
1:
ret

.Lmov_not_short_down:
/* We don't much care about the alignment of DST, but we want SRC
* to be 128-bit (16 byte) aligned so that we don't cross cache line
* boundaries on both loads and stores. */
neg tmp2, src
ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
b.eq 2f
sub count, count, tmp2
/* Move enough data to reach alignment; unlike memcpy, we have to
* be aware of the overlap, which means we can't move data twice. */
tbz tmp2, #3, 1f
ldr tmp1, [src], #8
str tmp1, [dst], #8
1:
tbz tmp2, #2, 1f
ldr tmp1w, [src], #4
str tmp1w, [dst], #4
1:
tbz tmp2, #1, 1f
ldrh tmp1w, [src], #2
strh tmp1w, [dst], #2
1:
tbz tmp2, #0, 1f
ldrb tmp1w, [src], #1
strb tmp1w, [dst], #1
1:

/* There may be less than 63 bytes to go now. */
cmp count, #63
b.le .Ltail63down
2:
subs count, count, #128
b.ge .Lmov_body_large_down
/* Less than 128 bytes to move, so handle 64 here and then jump
* to the tail. */
ldp A_l, A_h, [src]
ldp B_l, B_h, [src, #16]
ldp C_l, C_h, [src, #32]
ldp D_l, D_h, [src, #48]
stp A_l, A_h, [dst]
stp B_l, B_h, [dst, #16]
stp C_l, C_h, [dst, #32]
stp D_l, D_h, [dst, #48]
tst count, #0x3f
add src, src, #64
add dst, dst, #64
b.ne .Ltail63down
ret

/* Critical loop. Start at a new cache line boundary. Assuming
* 64 bytes per line this ensures the entire loop is in one line. */
.p2align 6
.Lmov_body_large_down:
/* There are at least 128 bytes to move. */
ldp A_l, A_h, [src, #0]
sub dst, dst, #16 /* Pre-bias. */
ldp B_l, B_h, [src, #16]
ldp C_l, C_h, [src, #32]
ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
1:
stp A_l, A_h, [dst, #16]
ldp A_l, A_h, [src, #16]
stp B_l, B_h, [dst, #32]
ldp B_l, B_h, [src, #32]
stp C_l, C_h, [dst, #48]
ldp C_l, C_h, [src, #48]
stp D_l, D_h, [dst, #64]!
ldp D_l, D_h, [src, #64]!
subs count, count, #64
b.ge 1b
stp A_l, A_h, [dst, #16]
stp B_l, B_h, [dst, #32]
stp C_l, C_h, [dst, #48]
stp D_l, D_h, [dst, #64]
add src, src, #16
add dst, dst, #64 + 16
tst count, #0x3f
b.ne .Ltail63down
ret
#if defined(WMEMMOVE)
END(wmemmove)
#else
END(memmove)
#endif
@@ -1,4 +1,4 @@
/* Copyright (c) 2014, Linaro Limited
/* Copyright (c) 2013, Linaro Limited
All rights reserved.

Redistribution and use in source and binary forms, with or without

@@ -22,14 +22,39 @@
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
* Copyright (c) 2015 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
*
* ARMv8-a, AArch64
* Unaligned accesses
* wchar_t is 4 bytes
* ARMv8-a, AArch64, unaligned accesses, wchar_t is 4 bytes
*/

#include <private/bionic_asm.h>

@@ -38,22 +63,25 @@
#define dstin x0
#define src x1
#define count x2
#define tmp1 x3
#define tmp1w w3
#define tmp2 x4
#define tmp2w w4
#define tmp3 x5
#define tmp3w w5
#define dst x6
#define srcend x3
#define dstend x4
#define tmp1 x5
#define A_l x6
#define A_h x7
#define B_l x8
#define B_h x9
#define C_l x10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l count
#define E_h tmp1

#define A_l x7
#define A_h x8
#define B_l x9
#define B_h x10
#define C_l x11
#define C_h x12
#define D_l x13
#define D_h x14
/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
Larger backwards copies are also handled by memcpy. The only remaining
case is forward large copies. The destination is aligned, and an
unrolled loop processes 64 bytes per iteration.
*/

#if defined(WMEMMOVE)
ENTRY(wmemmove)

@@ -61,267 +89,63 @@ ENTRY(wmemmove)
#else
ENTRY(memmove)
#endif
cmp dstin, src
b.lo .Ldownwards
add tmp1, src, count
cmp dstin, tmp1
b.hs memcpy /* No overlap. */
sub tmp1, dstin, src
cmp count, 96
ccmp tmp1, count, 2, hi
b.hs memcpy

/* Upwards move with potential overlap.
* Need to move from the tail backwards. SRC and DST point one
* byte beyond the remaining data to move. */
add dst, dstin, count
add src, src, count
cmp count, #64
b.ge .Lmov_not_short_up
cbz tmp1, 3f
add dstend, dstin, count
add srcend, src, count

/* Deal with small moves quickly by dropping straight into the
* exit block. */
.Ltail63up:
/* Move up to 48 bytes of data. At this point we only need the
* bottom 6 bits of count to be accurate. */
ands tmp1, count, #0x30
b.eq .Ltail15up
sub dst, dst, tmp1
sub src, src, tmp1
cmp tmp1w, #0x20
b.eq 1f
b.lt 2f
ldp A_l, A_h, [src, #32]
stp A_l, A_h, [dst, #32]
/* Align dstend to 16 byte alignment so that we don't cross cache line
boundaries on both loads and stores. There are at least 96 bytes
to copy, so copy 16 bytes unaligned and then align. The loop
copies 64 bytes per iteration and prefetches one iteration ahead. */

and tmp1, dstend, 15
ldp D_l, D_h, [srcend, -16]
sub srcend, srcend, tmp1
sub count, count, tmp1
ldp A_l, A_h, [srcend, -16]
stp D_l, D_h, [dstend, -16]
ldp B_l, B_h, [srcend, -32]
ldp C_l, C_h, [srcend, -48]
ldp D_l, D_h, [srcend, -64]!
sub dstend, dstend, tmp1
subs count, count, 128
b.ls 2f
nop
1:
ldp A_l, A_h, [src, #16]
stp A_l, A_h, [dst, #16]
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [srcend, -16]
stp B_l, B_h, [dstend, -32]
ldp B_l, B_h, [srcend, -32]
stp C_l, C_h, [dstend, -48]
ldp C_l, C_h, [srcend, -48]
stp D_l, D_h, [dstend, -64]!
ldp D_l, D_h, [srcend, -64]!
subs count, count, 64
b.hi 1b

/* Write the last full set of 64 bytes. The remainder is at most 64
bytes, so it is safe to always copy 64 bytes from the start even if
there is just 1 byte left. */
2:
ldp A_l, A_h, [src]
stp A_l, A_h, [dst]
.Ltail15up:
/* Move up to 15 bytes of data. Does not assume additional data
* being moved. */
tbz count, #3, 1f
ldr tmp1, [src, #-8]!
str tmp1, [dst, #-8]!
1:
tbz count, #2, 1f
ldr tmp1w, [src, #-4]!
str tmp1w, [dst, #-4]!
1:
tbz count, #1, 1f
ldrh tmp1w, [src, #-2]!
strh tmp1w, [dst, #-2]!
1:
tbz count, #0, 1f
ldrb tmp1w, [src, #-1]
strb tmp1w, [dst, #-1]
1:
ret
ldp E_l, E_h, [src, 48]
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [src, 32]
stp B_l, B_h, [dstend, -32]
ldp B_l, B_h, [src, 16]
stp C_l, C_h, [dstend, -48]
ldp C_l, C_h, [src]
stp D_l, D_h, [dstend, -64]
stp E_l, E_h, [dstin, 48]
stp A_l, A_h, [dstin, 32]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
3: ret

.Lmov_not_short_up:
/* We don't much care about the alignment of DST, but we want SRC
* to be 128-bit (16 byte) aligned so that we don't cross cache line
* boundaries on both loads and stores. */
ands tmp2, src, #15 /* Bytes to reach alignment. */
b.eq 2f
sub count, count, tmp2
/* Move enough data to reach alignment; unlike memcpy, we have to
* be aware of the overlap, which means we can't move data twice. */
tbz tmp2, #3, 1f
ldr tmp1, [src, #-8]!
str tmp1, [dst, #-8]!
1:
tbz tmp2, #2, 1f
ldr tmp1w, [src, #-4]!
str tmp1w, [dst, #-4]!
1:
tbz tmp2, #1, 1f
ldrh tmp1w, [src, #-2]!
strh tmp1w, [dst, #-2]!
1:
tbz tmp2, #0, 1f
ldrb tmp1w, [src, #-1]!
strb tmp1w, [dst, #-1]!
1:

/* There may be less than 63 bytes to go now. */
cmp count, #63
b.le .Ltail63up
2:
subs count, count, #128
b.ge .Lmov_body_large_up
/* Less than 128 bytes to move, so handle 64 here and then jump
* to the tail. */
ldp A_l, A_h, [src, #-64]!
ldp B_l, B_h, [src, #16]
ldp C_l, C_h, [src, #32]
ldp D_l, D_h, [src, #48]
stp A_l, A_h, [dst, #-64]!
stp B_l, B_h, [dst, #16]
stp C_l, C_h, [dst, #32]
stp D_l, D_h, [dst, #48]
tst count, #0x3f
b.ne .Ltail63up
ret

/* Critical loop. Start at a new Icache line boundary. Assuming
* 64 bytes per line this ensures the entire loop is in one line. */
.p2align 6
.Lmov_body_large_up:
/* There are at least 128 bytes to move. */
ldp A_l, A_h, [src, #-16]
ldp B_l, B_h, [src, #-32]
ldp C_l, C_h, [src, #-48]
ldp D_l, D_h, [src, #-64]!
1:
stp A_l, A_h, [dst, #-16]
ldp A_l, A_h, [src, #-16]
stp B_l, B_h, [dst, #-32]
ldp B_l, B_h, [src, #-32]
stp C_l, C_h, [dst, #-48]
ldp C_l, C_h, [src, #-48]
stp D_l, D_h, [dst, #-64]!
ldp D_l, D_h, [src, #-64]!
subs count, count, #64
b.ge 1b
stp A_l, A_h, [dst, #-16]
stp B_l, B_h, [dst, #-32]
stp C_l, C_h, [dst, #-48]
stp D_l, D_h, [dst, #-64]!
tst count, #0x3f
b.ne .Ltail63up
ret


.Ldownwards:
/* For a downwards move we can safely use memcpy provided that
* DST is more than 16 bytes away from SRC. */
sub tmp1, src, #16
cmp dstin, tmp1
b.ls memcpy /* May overlap, but not critically. */

mov dst, dstin /* Preserve DSTIN for return value. */
cmp count, #64
b.ge .Lmov_not_short_down

/* Deal with small moves quickly by dropping straight into the
* exit block. */
.Ltail63down:
/* Move up to 48 bytes of data. At this point we only need the
* bottom 6 bits of count to be accurate. */
ands tmp1, count, #0x30
b.eq .Ltail15down
add dst, dst, tmp1
add src, src, tmp1
cmp tmp1w, #0x20
b.eq 1f
b.lt 2f
ldp A_l, A_h, [src, #-48]
stp A_l, A_h, [dst, #-48]
1:
ldp A_l, A_h, [src, #-32]
stp A_l, A_h, [dst, #-32]
2:
ldp A_l, A_h, [src, #-16]
stp A_l, A_h, [dst, #-16]
.Ltail15down:
/* Move up to 15 bytes of data. Does not assume additional data
being moved. */
tbz count, #3, 1f
ldr tmp1, [src], #8
str tmp1, [dst], #8
1:
tbz count, #2, 1f
ldr tmp1w, [src], #4
str tmp1w, [dst], #4
1:
tbz count, #1, 1f
ldrh tmp1w, [src], #2
strh tmp1w, [dst], #2
1:
tbz count, #0, 1f
ldrb tmp1w, [src]
strb tmp1w, [dst]
1:
ret

.Lmov_not_short_down:
/* We don't much care about the alignment of DST, but we want SRC
* to be 128-bit (16 byte) aligned so that we don't cross cache line
* boundaries on both loads and stores. */
neg tmp2, src
ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
b.eq 2f
sub count, count, tmp2
/* Move enough data to reach alignment; unlike memcpy, we have to
* be aware of the overlap, which means we can't move data twice. */
tbz tmp2, #3, 1f
ldr tmp1, [src], #8
str tmp1, [dst], #8
1:
tbz tmp2, #2, 1f
ldr tmp1w, [src], #4
str tmp1w, [dst], #4
1:
tbz tmp2, #1, 1f
ldrh tmp1w, [src], #2
strh tmp1w, [dst], #2
1:
tbz tmp2, #0, 1f
ldrb tmp1w, [src], #1
strb tmp1w, [dst], #1
1:

/* There may be less than 63 bytes to go now. */
cmp count, #63
b.le .Ltail63down
2:
subs count, count, #128
b.ge .Lmov_body_large_down
/* Less than 128 bytes to move, so handle 64 here and then jump
* to the tail. */
ldp A_l, A_h, [src]
ldp B_l, B_h, [src, #16]
ldp C_l, C_h, [src, #32]
ldp D_l, D_h, [src, #48]
stp A_l, A_h, [dst]
stp B_l, B_h, [dst, #16]
stp C_l, C_h, [dst, #32]
stp D_l, D_h, [dst, #48]
tst count, #0x3f
add src, src, #64
add dst, dst, #64
b.ne .Ltail63down
ret

/* Critical loop. Start at a new cache line boundary. Assuming
* 64 bytes per line this ensures the entire loop is in one line. */
.p2align 6
.Lmov_body_large_down:
/* There are at least 128 bytes to move. */
ldp A_l, A_h, [src, #0]
sub dst, dst, #16 /* Pre-bias. */
ldp B_l, B_h, [src, #16]
ldp C_l, C_h, [src, #32]
ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
1:
stp A_l, A_h, [dst, #16]
ldp A_l, A_h, [src, #16]
stp B_l, B_h, [dst, #32]
ldp B_l, B_h, [src, #32]
stp C_l, C_h, [dst, #48]
ldp C_l, C_h, [src, #48]
stp D_l, D_h, [dst, #64]!
ldp D_l, D_h, [src, #64]!
subs count, count, #64
b.ge 1b
stp A_l, A_h, [dst, #16]
stp B_l, B_h, [dst, #32]
stp C_l, C_h, [dst, #48]
stp D_l, D_h, [dst, #64]
add src, src, #16
add dst, dst, #64 + 16
tst count, #0x3f
b.ne .Ltail63down
ret
#if defined(WMEMMOVE)
END(wmemmove)
#else