[AArch64] Improve strcmp performance for misaligned strings
This patch was originally written by Siddhesh Poyarekar and pushed on cortex-strings [1]. Replace the simple byte-wise compare in the misaligned case with a dword compare with page boundary checks in place. For simplicity its uses a 4K page boundary so that it does not have to query the actual page size on the system. Comparison on the default bionic and proposed optimized routines shows the following performance improvements on A64 (using the new proposed memcmp input data from test_strcmp.xml): - Small improvement for aligned arguments with sizes up to 56 bytes (from 10% to 20%). - Large improvements for unaligned arguments for small sizes (from 3 to 256 bytes). Benchmark Time CPU Time Old Time New CPU Old CPU New ------------------------------------------------------------------------------------------------------------------- BM_string_strcmp/1/0/0 +0.0034 +0.0034 11 11 11 11 BM_string_strcmp/2/0/0 +0.0000 +0.0000 11 11 11 11 BM_string_strcmp/3/0/0 -0.1726 -0.1726 11 9 11 9 BM_string_strcmp/4/0/0 -0.1726 -0.1726 11 9 11 9 BM_string_strcmp/5/0/0 -0.1726 -0.1726 11 9 11 9 BM_string_strcmp/6/0/0 -0.1719 -0.1719 11 9 11 9 BM_string_strcmp/7/0/0 -0.1724 -0.1724 11 9 11 9 BM_string_strcmp/8/0/0 -0.1718 -0.1718 11 9 11 9 BM_string_strcmp/9/0/0 -0.2008 -0.2008 16 13 16 13 BM_string_strcmp/10/0/0 -0.2008 -0.2008 16 13 16 13 BM_string_strcmp/11/0/0 -0.2040 -0.2040 16 13 16 13 BM_string_strcmp/12/0/0 -0.1991 -0.1991 16 13 16 13 BM_string_strcmp/13/0/0 -0.1997 -0.1997 16 13 16 13 BM_string_strcmp/14/0/0 -0.1988 -0.1989 16 13 16 13 BM_string_strcmp/15/0/0 -0.2006 -0.2006 16 13 16 13 BM_string_strcmp/16/0/0 -0.2043 -0.2043 16 13 16 13 BM_string_strcmp/24/0/0 -0.1927 -0.1927 18 15 18 15 BM_string_strcmp/32/0/0 -0.1743 -0.1743 20 17 20 17 BM_string_strcmp/40/0/0 -0.1427 -0.1427 22 19 22 19 BM_string_strcmp/48/0/0 -0.1053 -0.1053 24 22 24 22 BM_string_strcmp/56/0/0 -0.0805 -0.0805 26 24 26 24 BM_string_strcmp/64/0/0 -0.0454 -0.0454 28 27 28 27 BM_string_strcmp/72/0/0 -0.0303 -0.0303 30 29 30 29 BM_string_strcmp/80/0/0 -0.0111 -0.0111 32 32 32 32 BM_string_strcmp/88/0/0 -0.0004 -0.0004 34 34 34 34 BM_string_strcmp/96/0/0 -0.0058 -0.0058 37 37 37 37 BM_string_strcmp/104/0/0 +0.0000 +0.0000 40 40 40 40 BM_string_strcmp/112/0/0 -0.0457 -0.0457 61 58 61 58 BM_string_strcmp/120/0/0 -0.0486 -0.0487 61 58 61 58 BM_string_strcmp/128/0/0 -0.0499 -0.0499 64 61 64 61 BM_string_strcmp/136/0/0 -0.0529 -0.0529 66 63 66 63 BM_string_strcmp/144/0/0 -0.0492 -0.0492 69 66 69 66 BM_string_strcmp/160/0/0 -0.0459 -0.0459 74 71 74 71 BM_string_strcmp/176/0/0 -0.0400 -0.0401 79 76 79 76 BM_string_strcmp/192/0/0 -0.0378 -0.0378 85 81 85 81 BM_string_strcmp/208/0/0 -0.0009 -0.0009 89 89 89 89 BM_string_strcmp/224/0/0 -0.0003 -0.0003 95 95 95 95 BM_string_strcmp/240/0/0 -0.0320 -0.0320 100 96 100 96 BM_string_strcmp/256/0/0 -0.0303 -0.0304 105 102 105 102 BM_string_strcmp/512/0/0 -0.0171 -0.0171 187 183 187 183 BM_string_strcmp/1024/0/0 -0.0091 -0.0091 350 347 350 347 BM_string_strcmp/8192/0/0 -0.0030 -0.0031 2668 2660 2668 2660 BM_string_strcmp/16384/0/0 +0.0007 +0.0007 5449 5452 5448 5452 BM_string_strcmp/32768/0/0 +0.0635 +0.0635 10868 11558 10867 11557 BM_string_strcmp/65536/0/0 -0.0017 -0.0017 21824 21786 21822 21784 BM_string_strcmp/131072/0/0 +0.0012 +0.0012 43485 43536 43480 43532 BM_string_strcmp/1/4/0 +0.7630 +0.7630 7 12 7 12 BM_string_strcmp/2/4/0 +0.9265 +0.9265 12 23 12 23 BM_string_strcmp/3/4/0 -0.0000 -0.0000 14 14 14 14 BM_string_strcmp/4/4/0 +0.0372 +0.0372 19 19 19 19 BM_string_strcmp/6/4/0 -0.0921 -0.0921 20 19 20 19 BM_string_strcmp/7/4/0 -0.0291 -0.0291 19 19 19 19 BM_string_strcmp/8/4/0 +0.0648 +0.0648 20 22 20 22 BM_string_strcmp/9/4/0 +0.0001 -0.0055 22 22 22 22 BM_string_strcmp/10/4/0 -0.1924 -0.1924 23 19 23 19 BM_string_strcmp/11/4/0 -0.2347 -0.2347 24 19 24 19 BM_string_strcmp/12/4/0 -0.2738 -0.2739 26 19 26 19 BM_string_strcmp/13/4/0 -0.3804 -0.3804 42 26 42 26 BM_string_strcmp/14/4/0 -0.3581 -0.3582 41 26 41 26 BM_string_strcmp/15/4/0 -0.3905 -0.3905 43 26 43 26 BM_string_strcmp/16/4/0 -0.4068 -0.4068 44 26 44 26 BM_string_strcmp/24/4/0 -0.4917 -0.4917 57 29 57 29 BM_string_strcmp/32/4/0 -0.5607 -0.5607 70 31 70 31 BM_string_strcmp/40/4/0 -0.5940 -0.5940 82 33 82 33 BM_string_strcmp/48/4/0 -0.5303 -0.5302 95 45 95 45 BM_string_strcmp/56/4/0 -0.4975 -0.4975 108 54 108 54 BM_string_strcmp/64/4/0 -0.5167 -0.5167 121 58 121 58 BM_string_strcmp/72/4/0 -0.5325 -0.5325 133 62 133 62 BM_string_strcmp/80/4/0 -0.5523 -0.5523 146 65 146 65 BM_string_strcmp/88/4/0 -0.5686 -0.5686 159 69 159 69 BM_string_strcmp/96/4/0 -0.5815 -0.5815 172 72 172 72 BM_string_strcmp/104/4/0 -0.5931 -0.5931 185 75 185 75 BM_string_strcmp/112/4/0 -0.6046 -0.6046 197 78 197 78 BM_string_strcmp/120/4/0 -0.6113 -0.6113 210 82 210 82 BM_string_strcmp/128/4/0 -0.6186 -0.6186 223 85 223 85 BM_string_strcmp/136/4/0 -0.6278 -0.6278 237 88 237 88 BM_string_strcmp/144/4/0 -0.6410 -0.6410 253 91 253 91 BM_string_strcmp/160/4/0 -0.6506 -0.6506 280 98 280 98 BM_string_strcmp/176/4/0 -0.6593 -0.6593 304 104 304 104 BM_string_strcmp/192/4/0 -0.6647 -0.6647 330 111 330 111 BM_string_strcmp/208/4/0 -0.6741 -0.6741 357 116 357 116 BM_string_strcmp/224/4/0 -0.6761 -0.6761 381 123 381 123 BM_string_strcmp/240/4/0 -0.6824 -0.6824 406 129 406 129 BM_string_strcmp/256/4/0 -0.6846 -0.6846 432 136 432 136 BM_string_strcmp/1/0/4 +1.0024 +1.0024 7 14 7 14 BM_string_strcmp/2/0/4 +0.1591 +0.1591 12 14 12 14 BM_string_strcmp/3/0/4 -0.0015 -0.0015 14 14 14 14 BM_string_strcmp/4/0/4 -0.0809 -0.0809 15 14 15 14 BM_string_strcmp/5/0/4 -0.1535 -0.1536 17 14 17 14 BM_string_strcmp/6/0/4 -0.2111 -0.2111 18 14 18 14 BM_string_strcmp/7/0/4 -0.2650 -0.2650 19 14 19 14 BM_string_strcmp/8/0/4 -0.3118 -0.3118 20 14 20 14 BM_string_strcmp/9/0/4 -0.1741 -0.1740 22 18 22 18 BM_string_strcmp/10/0/4 -0.2201 -0.2201 23 18 23 18 BM_string_strcmp/11/0/4 -0.2610 -0.2610 24 18 24 18 BM_string_strcmp/12/0/4 -0.2987 -0.2987 26 18 26 18 BM_string_strcmp/13/0/4 -0.5748 -0.5748 42 18 42 18 BM_string_strcmp/14/0/4 -0.5796 -0.5796 43 18 43 18 BM_string_strcmp/15/0/4 -0.6167 -0.6167 47 18 47 18 BM_string_strcmp/16/0/4 -0.6303 -0.6303 49 18 49 18 BM_string_strcmp/24/0/4 -0.6557 -0.6557 61 21 61 21 BM_string_strcmp/32/0/4 -0.6612 -0.6612 70 24 70 24 BM_string_strcmp/40/0/4 -0.6812 -0.6813 82 26 82 26 BM_string_strcmp/48/0/4 -0.6974 -0.6974 95 29 95 29 BM_string_strcmp/56/0/4 -0.7151 -0.7151 108 31 108 31 BM_string_strcmp/64/0/4 -0.5717 -0.5717 121 52 121 52 BM_string_strcmp/72/0/4 -0.5927 -0.5927 134 54 134 54 BM_string_strcmp/80/0/4 -0.6004 -0.6004 146 58 146 58 BM_string_strcmp/88/0/4 -0.6145 -0.6145 159 61 159 61 BM_string_strcmp/96/0/4 -0.6287 -0.6287 172 64 172 64 BM_string_strcmp/104/0/4 -0.6351 -0.6351 185 67 185 67 BM_string_strcmp/112/0/4 -0.6423 -0.6423 197 71 197 71 BM_string_strcmp/120/0/4 -0.6489 -0.6489 210 74 210 74 BM_string_strcmp/128/0/4 -0.6578 -0.6578 223 76 223 76 BM_string_strcmp/136/0/4 -0.6597 -0.6597 236 80 236 80 BM_string_strcmp/144/0/4 -0.6674 -0.6674 250 83 250 83 BM_string_strcmp/160/0/4 -0.6751 -0.6751 274 89 274 89 BM_string_strcmp/176/0/4 -0.6798 -0.6798 300 96 300 96 BM_string_strcmp/192/0/4 -0.6873 -0.6855 327 102 325 102 BM_string_strcmp/208/0/4 -0.6903 -0.6903 351 109 351 109 BM_string_strcmp/224/0/4 -0.6907 -0.6907 376 116 376 116 BM_string_strcmp/240/0/4 -0.6897 -0.6897 402 125 402 125 BM_string_strcmp/256/0/4 -0.6937 -0.6937 427 131 427 131 BM_string_strcmp/1/4/4 +0.0009 +0.0009 14 14 14 14 BM_string_strcmp/2/4/4 -0.2229 -0.2229 14 11 14 11 BM_string_strcmp/3/4/4 -0.2256 -0.2256 14 11 14 11 BM_string_strcmp/4/4/4 -0.2241 -0.2240 14 11 14 11 BM_string_strcmp/5/4/4 -0.2220 -0.2220 20 15 20 15 BM_string_strcmp/6/4/4 -0.2267 -0.2267 20 15 20 15 BM_string_strcmp/7/4/4 -0.2228 -0.2227 20 15 20 15 BM_string_strcmp/8/4/4 -0.2219 -0.2219 20 15 20 15 BM_string_strcmp/9/4/4 -0.2220 -0.2220 20 15 20 15 BM_string_strcmp/10/4/4 -0.2227 -0.2227 20 15 20 15 BM_string_strcmp/11/4/4 -0.2210 -0.2210 20 15 20 15 BM_string_strcmp/12/4/4 -0.2224 -0.2224 20 15 20 15 BM_string_strcmp/13/4/4 -0.1778 -0.1778 21 17 21 17 BM_string_strcmp/14/4/4 -0.1863 -0.1863 21 17 21 17 BM_string_strcmp/15/4/4 -0.1780 -0.1780 21 17 21 17 BM_string_strcmp/16/4/4 +0.0031 +0.0031 21 21 21 21 BM_string_strcmp/24/4/4 +0.0041 +0.0041 24 24 24 24 BM_string_strcmp/32/4/4 -0.0001 -0.0000 25 25 25 25 BM_string_strcmp/40/4/4 +0.0016 +0.0016 26 26 26 26 BM_string_strcmp/48/4/4 +0.0001 +0.0001 28 28 28 28 BM_string_strcmp/56/4/4 -0.0001 -0.0001 30 30 30 30 BM_string_strcmp/64/4/4 -0.0342 -0.0342 32 31 32 31 BM_string_strcmp/72/4/4 -0.0186 -0.0186 34 34 34 34 BM_string_strcmp/80/4/4 +0.0004 +0.0004 36 36 36 36 BM_string_strcmp/88/4/4 -0.0000 -0.0000 39 39 39 39 BM_string_strcmp/96/4/4 -0.0510 -0.0510 62 59 62 59 BM_string_strcmp/104/4/4 -0.0502 -0.0502 63 60 63 60 BM_string_strcmp/112/4/4 -0.0490 -0.0490 65 62 65 62 BM_string_strcmp/120/4/4 -0.0387 -0.0387 67 65 67 65 BM_string_strcmp/128/4/4 -0.0426 -0.0426 70 67 70 67 BM_string_strcmp/136/4/4 -0.0408 -0.0408 73 70 73 70 BM_string_strcmp/144/4/4 -0.0194 -0.0194 75 74 75 74 BM_string_strcmp/160/4/4 -0.0035 -0.0035 81 81 81 81 BM_string_strcmp/176/4/4 -0.0001 -0.0001 86 86 86 86 BM_string_strcmp/192/4/4 -0.0002 -0.0002 91 91 91 91 BM_string_strcmp/208/4/4 -0.0335 -0.0335 96 93 96 93 BM_string_strcmp/224/4/4 -0.0314 -0.0314 101 98 101 98 BM_string_strcmp/240/4/4 -0.0303 -0.0303 106 103 106 103 BM_string_strcmp/256/4/4 -0.0288 -0.0288 111 108 111 108 [1] Commit id: f98f2a6780d686ca3d44f8011c7823d42d9b083a Test: bionic tests and benchmarks on aarch64. Change-Id: I75f8948782b8bd459d21f15e75e1d420905f5e5a
This commit is contained in:
parent
4ab56af82d
commit
b42ff1b5c3
1 changed files with 39 additions and 10 deletions
|
@ -32,6 +32,8 @@
|
|||
|
||||
#include <private/bionic_asm.h>
|
||||
|
||||
#define L(label) .L ## label
|
||||
|
||||
#define REP8_01 0x0101010101010101
|
||||
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
||||
#define REP8_80 0x8080808080808080
|
||||
|
@ -61,24 +63,25 @@ ENTRY(strcmp)
|
|||
eor tmp1, src1, src2
|
||||
mov zeroones, #REP8_01
|
||||
tst tmp1, #7
|
||||
b.ne .Lmisaligned8
|
||||
b.ne L(misaligned8)
|
||||
ands tmp1, src1, #7
|
||||
b.ne .Lmutual_align
|
||||
b.ne L(mutual_align)
|
||||
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
|
||||
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
|
||||
can be done in parallel across the entire word. */
|
||||
.Lloop_aligned:
|
||||
L(loop_aligned):
|
||||
ldr data1, [src1], #8
|
||||
ldr data2, [src2], #8
|
||||
.Lstart_realigned:
|
||||
L(start_realigned):
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, #REP8_7f
|
||||
eor diff, data1, data2 /* Non-zero if differences found. */
|
||||
bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
|
||||
orr syndrome, diff, has_nul
|
||||
cbz syndrome, .Lloop_aligned
|
||||
cbz syndrome, L(loop_aligned)
|
||||
/* End of performance-critical section -- one 64B cache line. */
|
||||
|
||||
L(end):
|
||||
#ifndef __AARCH64EB__
|
||||
rev syndrome, syndrome
|
||||
rev data1, data1
|
||||
|
@ -129,7 +132,7 @@ ENTRY(strcmp)
|
|||
ret
|
||||
#endif
|
||||
|
||||
.Lmutual_align:
|
||||
L(mutual_align):
|
||||
/* Sources are mutually aligned, but are not currently at an
|
||||
alignment boundary. Round down the addresses and then mask off
|
||||
the bytes that preceed the start point. */
|
||||
|
@ -149,15 +152,41 @@ ENTRY(strcmp)
|
|||
#endif
|
||||
orr data1, data1, tmp2
|
||||
orr data2, data2, tmp2
|
||||
b .Lstart_realigned
|
||||
b L(start_realigned)
|
||||
|
||||
.Lmisaligned8:
|
||||
/* We can do better than this. */
|
||||
L(misaligned8):
|
||||
/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
|
||||
checking to make sure that we don't access beyond page boundary in
|
||||
SRC2. */
|
||||
tst src1, #7
|
||||
b.eq L(loop_misaligned)
|
||||
L(do_misaligned):
|
||||
ldrb data1w, [src1], #1
|
||||
ldrb data2w, [src2], #1
|
||||
cmp data1w, #1
|
||||
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
|
||||
b.eq .Lmisaligned8
|
||||
b.ne L(done)
|
||||
tst src1, #7
|
||||
b.ne L(do_misaligned)
|
||||
|
||||
L(loop_misaligned):
|
||||
/* Test if we are within the last dword of the end of a 4K page. If
|
||||
yes then jump back to the misaligned loop to copy a byte at a time. */
|
||||
and tmp1, src2, #0xff8
|
||||
eor tmp1, tmp1, #0xff8
|
||||
cbz tmp1, L(do_misaligned)
|
||||
ldr data1, [src1], #8
|
||||
ldr data2, [src2], #8
|
||||
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, #REP8_7f
|
||||
eor diff, data1, data2 /* Non-zero if differences found. */
|
||||
bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
|
||||
orr syndrome, diff, has_nul
|
||||
cbz syndrome, L(loop_misaligned)
|
||||
b L(end)
|
||||
|
||||
L(done):
|
||||
sub result, data1, data2
|
||||
ret
|
||||
END(strcmp)
|
||||
|
|
Loading…
Reference in a new issue