[AArch64] Improve strcmp performance for misaligned strings

This patch was originally written by Siddhesh Poyarekar and pushed on
cortex-strings [1]. Replace the simple byte-wise compare in the
misaligned case with a dword compare with page boundary checks in
place. For simplicity its uses a 4K page boundary so that it does not
have to query the actual page size on the system.

Comparison on the default bionic and proposed optimized routines
shows the following performance improvements on A64 (using the
new proposed memcmp input data from test_strcmp.xml):

  - Small improvement for aligned arguments with sizes up to 56 bytes
    (from 10% to 20%).

  - Large improvements for unaligned arguments for small sizes (from
    3 to 256 bytes).

Benchmark                              Time             CPU      Time Old      Time New       CPU Old       CPU New
-------------------------------------------------------------------------------------------------------------------
BM_string_strcmp/1/0/0              +0.0034         +0.0034            11            11            11            11
BM_string_strcmp/2/0/0              +0.0000         +0.0000            11            11            11            11
BM_string_strcmp/3/0/0              -0.1726         -0.1726            11             9            11             9
BM_string_strcmp/4/0/0              -0.1726         -0.1726            11             9            11             9
BM_string_strcmp/5/0/0              -0.1726         -0.1726            11             9            11             9
BM_string_strcmp/6/0/0              -0.1719         -0.1719            11             9            11             9
BM_string_strcmp/7/0/0              -0.1724         -0.1724            11             9            11             9
BM_string_strcmp/8/0/0              -0.1718         -0.1718            11             9            11             9
BM_string_strcmp/9/0/0              -0.2008         -0.2008            16            13            16            13
BM_string_strcmp/10/0/0             -0.2008         -0.2008            16            13            16            13
BM_string_strcmp/11/0/0             -0.2040         -0.2040            16            13            16            13
BM_string_strcmp/12/0/0             -0.1991         -0.1991            16            13            16            13
BM_string_strcmp/13/0/0             -0.1997         -0.1997            16            13            16            13
BM_string_strcmp/14/0/0             -0.1988         -0.1989            16            13            16            13
BM_string_strcmp/15/0/0             -0.2006         -0.2006            16            13            16            13
BM_string_strcmp/16/0/0             -0.2043         -0.2043            16            13            16            13
BM_string_strcmp/24/0/0             -0.1927         -0.1927            18            15            18            15
BM_string_strcmp/32/0/0             -0.1743         -0.1743            20            17            20            17
BM_string_strcmp/40/0/0             -0.1427         -0.1427            22            19            22            19
BM_string_strcmp/48/0/0             -0.1053         -0.1053            24            22            24            22
BM_string_strcmp/56/0/0             -0.0805         -0.0805            26            24            26            24
BM_string_strcmp/64/0/0             -0.0454         -0.0454            28            27            28            27
BM_string_strcmp/72/0/0             -0.0303         -0.0303            30            29            30            29
BM_string_strcmp/80/0/0             -0.0111         -0.0111            32            32            32            32
BM_string_strcmp/88/0/0             -0.0004         -0.0004            34            34            34            34
BM_string_strcmp/96/0/0             -0.0058         -0.0058            37            37            37            37
BM_string_strcmp/104/0/0            +0.0000         +0.0000            40            40            40            40
BM_string_strcmp/112/0/0            -0.0457         -0.0457            61            58            61            58
BM_string_strcmp/120/0/0            -0.0486         -0.0487            61            58            61            58
BM_string_strcmp/128/0/0            -0.0499         -0.0499            64            61            64            61
BM_string_strcmp/136/0/0            -0.0529         -0.0529            66            63            66            63
BM_string_strcmp/144/0/0            -0.0492         -0.0492            69            66            69            66
BM_string_strcmp/160/0/0            -0.0459         -0.0459            74            71            74            71
BM_string_strcmp/176/0/0            -0.0400         -0.0401            79            76            79            76
BM_string_strcmp/192/0/0            -0.0378         -0.0378            85            81            85            81
BM_string_strcmp/208/0/0            -0.0009         -0.0009            89            89            89            89
BM_string_strcmp/224/0/0            -0.0003         -0.0003            95            95            95            95
BM_string_strcmp/240/0/0            -0.0320         -0.0320           100            96           100            96
BM_string_strcmp/256/0/0            -0.0303         -0.0304           105           102           105           102
BM_string_strcmp/512/0/0            -0.0171         -0.0171           187           183           187           183
BM_string_strcmp/1024/0/0           -0.0091         -0.0091           350           347           350           347
BM_string_strcmp/8192/0/0           -0.0030         -0.0031          2668          2660          2668          2660
BM_string_strcmp/16384/0/0          +0.0007         +0.0007          5449          5452          5448          5452
BM_string_strcmp/32768/0/0          +0.0635         +0.0635         10868         11558         10867         11557
BM_string_strcmp/65536/0/0          -0.0017         -0.0017         21824         21786         21822         21784
BM_string_strcmp/131072/0/0         +0.0012         +0.0012         43485         43536         43480         43532
BM_string_strcmp/1/4/0              +0.7630         +0.7630             7            12             7            12
BM_string_strcmp/2/4/0              +0.9265         +0.9265            12            23            12            23
BM_string_strcmp/3/4/0              -0.0000         -0.0000            14            14            14            14
BM_string_strcmp/4/4/0              +0.0372         +0.0372            19            19            19            19
BM_string_strcmp/6/4/0              -0.0921         -0.0921            20            19            20            19
BM_string_strcmp/7/4/0              -0.0291         -0.0291            19            19            19            19
BM_string_strcmp/8/4/0              +0.0648         +0.0648            20            22            20            22
BM_string_strcmp/9/4/0              +0.0001         -0.0055            22            22            22            22
BM_string_strcmp/10/4/0             -0.1924         -0.1924            23            19            23            19
BM_string_strcmp/11/4/0             -0.2347         -0.2347            24            19            24            19
BM_string_strcmp/12/4/0             -0.2738         -0.2739            26            19            26            19
BM_string_strcmp/13/4/0             -0.3804         -0.3804            42            26            42            26
BM_string_strcmp/14/4/0             -0.3581         -0.3582            41            26            41            26
BM_string_strcmp/15/4/0             -0.3905         -0.3905            43            26            43            26
BM_string_strcmp/16/4/0             -0.4068         -0.4068            44            26            44            26
BM_string_strcmp/24/4/0             -0.4917         -0.4917            57            29            57            29
BM_string_strcmp/32/4/0             -0.5607         -0.5607            70            31            70            31
BM_string_strcmp/40/4/0             -0.5940         -0.5940            82            33            82            33
BM_string_strcmp/48/4/0             -0.5303         -0.5302            95            45            95            45
BM_string_strcmp/56/4/0             -0.4975         -0.4975           108            54           108            54
BM_string_strcmp/64/4/0             -0.5167         -0.5167           121            58           121            58
BM_string_strcmp/72/4/0             -0.5325         -0.5325           133            62           133            62
BM_string_strcmp/80/4/0             -0.5523         -0.5523           146            65           146            65
BM_string_strcmp/88/4/0             -0.5686         -0.5686           159            69           159            69
BM_string_strcmp/96/4/0             -0.5815         -0.5815           172            72           172            72
BM_string_strcmp/104/4/0            -0.5931         -0.5931           185            75           185            75
BM_string_strcmp/112/4/0            -0.6046         -0.6046           197            78           197            78
BM_string_strcmp/120/4/0            -0.6113         -0.6113           210            82           210            82
BM_string_strcmp/128/4/0            -0.6186         -0.6186           223            85           223            85
BM_string_strcmp/136/4/0            -0.6278         -0.6278           237            88           237            88
BM_string_strcmp/144/4/0            -0.6410         -0.6410           253            91           253            91
BM_string_strcmp/160/4/0            -0.6506         -0.6506           280            98           280            98
BM_string_strcmp/176/4/0            -0.6593         -0.6593           304           104           304           104
BM_string_strcmp/192/4/0            -0.6647         -0.6647           330           111           330           111
BM_string_strcmp/208/4/0            -0.6741         -0.6741           357           116           357           116
BM_string_strcmp/224/4/0            -0.6761         -0.6761           381           123           381           123
BM_string_strcmp/240/4/0            -0.6824         -0.6824           406           129           406           129
BM_string_strcmp/256/4/0            -0.6846         -0.6846           432           136           432           136
BM_string_strcmp/1/0/4              +1.0024         +1.0024             7            14             7            14
BM_string_strcmp/2/0/4              +0.1591         +0.1591            12            14            12            14
BM_string_strcmp/3/0/4              -0.0015         -0.0015            14            14            14            14
BM_string_strcmp/4/0/4              -0.0809         -0.0809            15            14            15            14
BM_string_strcmp/5/0/4              -0.1535         -0.1536            17            14            17            14
BM_string_strcmp/6/0/4              -0.2111         -0.2111            18            14            18            14
BM_string_strcmp/7/0/4              -0.2650         -0.2650            19            14            19            14
BM_string_strcmp/8/0/4              -0.3118         -0.3118            20            14            20            14
BM_string_strcmp/9/0/4              -0.1741         -0.1740            22            18            22            18
BM_string_strcmp/10/0/4             -0.2201         -0.2201            23            18            23            18
BM_string_strcmp/11/0/4             -0.2610         -0.2610            24            18            24            18
BM_string_strcmp/12/0/4             -0.2987         -0.2987            26            18            26            18
BM_string_strcmp/13/0/4             -0.5748         -0.5748            42            18            42            18
BM_string_strcmp/14/0/4             -0.5796         -0.5796            43            18            43            18
BM_string_strcmp/15/0/4             -0.6167         -0.6167            47            18            47            18
BM_string_strcmp/16/0/4             -0.6303         -0.6303            49            18            49            18
BM_string_strcmp/24/0/4             -0.6557         -0.6557            61            21            61            21
BM_string_strcmp/32/0/4             -0.6612         -0.6612            70            24            70            24
BM_string_strcmp/40/0/4             -0.6812         -0.6813            82            26            82            26
BM_string_strcmp/48/0/4             -0.6974         -0.6974            95            29            95            29
BM_string_strcmp/56/0/4             -0.7151         -0.7151           108            31           108            31
BM_string_strcmp/64/0/4             -0.5717         -0.5717           121            52           121            52
BM_string_strcmp/72/0/4             -0.5927         -0.5927           134            54           134            54
BM_string_strcmp/80/0/4             -0.6004         -0.6004           146            58           146            58
BM_string_strcmp/88/0/4             -0.6145         -0.6145           159            61           159            61
BM_string_strcmp/96/0/4             -0.6287         -0.6287           172            64           172            64
BM_string_strcmp/104/0/4            -0.6351         -0.6351           185            67           185            67
BM_string_strcmp/112/0/4            -0.6423         -0.6423           197            71           197            71
BM_string_strcmp/120/0/4            -0.6489         -0.6489           210            74           210            74
BM_string_strcmp/128/0/4            -0.6578         -0.6578           223            76           223            76
BM_string_strcmp/136/0/4            -0.6597         -0.6597           236            80           236            80
BM_string_strcmp/144/0/4            -0.6674         -0.6674           250            83           250            83
BM_string_strcmp/160/0/4            -0.6751         -0.6751           274            89           274            89
BM_string_strcmp/176/0/4            -0.6798         -0.6798           300            96           300            96
BM_string_strcmp/192/0/4            -0.6873         -0.6855           327           102           325           102
BM_string_strcmp/208/0/4            -0.6903         -0.6903           351           109           351           109
BM_string_strcmp/224/0/4            -0.6907         -0.6907           376           116           376           116
BM_string_strcmp/240/0/4            -0.6897         -0.6897           402           125           402           125
BM_string_strcmp/256/0/4            -0.6937         -0.6937           427           131           427           131
BM_string_strcmp/1/4/4              +0.0009         +0.0009            14            14            14            14
BM_string_strcmp/2/4/4              -0.2229         -0.2229            14            11            14            11
BM_string_strcmp/3/4/4              -0.2256         -0.2256            14            11            14            11
BM_string_strcmp/4/4/4              -0.2241         -0.2240            14            11            14            11
BM_string_strcmp/5/4/4              -0.2220         -0.2220            20            15            20            15
BM_string_strcmp/6/4/4              -0.2267         -0.2267            20            15            20            15
BM_string_strcmp/7/4/4              -0.2228         -0.2227            20            15            20            15
BM_string_strcmp/8/4/4              -0.2219         -0.2219            20            15            20            15
BM_string_strcmp/9/4/4              -0.2220         -0.2220            20            15            20            15
BM_string_strcmp/10/4/4             -0.2227         -0.2227            20            15            20            15
BM_string_strcmp/11/4/4             -0.2210         -0.2210            20            15            20            15
BM_string_strcmp/12/4/4             -0.2224         -0.2224            20            15            20            15
BM_string_strcmp/13/4/4             -0.1778         -0.1778            21            17            21            17
BM_string_strcmp/14/4/4             -0.1863         -0.1863            21            17            21            17
BM_string_strcmp/15/4/4             -0.1780         -0.1780            21            17            21            17
BM_string_strcmp/16/4/4             +0.0031         +0.0031            21            21            21            21
BM_string_strcmp/24/4/4             +0.0041         +0.0041            24            24            24            24
BM_string_strcmp/32/4/4             -0.0001         -0.0000            25            25            25            25
BM_string_strcmp/40/4/4             +0.0016         +0.0016            26            26            26            26
BM_string_strcmp/48/4/4             +0.0001         +0.0001            28            28            28            28
BM_string_strcmp/56/4/4             -0.0001         -0.0001            30            30            30            30
BM_string_strcmp/64/4/4             -0.0342         -0.0342            32            31            32            31
BM_string_strcmp/72/4/4             -0.0186         -0.0186            34            34            34            34
BM_string_strcmp/80/4/4             +0.0004         +0.0004            36            36            36            36
BM_string_strcmp/88/4/4             -0.0000         -0.0000            39            39            39            39
BM_string_strcmp/96/4/4             -0.0510         -0.0510            62            59            62            59
BM_string_strcmp/104/4/4            -0.0502         -0.0502            63            60            63            60
BM_string_strcmp/112/4/4            -0.0490         -0.0490            65            62            65            62
BM_string_strcmp/120/4/4            -0.0387         -0.0387            67            65            67            65
BM_string_strcmp/128/4/4            -0.0426         -0.0426            70            67            70            67
BM_string_strcmp/136/4/4            -0.0408         -0.0408            73            70            73            70
BM_string_strcmp/144/4/4            -0.0194         -0.0194            75            74            75            74
BM_string_strcmp/160/4/4            -0.0035         -0.0035            81            81            81            81
BM_string_strcmp/176/4/4            -0.0001         -0.0001            86            86            86            86
BM_string_strcmp/192/4/4            -0.0002         -0.0002            91            91            91            91
BM_string_strcmp/208/4/4            -0.0335         -0.0335            96            93            96            93
BM_string_strcmp/224/4/4            -0.0314         -0.0314           101            98           101            98
BM_string_strcmp/240/4/4            -0.0303         -0.0303           106           103           106           103
BM_string_strcmp/256/4/4            -0.0288         -0.0288           111           108           111           108

[1] Commit id: f98f2a6780d686ca3d44f8011c7823d42d9b083a

Test: bionic tests and benchmarks on aarch64.
Change-Id: I75f8948782b8bd459d21f15e75e1d420905f5e5a
This commit is contained in:
Adhemerval Zanella 2018-06-22 13:35:54 -03:00 committed by Haibo Huang
parent 4ab56af82d
commit b42ff1b5c3

View file

@ -32,6 +32,8 @@
#include <private/bionic_asm.h>
#define L(label) .L ## label
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
@ -61,24 +63,25 @@ ENTRY(strcmp)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
b.ne .Lmisaligned8
b.ne L(misaligned8)
ands tmp1, src1, #7
b.ne .Lmutual_align
b.ne L(mutual_align)
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
.Lloop_aligned:
L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
.Lstart_realigned:
L(start_realigned):
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
orr syndrome, diff, has_nul
cbz syndrome, .Lloop_aligned
cbz syndrome, L(loop_aligned)
/* End of performance-critical section -- one 64B cache line. */
L(end):
#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
@ -129,7 +132,7 @@ ENTRY(strcmp)
ret
#endif
.Lmutual_align:
L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that preceed the start point. */
@ -149,15 +152,41 @@ ENTRY(strcmp)
#endif
orr data1, data1, tmp2
orr data2, data2, tmp2
b .Lstart_realigned
b L(start_realigned)
.Lmisaligned8:
/* We can do better than this. */
L(misaligned8):
/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
checking to make sure that we don't access beyond page boundary in
SRC2. */
tst src1, #7
b.eq L(loop_misaligned)
L(do_misaligned):
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
cmp data1w, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
b.eq .Lmisaligned8
b.ne L(done)
tst src1, #7
b.ne L(do_misaligned)
L(loop_misaligned):
/* Test if we are within the last dword of the end of a 4K page. If
yes then jump back to the misaligned loop to copy a byte at a time. */
and tmp1, src2, #0xff8
eor tmp1, tmp1, #0xff8
cbz tmp1, L(do_misaligned)
ldr data1, [src1], #8
ldr data2, [src2], #8
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
orr syndrome, diff, has_nul
cbz syndrome, L(loop_misaligned)
b L(end)
L(done):
sub result, data1, data2
ret
END(strcmp)