platform_bionic/libm/x86_64/s_expm1.S
Jingwei Zhang 5d4f0e6a26 Add the optimized implementation of 18 math functions for x86 and x86_64 respectively
Change-Id: I31bf601448a9427f825517f3a0ff24de47f49bfa
Signed-off-by: Jingwei Zhang <jingwei.zhang@intel.com>
Signed-off-by: Mingwei Shi <mingwei.shi@intel.com>
2015-03-09 13:19:08 -07:00

727 lines
17 KiB
ArmAsm

/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/******************************************************************************/
// ALGORITHM DESCRIPTION
// ---------------------
//
// Description:
// Let K = 64 (table size).
//
// Four sub-domains:
// 1. |x| < 1/(2*K)
//      expm1(x) ~ P(x)
// 2. 1/(2*K) <= |x| <= 56*log(2)
//      e^x - 1 = 2^(x/log(2)) - 1 = 2^n * T[j] * (1 + P(y)) - 1
// 3. 56*log(2) < x < MAX_LOG
//      e^x - 1 ~ e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
// 4. x < -56*log(2)
//      e^x - 1 = -1 + e^x ~ -1
// where
//      x = m*log(2)/K + y, y in [-log(2)/K..log(2)/K]
//      m = n*K + j, m,n,j - signed integer, j in [-K/2..K/2]
//      values of 2^(j/K) are tabulated as T[j] = T_hi[j] ( 1 + T_lo[j]).
//
// P(y) is a minimax polynomial approximation of exp(y)-1
// on the small interval [-log(2)/K..log(2)/K] (coefficients were calculated with Maple V).
//
// In case 3, to avoid problems with arithmetic overflow and underflow,
// the value of 2^n is safely computed as 2^n1 * 2^n2, where n1 in [-BIAS/2..BIAS/2]
// and BIAS is the value of the exponent bias.
//
// Special cases:
// expm1(NaN) is NaN
// expm1(+INF) is +INF
// expm1(-INF) is -1
// expm1(x) is x for subnormals
// for finite argument, only expm1(0)=0 is exact.
// For IEEE double
// if x > 709.782712893383973096 then expm1(x) overflows
//
/******************************************************************************/
#include <private/bionic_asm.h>
# -- Begin expm1
# -- double expm1(double x): returns e^x - 1.
# --   Input:  x in the low lane of %xmm0.
# --   Output: result in the low lane of %xmm0.
# --   Writes: %eax, %ecx, %edx, %r11, %xmm0-%xmm7 and flags; the extreme-exponent
# --           path also uses the x87 stack and temporarily rewrites the x87
# --           control word (restored before returning).
# --   Stack:  56-byte local frame; the routine makes no calls.
# --             0(%rsp)  saved x87 control word
# --             4(%rsp)  modified x87 control word
# --             8(%rsp)  status code 41 or 42 (stored, never read in this routine)
# --            16(%rsp), 24(%rsp)  SSE to x87 transfer slots
# --            32(%rsp)  copy of x
# --            40(%rsp)  result spill slot
# --   Dispatch is on the top 16 bits of |x| (exponent plus 4 mantissa bits):
# --     0x3FB0..0x408F  table-driven path (falls through below)
# --     below 0x3FB0    direct polynomial, tiny and subnormal handling
# --     above 0x408F    overflow, saturation to -1, infinities and NaNs
ENTRY(expm1)
# parameter 1: %xmm0
..B1.1:
..___tag_value_expm1.1:
        subq $56, %rsp
..___tag_value_expm1.3:
        movsd %xmm0, 32(%rsp)  # keep the original x for the special-case paths
..B1.2:
        unpcklpd %xmm0, %xmm0  # x in both lanes
        movapd cv(%rip), %xmm1
        movapd Shifter(%rip), %xmm6
        movapd 16+cv(%rip), %xmm2
        movapd 32+cv(%rip), %xmm3
        pextrw $3, %xmm0, %eax  # top 16 bits of x
        andl $32767, %eax  # drop the sign bit
        movl $16527, %edx
        subl %eax, %edx  # negative when top16 > 0x408F
        subl $16304, %eax  # negative when top16 < 0x3FB0
        orl %eax, %edx
        cmpl $-2147483648, %edx
        jae .L_2TAG_PACKET_0.0.2  # sign bit set: argument outside the table-driven range
# -- Table-driven path: x = m*log(2)/64 + y, m = 64*n + j.
        mulpd %xmm0, %xmm1  # x * cv[0] (approximately 64/log(2))
        addpd %xmm6, %xmm1  # adding 1.5*2^52 rounds to an integer held in the low mantissa bits
        movapd %xmm1, %xmm7  # keep the shifted form: its low dword is the integer m
        subpd %xmm6, %xmm1  # m as a double
        mulpd %xmm1, %xmm2  # m * cv[16]
        movapd 48+cv(%rip), %xmm4
        mulpd %xmm1, %xmm3  # m * cv[32]
        movapd 64+cv(%rip), %xmm5
        subpd %xmm2, %xmm0
        movd %xmm7, %eax  # eax = m
        movl %eax, %ecx
        andl $63, %ecx  # j = m & 63
        shll $4, %ecx  # byte offset of the 16-byte table entry
        sarl $6, %eax  # n = m >> 6 (arithmetic)
        movl %eax, %edx
        subpd %xmm3, %xmm0  # xmm0 = y, the reduced argument, in both lanes
        lea Tbl_addr(%rip), %r11
        movapd (%rcx,%r11), %xmm2  # table entry j
        movq 80+cv(%rip), %xmm3
        mulpd %xmm0, %xmm4
        movapd %xmm0, %xmm1  # xmm1 = y
        mulpd %xmm0, %xmm0  # y^2 in both lanes
        mulsd %xmm0, %xmm3
        addpd %xmm4, %xmm5
        mulsd %xmm0, %xmm0  # low lane becomes y^4, high lane stays y^2
        movq %xmm2, %xmm4  # xmm4 = low qword of the table entry
        unpckhpd %xmm2, %xmm2  # xmm2 = high qword of the table entry
        movdqa mmask(%rip), %xmm6
        pand %xmm6, %xmm7  # clear j from m
        movdqa bias(%rip), %xmm6
        paddq %xmm6, %xmm7  # + 1023*64
        psllq $46, %xmm7  # n + 1023 now sits in the exponent field: xmm7 = 2^n
        mulsd %xmm0, %xmm3  # low lane: cv[80] * y^6
        mulpd %xmm5, %xmm0  # remaining polynomial terms, split across the two lanes
        addl $894, %edx
        cmpl $1916, %edx
        ja .L_2TAG_PACKET_1.0.2  # n outside [-894, 1022]: take the x87 path
# -- Common case. With A = 2^n * (table high qword) and B = 2^n * (table low qword),
# -- 1 + y is split into a truncated head h and a remainder, and the result is
# -- formed as (A + B) * (h + r) - 1 where r collects the remainder and the
# -- higher polynomial terms. The three endings below differ only in where the
# -- -1 is folded in, chosen from the top 16 bits of A.
        addsd %xmm3, %xmm0
        xorpd %xmm3, %xmm3
        movl $16368, %eax
        pinsrw $3, %eax, %xmm3  # xmm3 = 1.0 (top word 0x3FF0)
        orpd %xmm7, %xmm2  # A: merge the exponent of 2^n into the table mantissa
        mulsd %xmm4, %xmm7  # B
        movq %xmm3, %xmm6
        addsd %xmm1, %xmm3  # 1 + y
        pextrw $3, %xmm2, %edx  # top 16 bits of A
        pshufd $238, %xmm0, %xmm5  # high lane of the polynomial
        psrlq $38, %xmm3
        psllq $38, %xmm3  # h = (1 + y) with its low 38 bits cleared
        movq %xmm2, %xmm4
        subsd %xmm3, %xmm6  # 1 - h
        addsd %xmm5, %xmm0  # sum of both polynomial lanes
        addsd %xmm6, %xmm1  # y + (1 - h): the part of 1 + y lost by truncation
        addsd %xmm7, %xmm4  # A + B
        mulsd %xmm3, %xmm7  # B * h
        mulsd %xmm2, %xmm3  # A * h
        xorpd %xmm5, %xmm5
        movl $16368, %eax
        pinsrw $3, %eax, %xmm5  # xmm5 = 1.0
        addsd %xmm1, %xmm0  # r
        movl $17184, %ecx
        subl %edx, %ecx  # negative when top16(A) > 0x4320
        subl $16256, %edx  # negative when top16(A) < 0x3F80
        orl %edx, %ecx
        jl .L_2TAG_PACKET_2.0.2
        mulsd %xmm4, %xmm0  # r * (A + B)
        subsd %xmm5, %xmm3  # A*h - 1
        addsd %xmm7, %xmm0
        addsd %xmm3, %xmm0
.L_2TAG_PACKET_3.0.2:
        jmp ..B1.5
.L_2TAG_PACKET_2.0.2:
        cmpl $0, %edx
        jl .L_2TAG_PACKET_4.0.2
# -- top16(A) > 0x4320: subtract 1 from the small term B*h.
        mulsd %xmm4, %xmm0
        subsd %xmm5, %xmm7  # B*h - 1
        addsd %xmm7, %xmm0
        addsd %xmm3, %xmm0
        jmp ..B1.5
.L_2TAG_PACKET_4.0.2:
# -- top16(A) < 0x3F80: subtract 1 last.
        mulsd %xmm4, %xmm0
        addsd %xmm7, %xmm0
        addsd %xmm3, %xmm0
        subsd %xmm5, %xmm0
        jmp ..B1.5
.L_2TAG_PACKET_1.0.2:
# -- Extreme exponent n. Here %eax still holds n, %xmm2 the table high qword
# -- and %xmm4 the table low qword.
        movl 36(%rsp), %ecx  # high dword of the original x
        addsd %xmm0, %xmm1
        unpckhpd %xmm0, %xmm0
        addsd %xmm1, %xmm0  # p = y + both polynomial lanes
        cmpl $0, %ecx
        jl .L_2TAG_PACKET_5.0.2  # negative x: result is -1
# -- Positive x: evaluate (T_hi + T_lo) * (1 + p) * 2^n on the x87 unit with
# -- 2^n split as 2^n1 * 2^n2 so neither factor leaves the double range.
        fstcw (%rsp)
        movw (%rsp), %dx
        orw $768, %dx  # set both precision-control bits (bits 8 and 9)
        movw %dx, 4(%rsp)
        fldcw 4(%rsp)
        movl %eax, %edx
        sarl $1, %eax  # n1 = n >> 1
        subl %eax, %edx  # n2 = n - n1
        movdqa emask(%rip), %xmm6
        pandn %xmm2, %xmm6  # table high qword with sign and exponent cleared
        addl $1023, %eax
        movd %eax, %xmm3
        psllq $52, %xmm3  # 2^n1
        orpd %xmm3, %xmm6  # T_hi * 2^n1
        mulsd %xmm3, %xmm4  # T_lo * 2^n1
        movsd %xmm0, 16(%rsp)
        fldl 16(%rsp)  # p
        movsd %xmm6, 24(%rsp)
        fldl 24(%rsp)  # T_hi * 2^n1
        movsd %xmm4, 16(%rsp)
        fldl 16(%rsp)  # T_lo * 2^n1
        addl $1023, %edx
        movd %edx, %xmm4
        psllq $52, %xmm4  # 2^n2
        faddp %st, %st(1)  # T = (T_hi + T_lo) * 2^n1
        fmul %st, %st(1)  # p * T
        faddp %st, %st(1)  # T + p * T
        movsd %xmm4, 24(%rsp)
        fldl 24(%rsp)
        fmulp %st, %st(1)  # scale by 2^n2
        fstpl 16(%rsp)
        movsd 16(%rsp), %xmm0
        fldcw (%rsp)  # restore the caller control word
        pextrw $3, %xmm0, %ecx
        andl $32752, %ecx  # exponent field of the result
        cmpl $32752, %ecx
        jae .L_2TAG_PACKET_6.0.2  # result is not finite
        jmp ..B1.5
.L_2TAG_PACKET_6.0.2:
        movl $41, 8(%rsp)
        jmp .L_2TAG_PACKET_7.0.2
.L_2TAG_PACKET_8.0.2:
# -- x >= 1024 (high dword >= 0x40900000); %eax holds the high dword of x.
        cmpl $2146435072, %eax
        jae .L_2TAG_PACKET_9.0.2  # exponent field all ones: infinity or NaN
        movsd XMAX(%rip), %xmm0
        mulsd %xmm0, %xmm0  # largest finite double squared: overflows
        movl $41, 8(%rsp)
        jmp .L_2TAG_PACKET_7.0.2
.L_2TAG_PACKET_9.0.2:
# -- x is an infinity or a NaN.
        movl 36(%rsp), %eax
        movl 32(%rsp), %edx
        movl %eax, %ecx
        andl $2147483647, %eax
        cmpl $2146435072, %eax
        ja .L_2TAG_PACKET_10.0.2  # nonzero mantissa in the high dword: NaN
        cmpl $0, %edx
        jne .L_2TAG_PACKET_10.0.2  # nonzero mantissa in the low dword: NaN
        cmpl $0, %ecx
        jl .L_2TAG_PACKET_11.0.2  # -infinity
        movq INF(%rip), %xmm0  # +infinity
        jmp ..B1.5
.L_2TAG_PACKET_11.0.2:
        jmp .L_2TAG_PACKET_5.0.2
.L_2TAG_PACKET_10.0.2:
        movsd 32(%rsp), %xmm0
        addsd %xmm0, %xmm0  # NaN + NaN
        jmp ..B1.5
.L_2TAG_PACKET_12.0.2:
# -- Small |x| (top 16 bits below 0x3FB0): evaluate a polynomial in x directly
# -- with the cvl coefficients.
        addl $16304, %eax  # undo the earlier subtraction: eax = top 16 bits of |x|
        cmpl $15504, %eax
        jb .L_2TAG_PACKET_13.0.2  # top16 < 0x3C90: tiny argument
        movapd cvl(%rip), %xmm2
        pshufd $68, %xmm0, %xmm1  # x in both lanes
        movapd 16+cvl(%rip), %xmm3
        movapd 32+cvl(%rip), %xmm4
        movq 48+cvl(%rip), %xmm5
        mulsd %xmm1, %xmm1
        xorpd %xmm6, %xmm6
        movl $16352, %eax
        pinsrw $3, %eax, %xmm6  # xmm6 = 0.5 (top word 0x3FE0)
        mulpd %xmm0, %xmm2
        xorpd %xmm7, %xmm7
        movl $16368, %edx
        pinsrw $3, %edx, %xmm7  # xmm7 = 1.0
        addpd %xmm3, %xmm2
        mulsd %xmm1, %xmm5
        pshufd $228, %xmm1, %xmm3
        mulpd %xmm1, %xmm1
        mulsd %xmm0, %xmm6  # x / 2
        mulpd %xmm0, %xmm2
        addpd %xmm4, %xmm2
        movq %xmm7, %xmm4
        addsd %xmm6, %xmm7  # 1 + x/2
        mulpd %xmm3, %xmm1
        psrlq $27, %xmm7
        psllq $27, %xmm7  # head of 1 + x/2, low 27 bits cleared
        movq HIGHMASK(%rip), %xmm3
        subsd %xmm7, %xmm4
        mulpd %xmm1, %xmm2
        addsd %xmm4, %xmm6
        pshufd $238, %xmm2, %xmm1
        addsd %xmm2, %xmm6
        andpd %xmm0, %xmm3  # head of x, low 26 bits cleared
        movq %xmm0, %xmm4
        addsd %xmm6, %xmm1
        subsd %xmm3, %xmm0  # tail of x
        addsd %xmm5, %xmm1
        mulsd %xmm7, %xmm3
        mulsd %xmm7, %xmm0
        mulsd %xmm1, %xmm4
        addsd %xmm4, %xmm0
        addsd %xmm3, %xmm0
        jmp ..B1.5
.L_2TAG_PACKET_13.0.2:
# -- Tiny |x|: the result is x itself.
        cmpl $16, %eax
        jae .L_2TAG_PACKET_3.0.2  # nonzero exponent field: return x unchanged
        movq %xmm0, %xmm2
        movd %xmm0, %eax  # bits 0..31 of x
        psrlq $31, %xmm2
        movd %xmm2, %ecx  # bits 31..62 of x
        orl %ecx, %eax
        je .L_2TAG_PACKET_3.0.2  # x is +0 or -0
# -- Subnormal x: square the smallest normal double (result unused) and
# -- return x through the spill slot.
        movl $16, %edx
        xorpd %xmm1, %xmm1
        pinsrw $3, %edx, %xmm1
        mulsd %xmm1, %xmm1
        movl $42, 8(%rsp)
        jmp .L_2TAG_PACKET_7.0.2
.L_2TAG_PACKET_0.0.2:
# -- Out-of-range dispatch; %eax = top16(|x|) - 0x3FB0.
        cmpl $0, %eax
        jl .L_2TAG_PACKET_12.0.2  # small |x|
        movl 36(%rsp), %eax  # high dword of x
        cmpl $1083179008, %eax
        jge .L_2TAG_PACKET_8.0.2  # signed compare: x >= 1024
        cmpl $-1048576, %eax
        jae .L_2TAG_PACKET_9.0.2  # unsigned compare: high dword >= 0xFFF00000 (-infinity or NaN)
.L_2TAG_PACKET_5.0.2:
# -- Large negative x: return -1.0 (top word 0xBFF0).
        xorpd %xmm0, %xmm0
        movl $49136, %eax
        pinsrw $3, %eax, %xmm0
        jmp ..B1.5
.L_2TAG_PACKET_7.0.2:
        movq %xmm0, 40(%rsp)
..B1.3:
        movq 40(%rsp), %xmm0
.L_2TAG_PACKET_14.0.2:
..B1.5:
        addq $56, %rsp
..___tag_value_expm1.4:
        ret
..___tag_value_expm1.5:
END(expm1)
# -- End expm1
.section .rodata, "a"
.align 16
.align 16
# -- cv: six 16-byte vectors of doubles used by the table-driven path of expm1.
# -- Each double is stored as two .long values, low dword first.
cv:
.long 1697350398  # +0: 0x40571547652B82FE in both lanes, approximately 64/log(2)
.long 1079448903
.long 1697350398
.long 1079448903
.long 4277796864  # +16: 0x3F862E42FEFA0000 in both lanes, leading bits of log(2)/64
.long 1065758274
.long 4277796864
.long 1065758274
.long 3164486458  # +32: 0x3D1CF79ABC9E3B3A in both lanes, presumably the remainder of log(2)/64 (verify)
.long 1025308570
.long 3164486458
.long 1025308570
.long 1963358694  # +48: polynomial coefficients, approximately 1/120 (low lane)
.long 1065423121
.long 1431655765  #      and 1/6 (high lane)
.long 1069897045
.long 1431655765  # +64: polynomial coefficients, 1/24 (low lane)
.long 1067799893
.long 0  #      and 1/2 (high lane)
.long 1071644672
.long 381774871  # +80: polynomial coefficient, approximately 1/720, in both lanes
.long 1062650220
.long 381774871
.long 1062650220
.type cv,@object
.size cv,96
.align 16
# -- Shifter: 0x4338000000000000 (1.5 * 2^52) in both lanes. expm1 adds and then
# -- subtracts it to round a double to an integer; after the add, the integer
# -- is readable from the low dword of the sum.
Shifter:
.long 0
.long 1127743488
.long 0
.long 1127743488
.type Shifter,@object
.size Shifter,16
.align 16
# -- Tbl_addr: 64 entries of 16 bytes, indexed by j = m & 63 (expm1 scales j by 16).
# -- Each entry is two qwords, each written as two .long values, low dword first:
# --   low qword   a small double that the code multiplies by 2^n
# --   high qword  mantissa bits with a zero exponent field; the code ORs the
# --               exponent of 2^n into it
# -- Presumably the high qword is the leading part of 2^(j/64) and the low qword
# -- its correction term (verify against the table generator).
Tbl_addr:
.long 0  # j = 0
.long 0
.long 0
.long 0
.long 1000070955  # j = 1
.long 1042145304
.long 1040187392
.long 11418
.long 988267849  # j = 2
.long 1039500660
.long 3539992576
.long 22960
.long 36755401  # j = 3
.long 1042114290
.long 402653184
.long 34629
.long 3634769483  # j = 4
.long 1042178627
.long 1820327936
.long 46424
.long 2155991225  # j = 5
.long 1041560680
.long 847249408
.long 58348
.long 2766913307  # j = 6
.long 1039293264
.long 3489660928
.long 70401
.long 3651174602  # j = 7
.long 1040488175
.long 2927624192
.long 82586
.long 3073892131  # j = 8
.long 1042240606
.long 1006632960
.long 94904
.long 1328391742  # j = 9
.long 1042019037
.long 3942645760
.long 107355
.long 2650893825  # j = 10
.long 1041903210
.long 822083584
.long 119943
.long 2397289153  # j = 11
.long 1041802037
.long 2281701376
.long 132667
.long 430997175  # j = 12
.long 1042110606
.long 1845493760
.long 145530
.long 1230936525  # j = 13
.long 1041801015
.long 1702887424
.long 158533
.long 740675935  # j = 14
.long 1040178913
.long 4110417920
.long 171677
.long 3489810261  # j = 15
.long 1041825986
.long 2793406464
.long 184965
.long 2532600530  # j = 16
.long 1040767882
.long 167772160
.long 198398
.long 3542557060  # j = 17
.long 1041827263
.long 2986344448
.long 211976
.long 1401563777  # j = 18
.long 1041061093
.long 922746880
.long 225703
.long 3129406026  # j = 19
.long 1041852413
.long 880803840
.long 239579
.long 900993572  # j = 20
.long 1039283234
.long 1275068416
.long 253606
.long 2115029358  # j = 21
.long 1042140042
.long 562036736
.long 267786
.long 1086643152  # j = 22
.long 1041785419
.long 1610612736
.long 282120
.long 82864366  # j = 23
.long 1041256244
.long 3045064704
.long 296610
.long 2392968152  # j = 24
.long 1040913683
.long 3573547008
.long 311258
.long 2905856183  # j = 25
.long 1040002214
.long 1988100096
.long 326066
.long 3742008261  # j = 26
.long 1040011137
.long 1451229184
.long 341035
.long 863393794  # j = 27
.long 1040880621
.long 914358272
.long 356167
.long 1446136837  # j = 28
.long 1041372426
.long 3707764736
.long 371463
.long 927855201  # j = 29
.long 1040617636
.long 360710144
.long 386927
.long 1492679939  # j = 30
.long 1041050306
.long 2952790016
.long 402558
.long 608827001  # j = 31
.long 1041582217
.long 2181038080
.long 418360
.long 606260204  # j = 32
.long 1042271987
.long 1711276032
.long 434334
.long 3163044019  # j = 33
.long 1041843851
.long 1006632960
.long 450482
.long 4148747325  # j = 34
.long 1041962972
.long 3900702720
.long 466805
.long 802924201  # j = 35
.long 1041275378
.long 1442840576
.long 483307
.long 3052749833  # j = 36
.long 1041940577
.long 1937768448
.long 499988
.long 2216116399  # j = 37
.long 1041486744
.long 914358272
.long 516851
.long 2729697836  # j = 38
.long 1041445764
.long 2566914048
.long 533897
.long 540608356  # j = 39
.long 1041310907
.long 2600468480
.long 551129
.long 2916344493  # j = 40
.long 1040535661
.long 1107296256
.long 568549
.long 731391814  # j = 41
.long 1039497014
.long 2566914048
.long 586158
.long 1024722704  # j = 42
.long 1041461625
.long 2961178624
.long 603959
.long 3806831748  # j = 43
.long 1041732499
.long 2675965952
.long 621954
.long 238953304  # j = 44
.long 1040316488
.long 2189426688
.long 640145
.long 749123235  # j = 45
.long 1041725785
.long 2063597568
.long 658534
.long 1168187977  # j = 46
.long 1041175214
.long 2986344448
.long 677123
.long 3506096399  # j = 47
.long 1042186095
.long 1426063360
.long 695915
.long 1470221620  # j = 48
.long 1041675499
.long 2566914048
.long 714911
.long 3182425146  # j = 49
.long 1041483134
.long 3087007744
.long 734114
.long 3131698208  # j = 50
.long 1042208657
.long 4068474880
.long 753526
.long 2300504125  # j = 51
.long 1041428596
.long 2415919104
.long 773150
.long 2290297931  # j = 52
.long 1037388400
.long 3716153344
.long 792987
.long 3532148223  # j = 53
.long 1041626194
.long 771751936
.long 813041
.long 1161884404  # j = 54
.long 1042015258
.long 3699376128
.long 833312
.long 876383176  # j = 55
.long 1037968878
.long 1241513984
.long 853805
.long 3379986796  # j = 56
.long 1042213153
.long 3699376128
.long 874520
.long 1545797737  # j = 57
.long 1041681569
.long 58720256
.long 895462
.long 2925146801  # j = 58
.long 1042212567
.long 855638016
.long 916631
.long 1316627971  # j = 59
.long 1038516204
.long 3883925504
.long 938030
.long 3267869137  # j = 60
.long 1040337004
.long 2726297600
.long 959663
.long 3720868999  # j = 61
.long 1041782409
.long 3992977408
.long 981531
.long 433316142  # j = 62
.long 1041994064
.long 1526726656
.long 1003638
.long 781232103  # j = 63
.long 1040093400
.long 2172649472
.long 1025985
.type Tbl_addr,@object
.size Tbl_addr,1024
.align 16
# -- mmask: 0x00000000FFFFFFC0 in both lanes. expm1 ANDs it with the shifted
# -- value so that only bits 6..31 of the low dword (the integer m with its
# -- table index j cleared) remain.
mmask:
.long 4294967232
.long 0
.long 4294967232
.long 0
.type mmask,@object
.size mmask,16
.align 16
# -- bias: 65472 (1023 * 64) in both lanes. Added to the masked m so that the
# -- following shift left by 46 places n + 1023 in the double exponent field.
bias:
.long 65472
.long 0
.long 65472
.long 0
.type bias,@object
.size bias,16
.align 16
# -- emask: 0xFFF0000000000000 in both lanes (the sign and exponent bits of a
# -- double). expm1 uses it with pandn to keep only the mantissa bits of a
# -- table value.
emask:
.long 0
.long 4293918720
.long 0
.long 4293918720
.type emask,@object
.size emask,16
.align 16
# -- cvl: polynomial coefficients for the small-argument path of expm1, stored
# -- as four 16-byte vectors (low dword of each double first). The values are
# -- close to reciprocal factorials; the exact bit patterns differ slightly
# -- from 1/k! for some entries.
cvl:
.long 2773927732  # +0 low lane: approximately 1/362880
.long 1053236707
.long 381774871  # +0 high lane: approximately 1/720
.long 1062650220
.long 379653899  # +16 low lane: approximately 1/40320
.long 1056571845
.long 286331153  # +16 high lane: approximately 1/120
.long 1065423121
.long 436314138  # +32 low lane: approximately 1/5040
.long 1059717536
.long 1431655765  # +32 high lane: 1/24
.long 1067799893
.long 1431655765  # +48 low lane: 1/6
.long 1069897045
.long 0  # +48 high lane: 1/2
.long 1071644672
.type cvl,@object
.size cvl,64
.align 8
# -- XMAX: 0x7FEFFFFFFFFFFFFF, the largest finite double. expm1 squares it to
# -- produce the overflowing result for large positive arguments.
XMAX:
.long 4294967295
.long 2146435071
.type XMAX,@object
.size XMAX,8
.align 8
# -- INF: 0x7FF0000000000000, positive infinity; returned by expm1 for an
# -- argument of positive infinity.
INF:
.long 0
.long 2146435072
.type INF,@object
.size INF,8
.align 8
# -- HIGHMASK: 0xFFFFFFFFFC000000. ANDed with x in the small-argument path to
# -- clear the low 26 bits and obtain the leading part of x.
HIGHMASK:
.long 4227858432
.long 4294967295
.type HIGHMASK,@object
.size HIGHMASK,8
.data
.section .note.GNU-stack, ""  # empty section; GNU toolchain convention for an object that does not need an executable stack
// -- Begin DWARF2 SEGMENT .eh_frame
# -- Hand-encoded unwind information for expm1: one CIE followed by one FDE.
# -- In the x86-64 DWARF register numbering r7 is %rsp and r16 the return address.
.section .eh_frame,"a",@progbits
.eh_frame_seg:
.align 1
.4byte 0x00000014  # CIE length: 20 bytes follow
.8byte 0x00527a0100000000  # CIE id 0, version 1, augmentation string zR
.8byte 0x08070c1b01107801  # code alignment 1, data alignment -8, return-address column 16, augmentation length 1, pointer encoding 0x1b, then DW_CFA_def_cfa r7 offset 8
.4byte 0x00000190  # DW_CFA_offset r16 at CFA-8, then two bytes of padding
.4byte 0x0000001c  # FDE length: 28 bytes follow
.4byte 0x0000001c  # distance back to the CIE
.4byte ..___tag_value_expm1.1-.  # PC-relative start of expm1
.4byte ..___tag_value_expm1.5-..___tag_value_expm1.1  # byte length of expm1
.2byte 0x0400  # augmentation length 0, then DW_CFA_advance_loc4
.4byte ..___tag_value_expm1.3-..___tag_value_expm1.1  # advance past the frame allocation
.2byte 0x400e  # DW_CFA_def_cfa_offset 64 (56-byte frame plus return address)
.byte 0x04  # DW_CFA_advance_loc4
.4byte ..___tag_value_expm1.4-..___tag_value_expm1.3  # advance past the frame release
.2byte 0x080e  # DW_CFA_def_cfa_offset 8
.byte 0x00  # DW_CFA_nop padding
# End