Drop the NEON memcpy variant from libc/arch-arm/bionic/memcpy.S.

Hunk 1 deletes the block guarded by
`#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)` (the memcpy built on
vld1.8/vst1.8 with progressive pld prefetch) together with the `#else`
line, so the code that followed `#else` is no longer conditional.
Hunk 2 deletes the closing `#endif`.

NOTE(review): spacing inside each line was re-created while restoring
this patch's line breaks, and the `#include` context line carries no
operand here -- check both against the target file before applying.

diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index 4ea2c6d01..fcb58cd19 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -28,111 +28,6 @@
 
 #include
 
-#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
-
-        .text
-        .fpu        neon
-
-        .global     memcpy
-        .type       memcpy, %function
-        .align      4
-
-#define NEON_MAX_PREFETCH_DISTANCE 320
-
-memcpy:
-        .fnstart
-        mov         ip, r0
-        cmp         r2, #16
-        blt         4f          @ Have less than 16 bytes to copy
-
-        @ First ensure 16 byte alignment for the destination buffer
-        tst         r0, #0xF
-        beq         2f
-        tst         r0, #1
-        ldrneb      r3, [r1], #1
-        strneb      r3, [ip], #1
-        subne       r2, r2, #1
-        tst         ip, #2
-        ldrneb      r3, [r1], #1
-        strneb      r3, [ip], #1
-        ldrneb      r3, [r1], #1
-        strneb      r3, [ip], #1
-        subne       r2, r2, #2
-
-        tst         ip, #4
-        beq         1f
-        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
-        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [ip, :32]!
-        sub         r2, r2, #4
-1:
-        tst         ip, #8
-        beq         2f
-        vld1.8      {d0}, [r1]!
-        vst1.8      {d0}, [ip, :64]!
-        sub         r2, r2, #8
-2:
-        subs        r2, r2, #32
-        blt         3f
-        mov         r3, #32
-
-        @ Main copy loop, 32 bytes are processed per iteration.
-        @ ARM instructions are used for doing fine-grained prefetch,
-        @ increasing prefetch distance progressively up to
-        @ NEON_MAX_PREFETCH_DISTANCE at runtime
-1:
-        vld1.8      {d0-d3}, [r1]!
-        cmp         r3, #(NEON_MAX_PREFETCH_DISTANCE - 32)
-        pld         [r1, r3]
-        addle       r3, r3, #32
-        vst1.8      {d0-d3}, [ip, :128]!
-        sub         r2, r2, #32
-        cmp         r2, r3
-        bge         1b
-        cmp         r2, #0
-        blt         3f
-1:      @ Copy the remaining part of the buffer (already prefetched)
-        vld1.8      {d0-d3}, [r1]!
-        subs        r2, r2, #32
-        vst1.8      {d0-d3}, [ip, :128]!
-        bge         1b
-3:      @ Copy up to 31 remaining bytes
-        tst         r2, #16
-        beq         4f
-        vld1.8      {d0, d1}, [r1]!
-        vst1.8      {d0, d1}, [ip, :128]!
-4:
-        @ Use ARM instructions exclusively for the final trailing part
-        @ not fully fitting into full 16 byte aligned block in order
-        @ to avoid "ARM store after NEON store" hazard. Also NEON
-        @ pipeline will be (mostly) flushed by the time when the
-        @ control returns to the caller, making the use of NEON mostly
-        @ transparent (and avoiding hazards in the caller code)
-
-        movs        r3, r2, lsl #29
-        bcc         1f
-        .rept       8
-        ldrcsb      r3, [r1], #1
-        strcsb      r3, [ip], #1
-        .endr
-1:
-        bpl         1f
-        .rept       4
-        ldrmib      r3, [r1], #1
-        strmib      r3, [ip], #1
-        .endr
-1:
-        movs        r2, r2, lsl #31
-        ldrcsb      r3, [r1], #1
-        strcsb      r3, [ip], #1
-        ldrcsb      r3, [r1], #1
-        strcsb      r3, [ip], #1
-        ldrmib      r3, [r1], #1
-        strmib      r3, [ip], #1
-        bx          lr
-        .fnend
-
-#else /* __ARM_ARCH__ < 7 */
-
         .text
 
         .global memcpy
@@ -490,5 +385,3 @@ copy_last_3_and_return:
         bx          lr
         .fnend
 
-#endif
-