/* * Copyright (C) 2008 The Android Open Source Project * All rights reserved. * Copyright (c) 2013-2014, NVIDIA Corporation. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/

#define CACHE_LINE_SIZE         (64)
#define PREFETCH_DISTANCE       (CACHE_LINE_SIZE*6)

/*
 * MEMCPY_BASE - NEON copy of r2 bytes from [r1] to [r0].
 *
 * In:      r0 = destination, r1 = source, r2 = byte count.
 * Stack:   every exit is "pop {r0, pc}" and the CFI below records r0 at
 *          [sp] and lr at [sp, #4], so the code that enters this body is
 *          assumed to have pushed {r0, lr} already (not visible here).
 * Clobber: r1, r2, r3, ip, q0, q1, flags. r0 advances during the copy and
 *          is reloaded from the stack on exit.
 *
 * Flag idiom used throughout: "lsls ip, rX, #n" leaves bit (31-n) of rX in
 * N and bit (32-n) of rX in C, so one shift tests two size bits at once.
 * NEON loads/stores and pld do not touch the flags.
 */
ENTRY_PRIVATE(MEMCPY_BASE)
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* Nothing to copy for a zero length or when source == destination. */
        cmp         r2, #0
        beq         .L_memcpy_done
        cmp         r0, r1
        beq         .L_memcpy_done

        /* Start prefetching one cache line ahead of the source. */
        pld         [r1, #CACHE_LINE_SIZE*1]

        /* Fewer than 32 bytes: go directly to the smallest tail. */
        cmp         r2, #32
        blo         .L_memcpy_lt_32bytes
        /* Fewer than 128 bytes: copy without aligning the destination. */
        cmp         r2, #128
        blo         .L_memcpy_lt_128bytes

        /*
         * Large copy. r3 = (-r0) & 63 is the number of bytes needed to
         * bring the destination up to a 64-byte boundary; zero means it is
         * already aligned. The plds are interleaved with the arithmetic.
         */
        pld         [r1, #CACHE_LINE_SIZE*2]
        rsb         r3, r0, #0
        ands        r3, r3, #0x3F
        pld         [r1, #CACHE_LINE_SIZE*3]
        beq         .L_memcpy_dispatch
        sub         r2, r2, r3

        /* N = r3 bit 0 -> 1 byte, C = r3 bit 1 -> 2 bytes. */
        lsls        ip, r3, #31
        itt         mi
        ldrbmi      ip, [r1], #1
        strbmi      ip, [r0], #1
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2

        /* N = r3 bit 2 -> 4 bytes, C = r3 bit 3 -> 8 bytes. */
        lsls        ip, r3, #29
        itt         mi
        ldrmi       ip, [r1], #4
        strmi       ip, [r0], #4
        bcc         .L_memcpy_head_16
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!

.L_memcpy_head_16:
        /* N = r3 bit 4 -> 16 bytes, C = r3 bit 5 -> 32 bytes. */
        lsls        ip, r3, #27
        bpl         .L_memcpy_head_32
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0, :128]!

.L_memcpy_head_32:
        bcc         .L_memcpy_dispatch
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!

.L_memcpy_dispatch:
        /*
         * Bias the count by -128 so the loops can detect the final block
         * from the flags alone. The head code above may have left fewer
         * than 128 bytes, in which case the bias is undone at
         * .L_memcpy_lt_128presub.
         */
        subs        r2, r2, #128
        blo         .L_memcpy_lt_128presub

        /*
         * Per the original author, Denver does better when both source
         * and destination are aligned, so a 16-byte aligned source gets
         * its own loop even though the code is virtually identical.
         */
        tst         r1, #0xF
        bne         .L_memcpy_neon_unalign_src_pld

        /*
         * Per the original author, DRAM-sized copies should be throttled
         * slightly to reach full bandwidth: when the biased count exceeds
         * 32768 the loop without source alignment hints is used instead.
         */
        cmp         r2, #32768
        bhi         .L_memcpy_neon_unalign_src_pld

        .p2align    4
.L_memcpy_aligned_src_loop:
        /*
         * 128 bytes per iteration, source hinted :128, destination :256.
         * The flags from this subs are consumed by the bhs at the bottom.
         */
        subs        r2, r2, #128

        /* first 64 bytes */
        pld         [r1, #PREFETCH_DISTANCE]
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!

        /* second 64 bytes */
        pld         [r1, #PREFETCH_DISTANCE]
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!

        bhs         .L_memcpy_aligned_src_loop

        /* Remove the bias; a non-zero result is the 0..127 byte remainder. */
        adds        r2, r2, #128
        bne         .L_memcpy_lt_128bytes_align
        pop         {r0, pc}

        .p2align    4
.L_memcpy_neon_unalign_src_pld:
        /*
         * Same 128-byte loop, but the source loads carry no alignment
         * hint. The flags from this subs are consumed by the bhs below.
         */
        subs        r2, r2, #128

        /* first 64 bytes */
        pld         [r1, #PREFETCH_DISTANCE]
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!

        /* second 64 bytes */
        pld         [r1, #PREFETCH_DISTANCE]
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!

        bhs         .L_memcpy_neon_unalign_src_pld

        /* Remove the bias; a non-zero result is the 0..127 byte remainder. */
        adds        r2, r2, #128
        bne         .L_memcpy_lt_128bytes_align
        pop         {r0, pc}

.L_memcpy_lt_128presub:
        /* Undo the -128 bias applied at .L_memcpy_dispatch. */
        add         r2, r2, #128

.L_memcpy_lt_128bytes_align:
        /*
         * Tail of a large copy (r2 < 128). The stores keep their alignment
         * hints, relying on the head code having aligned r0 to 64 bytes.
         * C = r2 bit 6 -> 64 bytes, N = r2 bit 5 -> 32 bytes.
         */
        lsls        ip, r2, #26
        bcc         .L_memcpy_tail_a32
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!

.L_memcpy_tail_a32:
        bpl         .L_memcpy_tail_a16
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!

.L_memcpy_tail_a16:
        /* C = r2 bit 4 -> 16 bytes, N = r2 bit 3 -> 8 bytes. */
        lsls        ip, r2, #28
        bcc         .L_memcpy_tail_a8
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0, :128]!

.L_memcpy_tail_a8:
        bpl         .L_memcpy_tail_a4
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!

.L_memcpy_tail_a4:
        /* r2 bit 2 -> 4 bytes. */
        tst         r2, #4
        itt         ne
        ldrne       ip, [r1], #4
        strne       ip, [r0], #4

        /* C = r2 bit 1 -> 2 bytes, N = r2 bit 0 -> 1 byte. */
        lsls        ip, r2, #31
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* Last byte: no pointer writeback needed. */
        itt         mi
        ldrbmi      ip, [r1]
        strbmi      ip, [r0]

        pop         {r0, pc}

.L_memcpy_lt_128bytes:
        /*
         * Copy of 32..127 bytes with an unaligned destination: same bit
         * tests as above but the stores carry no alignment hint.
         * C = r2 bit 6 -> 64 bytes, N = r2 bit 5 -> 32 bytes.
         */
        lsls        ip, r2, #26
        bcc         .L_memcpy_small_32
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!

.L_memcpy_small_32:
        bpl         .L_memcpy_lt_32bytes
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!

.L_memcpy_lt_32bytes:
        /* C = r2 bit 4 -> 16 bytes, N = r2 bit 3 -> 8 bytes. */
        lsls        ip, r2, #28
        bcc         .L_memcpy_small_8
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0]!

.L_memcpy_small_8:
        bpl         .L_memcpy_small_4
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!

.L_memcpy_small_4:
        /* r2 bit 2 -> 4 bytes. */
        tst         r2, #4
        itt         ne
        ldrne       ip, [r1], #4
        strne       ip, [r0], #4

        /* C = r2 bit 1 -> 2 bytes, N = r2 bit 0 -> 1 byte. */
        lsls        ip, r2, #31
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* Last byte: no pointer writeback needed. */
        itt         mi
        ldrbmi      ip, [r1]
        strbmi      ip, [r0]

.L_memcpy_done:
        /* Reload r0 from the stack and return through the stacked lr. */
        pop         {r0, pc}
END(MEMCPY_BASE)