/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 * Copyright (c) 2013-2014, NVIDIA Corporation.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/* Cache line size in bytes; the unit for the preload (pld) offsets below. */
#define CACHE_LINE_SIZE         (64)
/* Byte offset ahead of the source pointer at which the 128-byte copy loops
 * issue their pld: six cache lines. */
#define PREFETCH_DISTANCE       (CACHE_LINE_SIZE*6)

/*
 * MEMCPY_BASE - NEON copy of r2 bytes from [r1] to [r0].
 *
 * Registers, as used by the code below:
 *   r0      destination pointer, post-incremented by every store
 *   r1      source pointer, post-incremented by every load
 *   r2      bytes still to copy
 *   r3      bytes needed to bring r0 up to a 64-byte boundary (large copies)
 *   ip      scratch: flag-setting shift results and sub-8-byte data
 *   q0, q1  NEON staging registers for the 8/16/32-byte transfers
 *
 * Every exit is "pop {r0, pc}" and the CFI directives describe r0 and lr
 * saved in an 8-byte frame, so this body relies on {r0, lr} already being
 * on the stack at entry.
 * NOTE(review): that push is not visible in this file; confirm it in the
 * wrapper that defines MEMCPY_BASE.
 *
 * Recurring idiom: "movs ip, rN, lsl #k" is executed only for its flags.
 * The shift leaves bit (31 - k) of rN in N and bit (32 - k) of rN in C, so
 * one instruction tests two adjacent size bits ("mi"/"pl" for the lower
 * bit, "cs"/"cc" for the upper one). The loads and stores in between do not
 * modify the flags, which is why both tests can follow a single movs.
 */
ENTRY_PRIVATE(MEMCPY_BASE)
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* nothing to do for a zero length or when source == destination */
        cmp         r2, #0
        beq         .L_memcpy_done
        cmp         r0, r1
        beq         .L_memcpy_done

        /* preload next cache line */
        pld         [r1, #CACHE_LINE_SIZE*1]

        /* Deal with very small blocks (< 32bytes) asap */
        cmp         r2, #32
        blo         .L_memcpy_lt_32bytes
        /* no need to align if len < 128 bytes */
        cmp         r2, #128
        blo         .L_memcpy_lt_128bytes

        /* large copy, align dest to 64 byte boundary */
        pld         [r1, #CACHE_LINE_SIZE*2]
        /* r3 = (-r0) & 63 = bytes needed to reach the next 64-byte boundary */
        rsb         r3, r0, #0
        ands        r3, r3, #0x3F
        pld         [r1, #CACHE_LINE_SIZE*3]
        /* Z is still from the ands above (pld leaves flags alone):
         * r3 == 0 means the destination is already aligned */
        beq         .L_memcpy_dispatch
        /* account for the 1..63 alignment bytes copied below; r2 >= 128 here,
         * so the subtraction cannot underflow */
        sub         r2, r2, r3
        /* copy 1 byte */
        /* N = bit 0 of r3, C = bit 1 of r3 */
        movs        ip, r3, lsl #31
        itt         mi
        ldrbmi      ip, [r1], #1
        strbmi      ip, [r0], #1
        /* copy 2 bytes */
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* copy 4 bytes */
        /* N = bit 2 of r3, C = bit 3 of r3 */
        movs        ip, r3, lsl #29
        itt         mi
        ldrmi       ip, [r1], #4
        strmi       ip, [r0], #4
        /* copy 8 bytes */
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
1:      /* copy 16 bytes */
        /* N = bit 4 of r3, C = bit 5 of r3 */
        movs        ip, r3, lsl #27
        bpl         1f
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0, :128]!
1:      /* copy 32 bytes */
        bcc         .L_memcpy_dispatch
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!

        /* r0 is 64-byte aligned from here on, which is what permits the
         * :256/:128/:64 store hints in the loops and in the aligned tail */
.L_memcpy_dispatch:
        // pre-decrement by 128 to detect nearly-done condition easily, but
        // also need to check if we have less than 128 bytes left at this
        // point due to alignment code above
        subs        r2, r2, #128
        blo         .L_memcpy_lt_128presub

        // Denver does better if both source and dest are aligned so
        // we'll special-case that even though the code is virtually identical
        /* source not 16-byte aligned: use the loop without load hints */
        tst         r1, #0xF
        bne         .L_memcpy_neon_unalign_src_pld

        // DRAM memcpy should be throttled slightly to get full bandwidth
        //
        /* r2 already has 128 subtracted, so this tests (remaining - 128).
         * Above the threshold the hint-free loop below is used even though
         * the source is aligned. */
        cmp         r2, #32768
        bhi         .L_memcpy_neon_unalign_src_pld
        /* on ARM gas the .align argument is a power of two: 16-byte boundary */
        .align      4
        /* Aligned-source loop. Invariant at the top of each iteration:
         * r2 = bytes remaining - 128, and at least 128 bytes remain. */
1:
        /* copy 128 bytes in each loop */
        /* flags from this subs are consumed by the bhs at the loop bottom;
         * nothing in between modifies them */
        subs        r2, r2, #128

        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!

        /* loop while another full 128 bytes remain */
        bhs         1b
        /* undo the pre-decrement: r2 = true remaining count (0..127) */
        adds        r2, r2, #128
        bne         .L_memcpy_lt_128bytes_align
        pop         {r0, pc}

        .align      4
        /* Same loop as above, but the loads carry no alignment hint */
.L_memcpy_neon_unalign_src_pld:
1:
        /* copy 128 bytes in each loop */
        subs        r2, r2, #128

        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!

        /* loop while another full 128 bytes remain */
        bhs         1b
        /* undo the pre-decrement: r2 = true remaining count (0..127) */
        adds        r2, r2, #128
        bne         .L_memcpy_lt_128bytes_align
        pop         {r0, pc}

        /* reached from dispatch when fewer than 128 bytes were left after
         * alignment: undo the pre-decrement before handling the tail */
.L_memcpy_lt_128presub:
        add         r2, r2, #128
        /* Tail for the large-copy path: r2 < 128 and r0 is 64-byte aligned,
         * so the stores keep their alignment hints. One chunk per set bit of
         * r2, largest first. */
.L_memcpy_lt_128bytes_align:
        /* copy 64 bytes */
        /* C = bit 6 of r2 (64), N = bit 5 of r2 (32) */
        movs        ip, r2, lsl #26
        bcc         1f
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
1:      /* copy 32 bytes */
        bpl         1f
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
1:      /* copy 16 bytes */
        /* C = bit 4 of r2 (16), N = bit 3 of r2 (8) */
        movs        ip, r2, lsl #28
        bcc         1f
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0, :128]!
1:      /* copy 8 bytes */
        bpl         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
1:      /* copy 4 bytes */
        tst         r2, #4
        itt         ne
        ldrne       ip, [r1], #4
        strne       ip, [r0], #4
        /* copy 2 bytes */
        /* C = bit 1 of r2 (2), N = bit 0 of r2 (1) */
        movs        ip, r2, lsl #31
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* copy 1 byte */
        /* last transfer, so the pointers are not advanced */
        itt         mi
        ldrbmi      ip, [r1]
        strbmi      ip, [r0]

        pop         {r0, pc}

        /* Tail for copies that skipped the alignment step (32 <= r2 < 128 on
         * entry here): same bit-by-bit scheme as above, but r0 has no known
         * alignment, so the stores carry no alignment hints. */
.L_memcpy_lt_128bytes:
        /* copy 64 bytes */
        /* C = bit 6 of r2 (64), N = bit 5 of r2 (32) */
        movs        ip, r2, lsl #26
        bcc         1f
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
1:      /* copy 32 bytes */
        bpl	    .L_memcpy_lt_32bytes
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
        /* also the direct entry point for copies shorter than 32 bytes */
.L_memcpy_lt_32bytes:
        /* copy 16 bytes */
        /* C = bit 4 of r2 (16), N = bit 3 of r2 (8) */
        movs        ip, r2, lsl #28
        bcc         1f
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0]!
1:      /* copy 8 bytes */
        bpl         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      /* copy 4 bytes */
        tst         r2, #4
        itt         ne
        ldrne       ip, [r1], #4
        strne       ip, [r0], #4
        /* copy 2 bytes */
        /* C = bit 1 of r2 (2), N = bit 0 of r2 (1) */
        movs        ip, r2, lsl #31
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* copy 1 byte */
        /* last transfer, so the pointers are not advanced */
        itt         mi
        ldrbmi      ip, [r1]
        strbmi      ip, [r0]

        /* reload the r0 value saved on the stack and return */
.L_memcpy_done:
        pop         {r0, pc}
END(MEMCPY_BASE)