Merge "Remove arm assembler not referenced from any makefile."

Elliott Hughes 2014-12-15 18:46:16 +00:00 committed by Gerrit Code Review
commit ad01c98319
3 changed files with 0 additions and 1617 deletions


@@ -1,686 +0,0 @@
/*
* Copyright (C) 2008 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <machine/cpu-features.h>
#include <private/bionic_asm.h>
#if defined(__ARM_NEON__) && !defined(ARCH_ARM_USE_NON_NEON_MEMCPY)
.text
.fpu neon
#ifdef HAVE_32_BYTE_CACHE_LINE
/* a prefetch distance of 2 cache-lines */
#define CACHE_LINE_SIZE 32
#else
/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE 64
#endif
ENTRY(memcpy)
.save {r0, lr}
/* start preloading as early as possible */
pld [r1, #(CACHE_LINE_SIZE * 0)]
stmfd sp!, {r0, lr}
pld [r1, #(CACHE_LINE_SIZE * 1)]
/* If Neon supports unaligned access then remove the align code,
* unless a size limit has been specified.
*/
#ifndef NEON_UNALIGNED_ACCESS
/* do we have at least 16-bytes to copy (needed for alignment below) */
cmp r2, #16
blo 5f
/* check if buffers are aligned. If so, run arm-only version */
eor r3, r0, r1
ands r3, r3, #0x3
beq 11f
/* align destination to cache-line for the write-buffer */
rsb r3, r0, #0
ands r3, r3, #0xF
beq 2f
/* copy up to 15-bytes (count in r3) */
sub r2, r2, r3
movs ip, r3, lsl #31
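/* note: lsl #31 moves bit 0 of r3 into N and bit 1 into C, so the MI pair
 * below copies one byte and the CS pairs copy two more */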
ldrmib lr, [r1], #1
strmib lr, [r0], #1
ldrcsb ip, [r1], #1
ldrcsb lr, [r1], #1
strcsb ip, [r0], #1
strcsb lr, [r0], #1
movs ip, r3, lsl #29
bge 1f
// copies 4 bytes, destination 32-bits aligned
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1: bcc 2f
// copies 8 bytes, destination 64-bits aligned
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0, :64]!
2:
/* preload immediately the next cache line, which we may need */
pld [r1, #(CACHE_LINE_SIZE * 0)]
pld [r1, #(CACHE_LINE_SIZE * 1)]
#ifdef HAVE_32_BYTE_CACHE_LINE
/* make sure we have at least 32 bytes to copy */
subs r2, r2, #32
blo 4f
/* preload all the cache lines we need.
* NOTE: the number of pld below depends on PREFETCH_DISTANCE,
* ideally we would increase the distance in the main loop to
* avoid the goofy code below. In practice this doesn't seem to make
* a big difference.
*/
pld [r1, #(PREFETCH_DISTANCE)]
1: /* The main loop copies 32 bytes at a time */
vld1.8 {d0 - d3}, [r1]!
pld [r1, #(PREFETCH_DISTANCE)]
subs r2, r2, #32
vst1.8 {d0 - d3}, [r0, :128]!
bhs 1b
#else
/* make sure we have at least 64 bytes to copy */
subs r2, r2, #64
blo 2f
/* preload all the cache lines we need. */
pld [r1, #(CACHE_LINE_SIZE * 2)]
pld [r1, #(CACHE_LINE_SIZE * 3)]
1: /* The main loop copies 64 bytes at a time */
vld1.8 {d0 - d3}, [r1]!
vld1.8 {d4 - d7}, [r1]!
#ifdef HAVE_32_BYTE_CACHE_LINE
pld [r1, #(CACHE_LINE_SIZE * 2)]
pld [r1, #(CACHE_LINE_SIZE * 3)]
#else
pld [r1, #(CACHE_LINE_SIZE * 3)]
#endif
subs r2, r2, #64
vst1.8 {d0 - d3}, [r0, :128]!
vst1.8 {d4 - d7}, [r0, :128]!
bhs 1b
2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
add r2, r2, #64
subs r2, r2, #32
blo 4f
3: /* 32 bytes at a time. These cache lines were already preloaded */
vld1.8 {d0 - d3}, [r1]!
subs r2, r2, #32
vst1.8 {d0 - d3}, [r0, :128]!
bhs 3b
#endif
4: /* less than 32 left */
add r2, r2, #32
tst r2, #0x10
beq 5f
// copies 16 bytes, 128-bits aligned
vld1.8 {d0, d1}, [r1]!
vst1.8 {d0, d1}, [r0, :128]!
5: /* copy up to 15-bytes (count in r2) */
movs ip, r2, lsl #29
bcc 1f
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0]!
1: bge 2f
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2: movs ip, r2, lsl #31
ldrmib r3, [r1], #1
ldrcsb ip, [r1], #1
ldrcsb lr, [r1], #1
strmib r3, [r0], #1
strcsb ip, [r0], #1
strcsb lr, [r0], #1
ldmfd sp!, {r0, lr}
bx lr
#else /* NEON_UNALIGNED_ACCESS */
// Check that we have at least 16 bytes to copy, needed for the alignment code.
cmp r2, #16
blo 5f
#ifdef NEON_MEMCPY_ALIGNMENT_DIVIDER
/* Check the upper size limit for Neon unaligned memory access in memcpy */
#if NEON_MEMCPY_ALIGNMENT_DIVIDER >= 16
cmp r2, #NEON_MEMCPY_ALIGNMENT_DIVIDER
blo 3f
#endif
/* check if buffers are aligned. If so, run arm-only version */
eor r3, r0, r1
ands r3, r3, #0x3
beq 11f
/* align destination to 16 bytes for the write-buffer */
rsb r3, r0, #0
ands r3, r3, #0xF
beq 3f
/* copy up to 15-bytes (count in r3) */
sub r2, r2, r3
movs ip, r3, lsl #31
ldrmib lr, [r1], #1
strmib lr, [r0], #1
ldrcsb ip, [r1], #1
ldrcsb lr, [r1], #1
strcsb ip, [r0], #1
strcsb lr, [r0], #1
movs ip, r3, lsl #29
bge 1f
// copies 4 bytes, destination 32-bits aligned
vld1.32 {d0[0]}, [r1]!
vst1.32 {d0[0]}, [r0, :32]!
1: bcc 2f
// copies 8 bytes, destination 64-bits aligned
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0, :64]!
2:
/* preload immediately the next cache line, which we may need */
pld [r1, #(CACHE_LINE_SIZE * 0)]
pld [r1, #(CACHE_LINE_SIZE * 1)]
3:
#endif
/* make sure we have at least 64 bytes to copy */
subs r2, r2, #64
blo 2f
/* preload all the cache lines we need */
pld [r1, #(CACHE_LINE_SIZE * 2)]
pld [r1, #(CACHE_LINE_SIZE * 3)]
1: /* The main loop copies 64 bytes at a time */
vld1.8 {d0 - d3}, [r1]!
vld1.8 {d4 - d7}, [r1]!
#ifdef HAVE_32_BYTE_CACHE_LINE
pld [r1, #(CACHE_LINE_SIZE * 2)]
pld [r1, #(CACHE_LINE_SIZE * 3)]
#else
pld [r1, #(CACHE_LINE_SIZE * 3)]
#endif
subs r2, r2, #64
vst1.8 {d0 - d3}, [r0]!
vst1.8 {d4 - d7}, [r0]!
bhs 1b
2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
add r2, r2, #64
subs r2, r2, #32
blo 4f
3: /* 32 bytes at a time. These cache lines were already preloaded */
vld1.8 {d0 - d3}, [r1]!
subs r2, r2, #32
vst1.8 {d0 - d3}, [r0]!
bhs 3b
4: /* less than 32 left */
add r2, r2, #32
tst r2, #0x10
beq 5f
// copies 16 bytes, 128-bits aligned
vld1.8 {d0, d1}, [r1]!
vst1.8 {d0, d1}, [r0]!
5: /* copy up to 15-bytes (count in r2) */
movs ip, r2, lsl #29
bcc 1f
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0]!
1: bge 2f
vld1.32 {d0[0]}, [r1]!
vst1.32 {d0[0]}, [r0]!
2: movs ip, r2, lsl #31
ldrmib r3, [r1], #1
ldrcsb ip, [r1], #1
ldrcsb lr, [r1], #1
strmib r3, [r0], #1
strcsb ip, [r0], #1
strcsb lr, [r0], #1
ldmfd sp!, {r0, lr}
bx lr
#endif /* NEON_UNALIGNED_ACCESS */
11:
/* Simple arm-only copy loop to handle aligned copy operations */
stmfd sp!, {r4, r5, r6, r7, r8}
pld [r1, #(CACHE_LINE_SIZE * 2)]
/* Check alignment */
rsb r3, r1, #0
ands r3, #3
beq 2f
/* align source to 32 bits. We need to insert 2 instructions between
* a ldr[b|h] and str[b|h] because byte and half-word instructions
* stall 2 cycles.
*/
movs r12, r3, lsl #31
sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */
ldrmib r3, [r1], #1
ldrcsb r4, [r1], #1
ldrcsb r5, [r1], #1
strmib r3, [r0], #1
strcsb r4, [r0], #1
strcsb r5, [r0], #1
2:
subs r2, #32
blt 5f
pld [r1, #(CACHE_LINE_SIZE * 3)]
3: /* Main copy loop, copying 32 bytes at a time */
pld [r1, #(CACHE_LINE_SIZE * 4)]
ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
subs r2, r2, #32
stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
bge 3b
5: /* Handle any remaining bytes */
adds r2, #32
beq 6f
movs r12, r2, lsl #28
ldmcsia r1!, {r3, r4, r5, r6} /* 16 bytes */
ldmmiia r1!, {r7, r8} /* 8 bytes */
stmcsia r0!, {r3, r4, r5, r6}
stmmiia r0!, {r7, r8}
movs r12, r2, lsl #30
ldrcs r3, [r1], #4 /* 4 bytes */
ldrmih r4, [r1], #2 /* 2 bytes */
strcs r3, [r0], #4
strmih r4, [r0], #2
tst r2, #0x1
ldrneb r3, [r1] /* last byte */
strneb r3, [r0]
6:
ldmfd sp!, {r4, r5, r6, r7, r8}
ldmfd sp!, {r0, pc}
END(memcpy)
#else /* __ARM_ARCH__ < 7 */
/*
* Optimized memcpy() for ARM.
*
* note that memcpy() always returns the destination pointer,
* so we have to preserve R0.
*/
ENTRY(memcpy)
/* The stack must always be 64-bits aligned to be compliant with the
* ARM ABI. Since we have to save R0, we might as well save R4
* which we can use for better pipelining of the reads below
*/
.save {r0, r4, lr}
stmfd sp!, {r0, r4, lr}
/* Making room for r5-r11 which will be spilled later */
.pad #28
sub sp, sp, #28
// preload the destination because we'll align it to a cache line
// with small writes. Also start the source "pump".
pld [r0, #0]
pld [r1, #0]
pld [r1, #32]
/* it simplifies things to take care of len<4 early */
cmp r2, #4
blo copy_last_3_and_return
/* compute the offset to align the source
* offset = (4-(src&3))&3 = -src & 3
*/
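/* e.g. src & 3 == 1 gives -src & 3 == 3: three bytes bring us to the next
 * word boundary */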
rsb r3, r1, #0
ands r3, r3, #3
beq src_aligned
/* align source to 32 bits. We need to insert 2 instructions between
* a ldr[b|h] and str[b|h] because byte and half-word instructions
* stall 2 cycles.
*/
movs r12, r3, lsl #31
sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */
ldrmib r3, [r1], #1
ldrcsb r4, [r1], #1
ldrcsb r12,[r1], #1
strmib r3, [r0], #1
strcsb r4, [r0], #1
strcsb r12,[r0], #1
src_aligned:
/* see if src and dst are aligned together (congruent) */
eor r12, r0, r1
tst r12, #3
bne non_congruent
/* Use post-increment mode for stm to spill r5-r11 to reserved stack
* frame. Don't update sp.
*/
stmea sp, {r5-r11}
/* align the destination to a cache-line */
rsb r3, r0, #0
ands r3, r3, #0x1C
beq congruent_aligned32
cmp r3, r2
andhi r3, r2, #0x1C
/* conditionally copies 0 to 7 words (length in r3) */
movs r12, r3, lsl #28
ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */
ldmmiia r1!, {r8, r9} /* 8 bytes */
stmcsia r0!, {r4, r5, r6, r7}
stmmiia r0!, {r8, r9}
tst r3, #0x4
ldrne r10,[r1], #4 /* 4 bytes */
strne r10,[r0], #4
sub r2, r2, r3
congruent_aligned32:
/*
* here source is aligned to 32 bytes.
*/
cached_aligned32:
subs r2, r2, #32
blo less_than_32_left
/*
* We preload a cache-line up to 64 bytes ahead. On the 926, this will
* stall only until the requested word is fetched, but the linefill
* continues in the background.
* While the linefill is going, we write our previous cache-line
* into the write-buffer (which should have some free space).
* When the linefill is done, the writebuffer will
* start dumping its content into memory
*
* While all this is going, we then load a full cache line into
* 8 registers, this cache line should be in the cache by now
* (or partly in the cache).
*
* This code should work well regardless of the source/dest alignment.
*
*/
// Align the preload register to a cache-line because the cpu does
// "critical word first" (the first word requested is loaded first).
bic r12, r1, #0x1F
add r12, r12, #64
1: ldmia r1!, { r4-r11 }
pld [r12, #64]
subs r2, r2, #32
// NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
// for ARM9 preload will not be safely guarded by the preceding subs.
// When it is safely guarded the only possibility to have SIGSEGV here
// is because the caller overstates the length.
ldrhi r3, [r12], #32 /* cheap ARM9 preload */
stmia r0!, { r4-r11 }
bhs 1b
add r2, r2, #32
less_than_32_left:
/*
* less than 32 bytes left at this point (length in r2)
*/
/* skip all this if there is nothing to do, which should
* be a common case (if not executed the code below takes
* about 16 cycles)
*/
tst r2, #0x1F
beq 1f
/* conditionally copies 0 to 31 bytes */
movs r12, r2, lsl #28
ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */
ldmmiia r1!, {r8, r9} /* 8 bytes */
stmcsia r0!, {r4, r5, r6, r7}
stmmiia r0!, {r8, r9}
movs r12, r2, lsl #30
ldrcs r3, [r1], #4 /* 4 bytes */
ldrmih r4, [r1], #2 /* 2 bytes */
strcs r3, [r0], #4
strmih r4, [r0], #2
tst r2, #0x1
ldrneb r3, [r1] /* last byte */
strneb r3, [r0]
/* we're done! restore everything and return */
1: ldmfd sp!, {r5-r11}
ldmfd sp!, {r0, r4, lr}
bx lr
/********************************************************************/
non_congruent:
/*
* here source is aligned to 4 bytes
* but destination is not.
*
* in the code below r2 is the number of bytes read
* (the number of bytes written is always smaller, because we have
* partial words in the shift queue)
*/
cmp r2, #4
blo copy_last_3_and_return
/* Use post-increment mode for stm to spill r5-r11 to reserved stack
* frame. Don't update sp.
*/
stmea sp, {r5-r11}
/* compute shifts needed to align src to dest */
rsb r5, r0, #0
and r5, r5, #3 /* r5 = # bytes in partial words */
mov r12, r5, lsl #3 /* r12 = right */
rsb lr, r12, #32 /* lr = left */
/* read the first word */
ldr r3, [r1], #4
sub r2, r2, #4
/* write a partial word (0 to 3 bytes), such that destination
* becomes aligned to 32 bits (r5 = nb of words to copy for alignment)
*/
movs r5, r5, lsl #31
strmib r3, [r0], #1
movmi r3, r3, lsr #8
strcsb r3, [r0], #1
movcs r3, r3, lsr #8
strcsb r3, [r0], #1
movcs r3, r3, lsr #8
cmp r2, #4
blo partial_word_tail
/* Align destination to 32 bytes (cache line boundary) */
1: tst r0, #0x1c
beq 2f
ldr r5, [r1], #4
sub r2, r2, #4
orr r4, r3, r5, lsl lr
mov r3, r5, lsr r12
str r4, [r0], #4
cmp r2, #4
bhs 1b
blo partial_word_tail
/* copy 32 bytes at a time */
2: subs r2, r2, #32
blo less_than_thirtytwo
/* Use immediate mode for the shifts, because there is an extra cycle
* for register shifts, which could account for up to 50% of
* performance hit.
*/
cmp r12, #24
beq loop24
cmp r12, #8
beq loop8
loop16:
ldr r12, [r1], #4
1: mov r4, r12
ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
pld [r1, #64]
subs r2, r2, #32
ldrhs r12, [r1], #4
orr r3, r3, r4, lsl #16
mov r4, r4, lsr #16
orr r4, r4, r5, lsl #16
mov r5, r5, lsr #16
orr r5, r5, r6, lsl #16
mov r6, r6, lsr #16
orr r6, r6, r7, lsl #16
mov r7, r7, lsr #16
orr r7, r7, r8, lsl #16
mov r8, r8, lsr #16
orr r8, r8, r9, lsl #16
mov r9, r9, lsr #16
orr r9, r9, r10, lsl #16
mov r10, r10, lsr #16
orr r10, r10, r11, lsl #16
stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
mov r3, r11, lsr #16
bhs 1b
b less_than_thirtytwo
loop8:
ldr r12, [r1], #4
1: mov r4, r12
ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
pld [r1, #64]
subs r2, r2, #32
ldrhs r12, [r1], #4
orr r3, r3, r4, lsl #24
mov r4, r4, lsr #8
orr r4, r4, r5, lsl #24
mov r5, r5, lsr #8
orr r5, r5, r6, lsl #24
mov r6, r6, lsr #8
orr r6, r6, r7, lsl #24
mov r7, r7, lsr #8
orr r7, r7, r8, lsl #24
mov r8, r8, lsr #8
orr r8, r8, r9, lsl #24
mov r9, r9, lsr #8
orr r9, r9, r10, lsl #24
mov r10, r10, lsr #8
orr r10, r10, r11, lsl #24
stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
mov r3, r11, lsr #8
bhs 1b
b less_than_thirtytwo
loop24:
ldr r12, [r1], #4
1: mov r4, r12
ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
pld [r1, #64]
subs r2, r2, #32
ldrhs r12, [r1], #4
orr r3, r3, r4, lsl #8
mov r4, r4, lsr #24
orr r4, r4, r5, lsl #8
mov r5, r5, lsr #24
orr r5, r5, r6, lsl #8
mov r6, r6, lsr #24
orr r6, r6, r7, lsl #8
mov r7, r7, lsr #24
orr r7, r7, r8, lsl #8
mov r8, r8, lsr #24
orr r8, r8, r9, lsl #8
mov r9, r9, lsr #24
orr r9, r9, r10, lsl #8
mov r10, r10, lsr #24
orr r10, r10, r11, lsl #8
stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
mov r3, r11, lsr #24
bhs 1b
less_than_thirtytwo:
/* copy the last 0 to 31 bytes of the source */
rsb r12, lr, #32 /* we corrupted r12, recompute it */
add r2, r2, #32
cmp r2, #4
blo partial_word_tail
1: ldr r5, [r1], #4
sub r2, r2, #4
orr r4, r3, r5, lsl lr
mov r3, r5, lsr r12
str r4, [r0], #4
cmp r2, #4
bhs 1b
partial_word_tail:
/* we have a partial word in the input buffer */
movs r5, lr, lsl #(31-3)
strmib r3, [r0], #1
movmi r3, r3, lsr #8
strcsb r3, [r0], #1
movcs r3, r3, lsr #8
strcsb r3, [r0], #1
/* Refill spilled registers from the stack. Don't update sp. */
ldmfd sp, {r5-r11}
copy_last_3_and_return:
movs r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */
ldrmib r2, [r1], #1
ldrcsb r3, [r1], #1
ldrcsb r12,[r1]
strmib r2, [r0], #1
strcsb r3, [r0], #1
strcsb r12,[r0]
/* we're done! restore sp and spilled registers and return */
add sp, sp, #28
ldmfd sp!, {r0, r4, lr}
bx lr
END(memcpy)
#endif /* __ARM_ARCH__ < 7 */
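A minimal little-endian C sketch of the shift-and-merge step used by the non_congruent path of this removed memcpy (the `orr r4, r3, r5, lsl lr` / `mov r3, r5, lsr r12` loop bodies); the function name and parameters are illustrative, not part of the file:

    #include <stddef.h>
    #include <stdint.h>

    /* Assemble each aligned destination word from the unconsumed bytes of
     * one aligned source word plus the leading bytes of the next.  `ws`
     * points at the aligned word holding the first byte of the stream,
     * `off` (1..3) is how many bytes of that word are already consumed,
     * and `nwords` aligned words are written to `dst`. */
    static void merge_copy_words(uint32_t *dst, const uint32_t *ws,
                                 unsigned off, size_t nwords)
    {
        unsigned right = off * 8;            /* like r12 in the assembly */
        unsigned left  = 32 - right;         /* like lr */
        uint32_t carry = *ws++ >> right;     /* leftover bytes of word 0 */

        while (nwords--) {
            uint32_t next = *ws++;
            *dst++ = carry | (next << left); /* one aligned store per word */
            carry  = next >> right;          /* low bytes of the next store */
        }
    }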


@@ -1,614 +0,0 @@
/* Copyright (c) 2013, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Linaro Limited nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
This memcpy routine is optimised for Cortex-A15 cores and takes advantage
of VFP or NEON when built with the appropriate flags.
Assumptions:
ARMv6 (ARMv7-a if using Neon)
ARM state
Unaligned accesses
LDRD/STRD support unaligned word accesses
*/
#include <machine/cpu-features.h>
#include <private/bionic_asm.h>
.syntax unified
/* This implementation requires ARM state. */
.arm
#ifdef __ARM_NEON__
.fpu neon
.arch armv7-a
# define FRAME_SIZE 4
# define USE_VFP
# define USE_NEON
#elif !defined (__SOFTFP__)
.arch armv6
.fpu vfpv2
# define FRAME_SIZE 32
# define USE_VFP
#else
.arch armv6
# define FRAME_SIZE 32
#endif
/* Old versions of GAS incorrectly implement the NEON align semantics. */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif
#define PC_OFFSET 8 /* PC pipeline compensation. */
#define INSN_SIZE 4
/* Call parameters. */
#define dstin r0
#define src r1
#define count r2
/* Locals. */
#define tmp1 r3
#define dst ip
#define tmp2 r10
#ifndef USE_NEON
/* For bulk copies using GP registers. */
#define A_l r2 /* Call-clobbered. */
#define A_h r3 /* Call-clobbered. */
#define B_l r4
#define B_h r5
#define C_l r6
#define C_h r7
#define D_l r8
#define D_h r9
#endif
/* Number of lines ahead to pre-fetch data. If you change this the code
below will need adjustment to compensate. */
#define prefetch_lines 5
#ifdef USE_VFP
.macro cpy_line_vfp vreg, base
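/* Each expansion copies one 64-byte line with VFP doubleword stores and
 * reloads; the reload of \vreg from prefetch_lines*64 - 32 bytes ahead
 * doubles as the register-based prefetch described at .Lcpy_body_long. */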
vstr \vreg, [dst, #\base]
vldr \vreg, [src, #\base]
vstr d0, [dst, #\base + 8]
vldr d0, [src, #\base + 8]
vstr d1, [dst, #\base + 16]
vldr d1, [src, #\base + 16]
vstr d2, [dst, #\base + 24]
vldr d2, [src, #\base + 24]
vstr \vreg, [dst, #\base + 32]
vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
vstr d0, [dst, #\base + 40]
vldr d0, [src, #\base + 40]
vstr d1, [dst, #\base + 48]
vldr d1, [src, #\base + 48]
vstr d2, [dst, #\base + 56]
vldr d2, [src, #\base + 56]
.endm
.macro cpy_tail_vfp vreg, base
vstr \vreg, [dst, #\base]
vldr \vreg, [src, #\base]
vstr d0, [dst, #\base + 8]
vldr d0, [src, #\base + 8]
vstr d1, [dst, #\base + 16]
vldr d1, [src, #\base + 16]
vstr d2, [dst, #\base + 24]
vldr d2, [src, #\base + 24]
vstr \vreg, [dst, #\base + 32]
vstr d0, [dst, #\base + 40]
vldr d0, [src, #\base + 40]
vstr d1, [dst, #\base + 48]
vldr d1, [src, #\base + 48]
vstr d2, [dst, #\base + 56]
vldr d2, [src, #\base + 56]
.endm
#endif
.p2align 6
ENTRY(memcpy)
mov dst, dstin /* Preserve dstin, we need to return it. */
cmp count, #64
bge .Lcpy_not_short
/* Deal with small copies quickly by dropping straight into the
exit block. */
.Ltail63unaligned:
#ifdef USE_NEON
and tmp1, count, #0x38
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
add pc, pc, tmp1
vld1.8 {d0}, [src]! /* 14 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 12 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 10 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 8 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 6 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 4 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 2 words to go. */
vst1.8 {d0}, [dst]!
tst count, #4
ldrne tmp1, [src], #4
strne tmp1, [dst], #4
#else
/* Copy up to 15 full words of data. May not be aligned. */
/* Cannot use VFP for unaligned data. */
and tmp1, count, #0x3c
add dst, dst, tmp1
add src, src, tmp1
rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
/* Jump directly into the sequence below at the correct offset. */
add pc, pc, tmp1, lsl #1
ldr tmp1, [src, #-60] /* 15 words to go. */
str tmp1, [dst, #-60]
ldr tmp1, [src, #-56] /* 14 words to go. */
str tmp1, [dst, #-56]
ldr tmp1, [src, #-52]
str tmp1, [dst, #-52]
ldr tmp1, [src, #-48] /* 12 words to go. */
str tmp1, [dst, #-48]
ldr tmp1, [src, #-44]
str tmp1, [dst, #-44]
ldr tmp1, [src, #-40] /* 10 words to go. */
str tmp1, [dst, #-40]
ldr tmp1, [src, #-36]
str tmp1, [dst, #-36]
ldr tmp1, [src, #-32] /* 8 words to go. */
str tmp1, [dst, #-32]
ldr tmp1, [src, #-28]
str tmp1, [dst, #-28]
ldr tmp1, [src, #-24] /* 6 words to go. */
str tmp1, [dst, #-24]
ldr tmp1, [src, #-20]
str tmp1, [dst, #-20]
ldr tmp1, [src, #-16] /* 4 words to go. */
str tmp1, [dst, #-16]
ldr tmp1, [src, #-12]
str tmp1, [dst, #-12]
ldr tmp1, [src, #-8] /* 2 words to go. */
str tmp1, [dst, #-8]
ldr tmp1, [src, #-4]
str tmp1, [dst, #-4]
#endif
lsls count, count, #31
ldrhcs tmp1, [src], #2
ldrbne src, [src] /* Src is dead, use as a scratch. */
strhcs tmp1, [dst], #2
strbne src, [dst]
bx lr
.Lcpy_not_short:
/* At least 64 bytes to copy, but don't know the alignment yet. */
str tmp2, [sp, #-FRAME_SIZE]!
and tmp2, src, #7
and tmp1, dst, #7
cmp tmp1, tmp2
bne .Lcpy_notaligned
#ifdef USE_VFP
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
that the FP pipeline is much better at streaming loads and
stores. This is outside the critical loop. */
vmov.f32 s0, s0
#endif
/* SRC and DST have the same mutual 32-bit alignment, but we may
still need to pre-copy some bytes to get to natural alignment.
We bring DST into full 64-bit alignment. */
lsls tmp2, dst, #29
beq 1f
rsbs tmp2, tmp2, #0
sub count, count, tmp2, lsr #29
ldrmi tmp1, [src], #4
strmi tmp1, [dst], #4
lsls tmp2, tmp2, #2
ldrhcs tmp1, [src], #2
ldrbne tmp2, [src], #1
strhcs tmp1, [dst], #2
strbne tmp2, [dst], #1
1:
subs tmp2, count, #64 /* Use tmp2 for count. */
blt .Ltail63aligned
cmp tmp2, #512
bge .Lcpy_body_long
.Lcpy_body_medium: /* Count in tmp2. */
#ifdef USE_VFP
1:
vldr d0, [src, #0]
subs tmp2, tmp2, #64
vldr d1, [src, #8]
vstr d0, [dst, #0]
vldr d0, [src, #16]
vstr d1, [dst, #8]
vldr d1, [src, #24]
vstr d0, [dst, #16]
vldr d0, [src, #32]
vstr d1, [dst, #24]
vldr d1, [src, #40]
vstr d0, [dst, #32]
vldr d0, [src, #48]
vstr d1, [dst, #40]
vldr d1, [src, #56]
vstr d0, [dst, #48]
add src, src, #64
vstr d1, [dst, #56]
add dst, dst, #64
bge 1b
tst tmp2, #0x3f
beq .Ldone
.Ltail63aligned: /* Count in tmp2. */
and tmp1, tmp2, #0x38
add dst, dst, tmp1
add src, src, tmp1
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
add pc, pc, tmp1
vldr d0, [src, #-56] /* 14 words to go. */
vstr d0, [dst, #-56]
vldr d0, [src, #-48] /* 12 words to go. */
vstr d0, [dst, #-48]
vldr d0, [src, #-40] /* 10 words to go. */
vstr d0, [dst, #-40]
vldr d0, [src, #-32] /* 8 words to go. */
vstr d0, [dst, #-32]
vldr d0, [src, #-24] /* 6 words to go. */
vstr d0, [dst, #-24]
vldr d0, [src, #-16] /* 4 words to go. */
vstr d0, [dst, #-16]
vldr d0, [src, #-8] /* 2 words to go. */
vstr d0, [dst, #-8]
#else
sub src, src, #8
sub dst, dst, #8
1:
ldrd A_l, A_h, [src, #8]
strd A_l, A_h, [dst, #8]
ldrd A_l, A_h, [src, #16]
strd A_l, A_h, [dst, #16]
ldrd A_l, A_h, [src, #24]
strd A_l, A_h, [dst, #24]
ldrd A_l, A_h, [src, #32]
strd A_l, A_h, [dst, #32]
ldrd A_l, A_h, [src, #40]
strd A_l, A_h, [dst, #40]
ldrd A_l, A_h, [src, #48]
strd A_l, A_h, [dst, #48]
ldrd A_l, A_h, [src, #56]
strd A_l, A_h, [dst, #56]
ldrd A_l, A_h, [src, #64]!
strd A_l, A_h, [dst, #64]!
subs tmp2, tmp2, #64
bge 1b
tst tmp2, #0x3f
bne 1f
ldr tmp2,[sp], #FRAME_SIZE
bx lr
1:
add src, src, #8
add dst, dst, #8
.Ltail63aligned: /* Count in tmp2. */
/* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
we know that the src and dest are 32-bit aligned so we can use
LDRD/STRD to improve efficiency. */
/* TMP2 is now negative, but we don't care about that. The bottom
six bits still tell us how many bytes are left to copy. */
and tmp1, tmp2, #0x38
add dst, dst, tmp1
add src, src, tmp1
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
add pc, pc, tmp1
ldrd A_l, A_h, [src, #-56] /* 14 words to go. */
strd A_l, A_h, [dst, #-56]
ldrd A_l, A_h, [src, #-48] /* 12 words to go. */
strd A_l, A_h, [dst, #-48]
ldrd A_l, A_h, [src, #-40] /* 10 words to go. */
strd A_l, A_h, [dst, #-40]
ldrd A_l, A_h, [src, #-32] /* 8 words to go. */
strd A_l, A_h, [dst, #-32]
ldrd A_l, A_h, [src, #-24] /* 6 words to go. */
strd A_l, A_h, [dst, #-24]
ldrd A_l, A_h, [src, #-16] /* 4 words to go. */
strd A_l, A_h, [dst, #-16]
ldrd A_l, A_h, [src, #-8] /* 2 words to go. */
strd A_l, A_h, [dst, #-8]
#endif
tst tmp2, #4
ldrne tmp1, [src], #4
strne tmp1, [dst], #4
lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
ldrhcs tmp1, [src], #2
ldrbne tmp2, [src]
strhcs tmp1, [dst], #2
strbne tmp2, [dst]
.Ldone:
ldr tmp2, [sp], #FRAME_SIZE
bx lr
.Lcpy_body_long: /* Count in tmp2. */
/* Long copy. We know that there's at least (prefetch_lines * 64)
bytes to go. */
#ifdef USE_VFP
/* Don't use PLD. Instead, read some data in advance of the current
copy position into a register. This should act like a PLD
operation but we won't have to repeat the transfer. */
vldr d3, [src, #0]
vldr d4, [src, #64]
vldr d5, [src, #128]
vldr d6, [src, #192]
vldr d7, [src, #256]
vldr d0, [src, #8]
vldr d1, [src, #16]
vldr d2, [src, #24]
add src, src, #32
subs tmp2, tmp2, #prefetch_lines * 64 * 2
blt 2f
1:
cpy_line_vfp d3, 0
cpy_line_vfp d4, 64
cpy_line_vfp d5, 128
add dst, dst, #3 * 64
add src, src, #3 * 64
cpy_line_vfp d6, 0
cpy_line_vfp d7, 64
add dst, dst, #2 * 64
add src, src, #2 * 64
subs tmp2, tmp2, #prefetch_lines * 64
bge 1b
2:
cpy_tail_vfp d3, 0
cpy_tail_vfp d4, 64
cpy_tail_vfp d5, 128
add src, src, #3 * 64
add dst, dst, #3 * 64
cpy_tail_vfp d6, 0
vstr d7, [dst, #64]
vldr d7, [src, #64]
vstr d0, [dst, #64 + 8]
vldr d0, [src, #64 + 8]
vstr d1, [dst, #64 + 16]
vldr d1, [src, #64 + 16]
vstr d2, [dst, #64 + 24]
vldr d2, [src, #64 + 24]
vstr d7, [dst, #64 + 32]
add src, src, #96
vstr d0, [dst, #64 + 40]
vstr d1, [dst, #64 + 48]
vstr d2, [dst, #64 + 56]
add dst, dst, #128
add tmp2, tmp2, #prefetch_lines * 64
b .Lcpy_body_medium
#else
/* Long copy. Use an SMS style loop to maximize the I/O
bandwidth of the core. We don't have enough spare registers
to synthesise prefetching, so use PLD operations. */
/* Pre-bias src and dst. */
sub src, src, #8
sub dst, dst, #8
pld [src, #8]
pld [src, #72]
subs tmp2, tmp2, #64
pld [src, #136]
ldrd A_l, A_h, [src, #8]
strd B_l, B_h, [sp, #8]
ldrd B_l, B_h, [src, #16]
strd C_l, C_h, [sp, #16]
ldrd C_l, C_h, [src, #24]
strd D_l, D_h, [sp, #24]
pld [src, #200]
ldrd D_l, D_h, [src, #32]!
b 1f
.p2align 6
2:
pld [src, #232]
strd A_l, A_h, [dst, #40]
ldrd A_l, A_h, [src, #40]
strd B_l, B_h, [dst, #48]
ldrd B_l, B_h, [src, #48]
strd C_l, C_h, [dst, #56]
ldrd C_l, C_h, [src, #56]
strd D_l, D_h, [dst, #64]!
ldrd D_l, D_h, [src, #64]!
subs tmp2, tmp2, #64
1:
strd A_l, A_h, [dst, #8]
ldrd A_l, A_h, [src, #8]
strd B_l, B_h, [dst, #16]
ldrd B_l, B_h, [src, #16]
strd C_l, C_h, [dst, #24]
ldrd C_l, C_h, [src, #24]
strd D_l, D_h, [dst, #32]
ldrd D_l, D_h, [src, #32]
bcs 2b
/* Save the remaining bytes and restore the callee-saved regs. */
strd A_l, A_h, [dst, #40]
add src, src, #40
strd B_l, B_h, [dst, #48]
ldrd B_l, B_h, [sp, #8]
strd C_l, C_h, [dst, #56]
ldrd C_l, C_h, [sp, #16]
strd D_l, D_h, [dst, #64]
ldrd D_l, D_h, [sp, #24]
add dst, dst, #72
tst tmp2, #0x3f
bne .Ltail63aligned
ldr tmp2, [sp], #FRAME_SIZE
bx lr
#endif
.Lcpy_notaligned:
pld [src]
pld [src, #64]
/* There's at least 64 bytes to copy, but there is no mutual
alignment. */
/* Bring DST to 64-bit alignment. */
lsls tmp2, dst, #29
pld [src, #(2 * 64)]
beq 1f
rsbs tmp2, tmp2, #0
sub count, count, tmp2, lsr #29
ldrmi tmp1, [src], #4
strmi tmp1, [dst], #4
lsls tmp2, tmp2, #2
ldrbne tmp1, [src], #1
ldrhcs tmp2, [src], #2
strbne tmp1, [dst], #1
strhcs tmp2, [dst], #2
1:
pld [src, #(3 * 64)]
subs count, count, #64
ldrmi tmp2, [sp], #FRAME_SIZE
bmi .Ltail63unaligned
pld [src, #(4 * 64)]
#ifdef USE_NEON
vld1.8 {d0-d3}, [src]!
vld1.8 {d4-d7}, [src]!
subs count, count, #64
bmi 2f
1:
pld [src, #(4 * 64)]
vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
vld1.8 {d0-d3}, [src]!
vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
vld1.8 {d4-d7}, [src]!
subs count, count, #64
bpl 1b
2:
vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
ands count, count, #0x3f
#else
/* Use an SMS style loop to maximize the I/O bandwidth. */
sub src, src, #4
sub dst, dst, #8
subs tmp2, count, #64 /* Use tmp2 for count. */
ldr A_l, [src, #4]
ldr A_h, [src, #8]
strd B_l, B_h, [sp, #8]
ldr B_l, [src, #12]
ldr B_h, [src, #16]
strd C_l, C_h, [sp, #16]
ldr C_l, [src, #20]
ldr C_h, [src, #24]
strd D_l, D_h, [sp, #24]
ldr D_l, [src, #28]
ldr D_h, [src, #32]!
b 1f
.p2align 6
2:
pld [src, #(5 * 64) - (32 - 4)]
strd A_l, A_h, [dst, #40]
ldr A_l, [src, #36]
ldr A_h, [src, #40]
strd B_l, B_h, [dst, #48]
ldr B_l, [src, #44]
ldr B_h, [src, #48]
strd C_l, C_h, [dst, #56]
ldr C_l, [src, #52]
ldr C_h, [src, #56]
strd D_l, D_h, [dst, #64]!
ldr D_l, [src, #60]
ldr D_h, [src, #64]!
subs tmp2, tmp2, #64
1:
strd A_l, A_h, [dst, #8]
ldr A_l, [src, #4]
ldr A_h, [src, #8]
strd B_l, B_h, [dst, #16]
ldr B_l, [src, #12]
ldr B_h, [src, #16]
strd C_l, C_h, [dst, #24]
ldr C_l, [src, #20]
ldr C_h, [src, #24]
strd D_l, D_h, [dst, #32]
ldr D_l, [src, #28]
ldr D_h, [src, #32]
bcs 2b
/* Save the remaining bytes and restore the callee-saved regs. */
strd A_l, A_h, [dst, #40]
add src, src, #36
strd B_l, B_h, [dst, #48]
ldrd B_l, B_h, [sp, #8]
strd C_l, C_h, [dst, #56]
ldrd C_l, C_h, [sp, #16]
strd D_l, D_h, [dst, #64]
ldrd D_l, D_h, [sp, #24]
add dst, dst, #72
ands count, tmp2, #0x3f
#endif
ldr tmp2, [sp], #FRAME_SIZE
bne .Ltail63unaligned
bx lr
END(memcpy)
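The tail blocks of this removed memcpy (.Ltail63aligned, .Ltail63unaligned) dispatch into an unrolled sequence with `add pc, pc, tmp1` instead of looping. A rough C sketch of that idea for the doubleword case, with switch fall-through standing in for the computed jump; the name, signature and alignment assumptions are illustrative only:

    #include <stddef.h>
    #include <stdint.h>

    /* Copy the 0..7 whole doublewords of a sub-64-byte tail by biasing the
     * pointers and entering an unrolled sequence at the right point; the
     * leftover 0..7 bytes are handled elsewhere, as in the assembly. */
    static void tail63_aligned(uint64_t *dst, const uint64_t *src, size_t count)
    {
        size_t dwords = (count & 0x38) >> 3;   /* 0..7 whole doublewords */
        dst += dwords;                         /* bias, as the assembly does */
        src += dwords;

        switch (dwords) {                      /* deliberate fall-through */
        case 7: dst[-7] = src[-7];             /* FALLTHROUGH */
        case 6: dst[-6] = src[-6];             /* FALLTHROUGH */
        case 5: dst[-5] = src[-5];             /* FALLTHROUGH */
        case 4: dst[-4] = src[-4];             /* FALLTHROUGH */
        case 3: dst[-3] = src[-3];             /* FALLTHROUGH */
        case 2: dst[-2] = src[-2];             /* FALLTHROUGH */
        case 1: dst[-1] = src[-1];             /* FALLTHROUGH */
        default: break;
        }
    }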


@@ -1,317 +0,0 @@
/*
* Copyright (c) 2011 The Android Open Source Project
* Copyright (c) 2008 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <machine/cpu-features.h>
#include <private/bionic_asm.h>
.text
#ifdef __ARMEB__
#define SHFT2LSB lsl
#define SHFT2LSBEQ lsleq
#define SHFT2MSB lsr
#define SHFT2MSBEQ lsreq
#define MSB 0x000000ff
#define LSB 0xff000000
#else
#define SHFT2LSB lsr
#define SHFT2LSBEQ lsreq
#define SHFT2MSB lsl
#define SHFT2MSBEQ lsleq
#define MSB 0xff000000
#define LSB 0x000000ff
#endif
#define magic1(REG) REG
#define magic2(REG) REG, lsl #7
ENTRY(strcmp)
pld [r0, #0]
pld [r1, #0]
eor r2, r0, r1
tst r2, #3
/* Strings not at same byte offset from a word boundary. */
bne .Lstrcmp_unaligned
ands r2, r0, #3
bic r0, r0, #3
bic r1, r1, #3
ldr ip, [r0], #4
it eq
ldreq r3, [r1], #4
beq 1f
/* Although s1 and s2 have identical initial alignment, they are
* not currently word aligned. Rather than comparing bytes,
* make sure that any bytes fetched from before the addressed
* bytes are forced to 0xff. Then they will always compare
* equal.
*/
eor r2, r2, #3
lsl r2, r2, #3
mvn r3, #MSB
SHFT2LSB r2, r3, r2
ldr r3, [r1], #4
orr ip, ip, r2
orr r3, r3, r2
1:
/* Load the 'magic' constant 0x01010101. */
str r4, [sp, #-4]!
mov r4, #1
orr r4, r4, r4, lsl #8
orr r4, r4, r4, lsl #16
.p2align 2
4:
pld [r0, #8]
pld [r1, #8]
sub r2, ip, magic1(r4)
cmp ip, r3
itttt eq
/* check for any zero bytes in first word */
biceq r2, r2, ip
tsteq r2, magic2(r4)
ldreq ip, [r0], #4
ldreq r3, [r1], #4
beq 4b
2:
/* There's a zero or a different byte in the word */
SHFT2MSB r0, ip, #24
SHFT2LSB ip, ip, #8
cmp r0, #1
it cs
cmpcs r0, r3, SHFT2MSB #24
it eq
SHFT2LSBEQ r3, r3, #8
beq 2b
/* On a big-endian machine, r0 contains the desired byte in bits
* 0-7; on a little-endian machine they are in bits 24-31. In
* both cases the other bits in r0 are all zero. For r3 the
* interesting byte is at the other end of the word, but the
* other bits are not necessarily zero. We need a signed result
* representing the difference in the unsigned bytes, so for the
* little-endian case we can't just shift the interesting bits up.
*/
#ifdef __ARMEB__
sub r0, r0, r3, lsr #24
#else
and r3, r3, #255
/* No RSB instruction in Thumb2 */
#ifdef __thumb2__
lsr r0, r0, #24
sub r0, r0, r3
#else
rsb r0, r3, r0, lsr #24
#endif
#endif
ldr r4, [sp], #4
bx lr
.Lstrcmp_unaligned:
wp1 .req r0
wp2 .req r1
b1 .req r2
w1 .req r4
w2 .req r5
t1 .req ip
@ r3 is scratch
/* First of all, compare bytes until wp1(sp1) is word-aligned. */
1:
tst wp1, #3
beq 2f
ldrb r2, [wp1], #1
ldrb r3, [wp2], #1
cmp r2, #1
it cs
cmpcs r2, r3
beq 1b
sub r0, r2, r3
bx lr
2:
str r5, [sp, #-4]!
str r4, [sp, #-4]!
mov b1, #1
orr b1, b1, b1, lsl #8
orr b1, b1, b1, lsl #16
and t1, wp2, #3
bic wp2, wp2, #3
ldr w1, [wp1], #4
ldr w2, [wp2], #4
cmp t1, #2
beq 2f
bhi 3f
/* Critical inner Loop: Block with 3 bytes initial overlap */
.p2align 2
1:
bic t1, w1, #MSB
cmp t1, w2, SHFT2LSB #8
sub r3, w1, b1
bic r3, r3, w1
bne 4f
ands r3, r3, b1, lsl #7
it eq
ldreq w2, [wp2], #4
bne 5f
eor t1, t1, w1
cmp t1, w2, SHFT2MSB #24
bne 6f
ldr w1, [wp1], #4
b 1b
4:
SHFT2LSB w2, w2, #8
b 8f
5:
#ifdef __ARMEB__
/* The syndrome value may contain false ones if the string ends
* with the bytes 0x01 0x00
*/
tst w1, #0xff000000
itt ne
tstne w1, #0x00ff0000
tstne w1, #0x0000ff00
beq 7f
#else
bics r3, r3, #0xff000000
bne 7f
#endif
ldrb w2, [wp2]
SHFT2LSB t1, w1, #24
#ifdef __ARMEB__
lsl w2, w2, #24
#endif
b 8f
6:
SHFT2LSB t1, w1, #24
and w2, w2, #LSB
b 8f
/* Critical inner Loop: Block with 2 bytes initial overlap */
.p2align 2
2:
SHFT2MSB t1, w1, #16
sub r3, w1, b1
SHFT2LSB t1, t1, #16
bic r3, r3, w1
cmp t1, w2, SHFT2LSB #16
bne 4f
ands r3, r3, b1, lsl #7
it eq
ldreq w2, [wp2], #4
bne 5f
eor t1, t1, w1
cmp t1, w2, SHFT2MSB #16
bne 6f
ldr w1, [wp1], #4
b 2b
5:
#ifdef __ARMEB__
/* The syndrome value may contain false ones if the string ends
* with the bytes 0x01 0x00
*/
tst w1, #0xff000000
it ne
tstne w1, #0x00ff0000
beq 7f
#else
lsls r3, r3, #16
bne 7f
#endif
ldrh w2, [wp2]
SHFT2LSB t1, w1, #16
#ifdef __ARMEB__
lsl w2, w2, #16
#endif
b 8f
6:
SHFT2MSB w2, w2, #16
SHFT2LSB t1, w1, #16
4:
SHFT2LSB w2, w2, #16
b 8f
/* Critical inner Loop: Block with 1 byte initial overlap */
.p2align 2
3:
and t1, w1, #LSB
cmp t1, w2, SHFT2LSB #24
sub r3, w1, b1
bic r3, r3, w1
bne 4f
ands r3, r3, b1, lsl #7
it eq
ldreq w2, [wp2], #4
bne 5f
eor t1, t1, w1
cmp t1, w2, SHFT2MSB #8
bne 6f
ldr w1, [wp1], #4
b 3b
4:
SHFT2LSB w2, w2, #24
b 8f
5:
/* The syndrome value may contain false ones if the string ends
* with the bytes 0x01 0x00
*/
tst w1, #LSB
beq 7f
ldr w2, [wp2], #4
6:
SHFT2LSB t1, w1, #8
bic w2, w2, #MSB
b 8f
7:
mov r0, #0
ldr r4, [sp], #4
ldr r5, [sp], #4
bx lr
8:
and r2, t1, #LSB
and r0, w2, #LSB
cmp r0, #1
it cs
cmpcs r0, r2
itt eq
SHFT2LSBEQ t1, t1, #8
SHFT2LSBEQ w2, w2, #8
beq 8b
sub r0, r2, r0
ldr r4, [sp], #4
ldr r5, [sp], #4
bx lr
END(strcmp)
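The aligned loop of this removed strcmp checks four characters at a time using its 'magic' constant (magic1/magic2 above). A small C sketch of just that zero-byte test; the function name is illustrative only:

    #include <stdbool.h>
    #include <stdint.h>

    /* With ones = 0x01010101, (w - ones) & ~w & (ones << 7) is nonzero
     * exactly when some byte of w is zero; ones << 7 is the 0x80808080
     * mask the assembly forms with "magic2(r4)". */
    static bool word_has_zero_byte(uint32_t w)
    {
        const uint32_t ones = 0x01010101u;
        return ((w - ones) & ~w & (ones << 7)) != 0;
    }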