/* platform_system_core/libpixelflinger/t32cb16blend.S */

/* libs/pixelflinger/t32cb16blend.S
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License"); 
** you may not use this file except in compliance with the License. 
** You may obtain a copy of the License at 
**
**     http://www.apache.org/licenses/LICENSE-2.0 
**
** Unless required by applicable law or agreed to in writing, software 
** distributed under the License is distributed on an "AS IS" BASIS, 
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
** See the License for the specific language governing permissions and 
** limitations under the License.
*/


	.text
	.syntax unified
	.balign 4
	
	.global scanline_t32cb16blend_arm


/*
 * .macro pixel  DREG, SRC, FB, ODD
 *
 * Blends one pre-multiplied 32-bit source pixel over one RGB565 destination
 * pixel.  For each channel the result is
 *
 *      min(channel_max, src + ((dst * f) >> 8))
 *
 * with f = 0x100 - (sA + (sA >> 7)), sA being the top byte of SRC, and src
 * being the top 5 (red, blue) or 6 (green) bits of the source channel.
 *
 * DREG  32-bit register holding *two* original destination RGB565 pixels:
 *       the even one in bits 15..0, the odd one in bits 31..16.
 *
 * SRC   32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
 *
 * FB    register that receives the blended pixel value(s).
 *
 * ODD   zero blends the lower 16-bit pixel of DREG, non-zero blends the
 *       upper one (callers are expected to pass 0 or 1).
 *
 * The even variant writes FB with a plain move for red, so whatever FB held
 * before is discarded.  The odd variant only ORs its three fields into bits
 * 31..16 of FB.  To build a two-pixel word, expand the even variant first
 * and the odd variant second, on the same FB.
 *
 * DREG, SRC and FB must not be r6, r7 or lr.
 *
 * clobbered: r6, r7, lr, condition flags
 */

.macro pixel,   DREG, SRC, FB, ODD

    // r7 = f, the weight applied to every destination channel
    lsr     r7, \SRC, #24                   // sA
    add     r7, r7, r7, lsr #7              // sA + (sA >> 7), so 255 maps to 256
    rsb     r7, r7, #0x100                  // f = 0x100 - (sA + (sA >> 7))

.ifeq \ODD

    // ---- even pixel: DREG bits 15..0, result placed in FB bits 15..0 ----

    // red (dst bits 15..11, src bits 7..3); this move initialises FB
    lsr     lr, \DREG, #11
    and     lr, lr, #0x1F
    smulbb  lr, r7, lr                      // f * dR
    lsr     r6, \SRC, #3
    and     r6, r6, #0x1F                   // sR
    add     lr, r6, lr, lsr #8              // sR + ((f * dR) >> 8)
    cmp     lr, #0x1F
    movhs   \FB, #(0x1F << 11)              // saturate
    lsllo   \FB, lr, #11

    // green (dst bits 10..5, src bits 15..10)
    and     r6, \DREG, #(0x3F << 5)         // dG still shifted left by 5
    smulbb  r6, r7, r6                      // f * (dG << 5)
    lsr     lr, \SRC, #(8 + 2)
    and     lr, lr, #0x3F                   // sG
    add     r6, lr, r6, lsr #(5 + 8)        // sG + ((f * dG) >> 8)
    cmp     r6, #0x3F
    orrhs   \FB, \FB, #(0x3F << 5)          // saturate
    orrlo   \FB, \FB, r6, lsl #5

    // blue (dst bits 4..0, src bits 23..19)
    and     lr, \DREG, #0x1F
    smulbb  lr, r7, lr                      // f * dB
    lsr     r6, \SRC, #(8 + 8 + 3)
    and     r6, r6, #0x1F                   // sB
    add     lr, r6, lr, lsr #8              // sB + ((f * dB) >> 8)
    cmp     lr, #0x1F
    orrhs   \FB, \FB, #0x1F                 // saturate
    orrlo   \FB, \FB, lr

.else

    // ---- odd pixel: DREG bits 31..16, result ORed into FB bits 31..16 ----

    // red (dst bits 31..27, src bits 7..3); the shift leaves only dR
    lsr     lr, \DREG, #(16 + 11)
    smulbb  lr, r7, lr                      // f * dR
    lsr     r6, \SRC, #3
    and     r6, r6, #0x1F                   // sR
    add     lr, r6, lr, lsr #8              // sR + ((f * dR) >> 8)
    cmp     lr, #0x1F
    orrhs   \FB, \FB, #(0x1F << (16 + 11))  // saturate
    orrlo   \FB, \FB, lr, lsl #(16 + 11)

    // green (dst bits 26..21, src bits 15..10)
    and     r6, \DREG, #(0x3F << (16 + 5))  // top halfword = dG << 5
    smulbt  r6, r7, r6                      // f * (dG << 5)
    lsr     lr, \SRC, #(8 + 2)
    and     lr, lr, #0x3F                   // sG
    add     r6, lr, r6, lsr #(5 + 8)        // sG + ((f * dG) >> 8)
    cmp     r6, #0x3F
    orrhs   \FB, \FB, #(0x3F << (16 + 5))   // saturate
    orrlo   \FB, \FB, r6, lsl #(16 + 5)

    // blue (dst bits 20..16, src bits 23..19)
    and     lr, \DREG, #(0x1F << 16)        // top halfword = dB
    smulbt  lr, r7, lr                      // f * dB
    lsr     r6, \SRC, #(8 + 8 + 3)
    and     r6, r6, #0x1F                   // sB
    add     lr, r6, lr, lsr #8              // sB + ((f * dB) >> 8)
    cmp     lr, #0x1F
    orrhs   \FB, \FB, #(0x1F << 16)         // saturate
    orrlo   \FB, \FB, lr, lsl #16

.endif

    .endm
    

// scanline_t32cb16blend_arm
//
// Blends a run of 32-bit 0xAABBGGRR source pixels over 16-bit RGB565
// destination pixels in place, expanding the pixel macro once per pixel.
//
// in:  r0 = dst ptr  (16-bit pixels, read and written, advanced by 2 per pixel)
//      r1 = src ptr  (32-bit pixels, read only, advanced by 4 per pixel)
//      r2 = count    (number of pixels; 0 returns without loading or
//                     storing any pixel)
//
// NOTE(review): the alignment step below handles one leading pixel only, so
// dst is presumably at least 2-byte aligned -- confirm against the callers.
//
// register use:
// r0:  dst ptr
// r1:  src ptr
// r2:  count (biased; goes negative when fewer than 2 pixels remain)
// r3:  d   (destination pixel or pixel pair loaded from memory)
// r4:  s0  (source pixel blended into the even/low destination pixel)
// r5:  s1  (source pixel blended into the odd/high destination pixel)
// r6:  pixel macro scratch
// r7:  pixel macro scratch
// r8:  free
// r9:  free
// r10: free
// r11: free
// r12: blended result (FB of the pixel macro) waiting to be stored
// r14: pixel macro scratch
//
// r4-r7 and lr are saved on entry and restored before returning.
// r0-r3, r12 and the condition flags are not preserved.

scanline_t32cb16blend_arm:
    stmfd   sp!, {r4-r7, lr}

    pld     [r0]
    pld     [r1]

    // align DST to 32 bits
    tst     r0, #0x3
    beq     aligned
    subs    r2, r2, #1
    ldmfdlo sp!, {r4-r7, lr}        // count was 0: return
    bxlo    lr

    // Single-pixel path.  Used for the leading pixel when DST is not 32-bit
    // aligned and for the trailing pixel when one is left over.  It always
    // fetches its own source pixel from r1.
last:
    ldr     r4, [r1], #4
    ldrh    r3, [r0]
    pixel   r3, r4, r12, 0
    strh    r12, [r0], #2

aligned:
    // r2 = remaining - 2 from here on; negative means fewer than 2 pixels left
    subs    r2, r2, #2
    blo     9f

    // The main loop is unrolled twice and processes 4 pixels
8:  ldmia   r1!, {r4, r5}
    // stream the source
    pld     [r1, #32]
    add     r0, r0, #4
    // both source pixels are zero: the blend would leave the destination
    // unchanged, so skip this pair (and go straight to the loop test)
    orrs    r3, r4, r5
    beq     7f

    // load the destination
    ldr     r3, [r0, #-4]
    // stream the destination
    pld     [r0, #32]
    pixel   r3, r4, r12, 0          // even pixel first: it initialises r12
    pixel   r3, r5, r12, 1          // odd pixel is ORed into the top half
    // effectively, we are getting write-combining by virtue of the
    // cpu write-back cache.
    str     r12, [r0, #-4]

    // 2nd iteration of the loop, do not stream anything
    subs    r2, r2, #2
    blt     9f                      // fewer than 2 pixels left: finish up
    ldmia   r1!, {r4, r5}
    add     r0, r0, #4
    orrs    r3, r4, r5
    beq     7f
    ldr     r3, [r0, #-4]
    pixel   r3, r4, r12, 0
    pixel   r3, r5, r12, 1
    str     r12, [r0, #-4]


7:  subs    r2, r2, #2
    bhs     8b

    // r2 is -2 (nothing left) or -1 (one pixel left) here
9:  adds    r2, r2, #1
    ldmfdlo sp!, {r4-r7, lr}        // nothing left: return
    bxlo    lr
    b       last                    // one pixel left, r2 is now 0
auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00			`/* libs/pixelflinger/t32cb16blend.S`
			`**`
			`** Copyright 2006, The Android Open Source Project`
			`**`
			`** Licensed under the Apache License, Version 2.0 (the "License");`
			`** you may not use this file except in compliance with the License.`
			`** You may obtain a copy of the License at`
			`**`
			`** http://www.apache.org/licenses/LICENSE-2.0`
			`**`
			`** Unless required by applicable law or agreed to in writing, software`
			`** distributed under the License is distributed on an "AS IS" BASIS,`
			`** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`** See the License for the specific language governing permissions and`
			`** limitations under the License.`
			`*/`


			`.text`
To compile with llvm integrated assembler. * Explicitly specify default .align 0. * Use standard ldmfdlo instruction. * Before and after gas outputs are identical, with align 0 sections. * Objdump showed .text/.data/.bss section alignment attributes are 2^0 from gas and 2^2 from llvm assembler. These .S files might be working when compiled by gas, but llvm assembler's output should be more correct or conservative. Change-Id: I4e578dbc8155c0d06d1bbc1c33ec4cc851a18479 2015-08-21 21:17:43 +02:00			`.syntax unified`
Replace .align with .balign to avoid ambiguity Directive .align is arch-dependent, .balign is not. Change-Id: Ibf2097da29f743f2c87c79d2a88ce1abd0aa6227 2016-07-13 22:08:18 +02:00			`.balign 4`
auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00
			`.global scanline_t32cb16blend_arm`


fix part of [2017702] OpenGL bugs with alpha values of 1.0 in the source during blending into 8888 buffers when ONE / ONE_MINUS_SRC_ALPHA blending mode was used, the code wasn't saturating the color component. the reason was that this mode is used for premltiplied alpha blending, however, if used with a non premultiplied source, the color component would wrap. unfortunately, this costs 6 extra cycles per pixels, however... "correctness" prevails. this should not impact the UI since it's using h/w acceleration most of the time it also doesn't impact games which should be using h/w GL. This change will slow the emulator down a bit. 2009-08-18 10:07:35 +02:00			`/*`
			`* .macro pixel`
			`*`
			`* \DREG is a 32-bit register containing two original destination RGB565`
			`* pixels, with the even one in the low-16 bits, and the odd one in the`
			`* high 16 bits.`
			`*`
			`* \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.`
			`*`
			`* \FB is a target register that will contain the blended pixel values.`
			`*`
			`* \ODD is either 0 or 1 and indicates if we're blending the lower or`
			`* upper 16-bit pixels in DREG into FB`
			`*`
			`*`
			`* clobbered: r6, r7, lr`
			`*`
			`*/`

			`.macro pixel, DREG, SRC, FB, ODD`

			`// SRC = 0xAABBGGRR`
auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00			`mov r7, \SRC, lsr #24 // sA`
			`add r7, r7, r7, lsr #7 // sA + (sA >> 7)`
			`rsb r7, r7, #0x100 // sA = 0x100 - (sA+(sA>>7))`

			`1:`

fix part of [2017702] OpenGL bugs with alpha values of 1.0 in the source during blending into 8888 buffers when ONE / ONE_MINUS_SRC_ALPHA blending mode was used, the code wasn't saturating the color component. the reason was that this mode is used for premltiplied alpha blending, however, if used with a non premultiplied source, the color component would wrap. unfortunately, this costs 6 extra cycles per pixels, however... "correctness" prevails. this should not impact the UI since it's using h/w acceleration most of the time it also doesn't impact games which should be using h/w GL. This change will slow the emulator down a bit. 2009-08-18 10:07:35 +02:00			`.if \ODD`
auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00
			`// red`
fix part of [2017702] OpenGL bugs with alpha values of 1.0 in the source during blending into 8888 buffers when ONE / ONE_MINUS_SRC_ALPHA blending mode was used, the code wasn't saturating the color component. the reason was that this mode is used for premltiplied alpha blending, however, if used with a non premultiplied source, the color component would wrap. unfortunately, this costs 6 extra cycles per pixels, however... "correctness" prevails. this should not impact the UI since it's using h/w acceleration most of the time it also doesn't impact games which should be using h/w GL. This change will slow the emulator down a bit. 2009-08-18 10:07:35 +02:00			`mov lr, \DREG, lsr #(16 + 11)`
auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00			`smulbb lr, r7, lr`
			`mov r6, \SRC, lsr #3`
			`and r6, r6, #0x1F`
			`add lr, r6, lr, lsr #8`
fix part of [2017702] OpenGL bugs with alpha values of 1.0 in the source during blending into 8888 buffers when ONE / ONE_MINUS_SRC_ALPHA blending mode was used, the code wasn't saturating the color component. the reason was that this mode is used for premltiplied alpha blending, however, if used with a non premultiplied source, the color component would wrap. unfortunately, this costs 6 extra cycles per pixels, however... "correctness" prevails. this should not impact the UI since it's using h/w acceleration most of the time it also doesn't impact games which should be using h/w GL. This change will slow the emulator down a bit. 2009-08-18 10:07:35 +02:00			`cmp lr, #0x1F`
			`orrhs \FB, \FB, #(0x1F<<(16 + 11))`
			`orrlo \FB, \FB, lr, lsl #(16 + 11)`
auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00
			`// green`
fix part of [2017702] OpenGL bugs with alpha values of 1.0 in the source during blending into 8888 buffers when ONE / ONE_MINUS_SRC_ALPHA blending mode was used, the code wasn't saturating the color component. the reason was that this mode is used for premltiplied alpha blending, however, if used with a non premultiplied source, the color component would wrap. unfortunately, this costs 6 extra cycles per pixels, however... "correctness" prevails. this should not impact the UI since it's using h/w acceleration most of the time it also doesn't impact games which should be using h/w GL. This change will slow the emulator down a bit. 2009-08-18 10:07:35 +02:00			`and r6, \DREG, #(0x3F<<(16 + 5))`
auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00			`smulbt r6, r7, r6`
			`mov lr, \SRC, lsr #(8+2)`
			`and lr, lr, #0x3F`
			`add r6, lr, r6, lsr #(5+8)`
fix part of [2017702] OpenGL bugs with alpha values of 1.0 in the source during blending into 8888 buffers when ONE / ONE_MINUS_SRC_ALPHA blending mode was used, the code wasn't saturating the color component. the reason was that this mode is used for premltiplied alpha blending, however, if used with a non premultiplied source, the color component would wrap. unfortunately, this costs 6 extra cycles per pixels, however... "correctness" prevails. this should not impact the UI since it's using h/w acceleration most of the time it also doesn't impact games which should be using h/w GL. This change will slow the emulator down a bit. 2009-08-18 10:07:35 +02:00			`cmp r6, #0x3F`
			`orrhs \FB, \FB, #(0x3F<<(16 + 5))`
			`orrlo \FB, \FB, r6, lsl #(16 + 5)`
auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00
			`// blue`
fix part of [2017702] OpenGL bugs with alpha values of 1.0 in the source during blending into 8888 buffers when ONE / ONE_MINUS_SRC_ALPHA blending mode was used, the code wasn't saturating the color component. the reason was that this mode is used for premltiplied alpha blending, however, if used with a non premultiplied source, the color component would wrap. unfortunately, this costs 6 extra cycles per pixels, however... "correctness" prevails. this should not impact the UI since it's using h/w acceleration most of the time it also doesn't impact games which should be using h/w GL. This change will slow the emulator down a bit. 2009-08-18 10:07:35 +02:00			`and lr, \DREG, #(0x1F << 16)`
auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00			`smulbt lr, r7, lr`
			`mov r6, \SRC, lsr #(8+8+3)`
			`and r6, r6, #0x1F`
			`add lr, r6, lr, lsr #8`
fix part of [2017702] OpenGL bugs with alpha values of 1.0 in the source during blending into 8888 buffers when ONE / ONE_MINUS_SRC_ALPHA blending mode was used, the code wasn't saturating the color component. the reason was that this mode is used for premltiplied alpha blending, however, if used with a non premultiplied source, the color component would wrap. unfortunately, this costs 6 extra cycles per pixels, however... "correctness" prevails. this should not impact the UI since it's using h/w acceleration most of the time it also doesn't impact games which should be using h/w GL. This change will slow the emulator down a bit. 2009-08-18 10:07:35 +02:00			`cmp lr, #0x1F`
			`orrhs \FB, \FB, #(0x1F << 16)`
			`orrlo \FB, \FB, lr, lsl #16`
auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00
			`.else`

			`// red`
fix part of [2017702] OpenGL bugs with alpha values of 1.0 in the source during blending into 8888 buffers when ONE / ONE_MINUS_SRC_ALPHA blending mode was used, the code wasn't saturating the color component. the reason was that this mode is used for premltiplied alpha blending, however, if used with a non premultiplied source, the color component would wrap. unfortunately, this costs 6 extra cycles per pixels, however... "correctness" prevails. this should not impact the UI since it's using h/w acceleration most of the time it also doesn't impact games which should be using h/w GL. This change will slow the emulator down a bit. 2009-08-18 10:07:35 +02:00			`mov lr, \DREG, lsr #11`
auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00			`and lr, lr, #0x1F`
			`smulbb lr, r7, lr`
			`mov r6, \SRC, lsr #3`
			`and r6, r6, #0x1F`
			`add lr, r6, lr, lsr #8`
fix part of [2017702] OpenGL bugs with alpha values of 1.0 in the source during blending into 8888 buffers when ONE / ONE_MINUS_SRC_ALPHA blending mode was used, the code wasn't saturating the color component. the reason was that this mode is used for premltiplied alpha blending, however, if used with a non premultiplied source, the color component would wrap. unfortunately, this costs 6 extra cycles per pixels, however... "correctness" prevails. this should not impact the UI since it's using h/w acceleration most of the time it also doesn't impact games which should be using h/w GL. This change will slow the emulator down a bit. 2009-08-18 10:07:35 +02:00			`cmp lr, #0x1F`
			`movhs \FB, #(0x1F<<11)`
			`movlo \FB, lr, lsl #11`

auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00
			`// green`
			`and r6, \DREG, #(0x3F<<5)`
			`smulbb r6, r7, r6`
			`mov lr, \SRC, lsr #(8+2)`
			`and lr, lr, #0x3F`
			`add r6, lr, r6, lsr #(5+8)`
fix part of [2017702] OpenGL bugs with alpha values of 1.0 in the source during blending into 8888 buffers when ONE / ONE_MINUS_SRC_ALPHA blending mode was used, the code wasn't saturating the color component. the reason was that this mode is used for premltiplied alpha blending, however, if used with a non premultiplied source, the color component would wrap. unfortunately, this costs 6 extra cycles per pixels, however... "correctness" prevails. this should not impact the UI since it's using h/w acceleration most of the time it also doesn't impact games which should be using h/w GL. This change will slow the emulator down a bit. 2009-08-18 10:07:35 +02:00			`cmp r6, #0x3F`
			`orrhs \FB, \FB, #(0x3F<<5)`
			`orrlo \FB, \FB, r6, lsl #5`
auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00
			`// blue`
			`and lr, \DREG, #0x1F`
			`smulbb lr, r7, lr`
			`mov r6, \SRC, lsr #(8+8+3)`
			`and r6, r6, #0x1F`
			`add lr, r6, lr, lsr #8`
fix part of [2017702] OpenGL bugs with alpha values of 1.0 in the source during blending into 8888 buffers when ONE / ONE_MINUS_SRC_ALPHA blending mode was used, the code wasn't saturating the color component. the reason was that this mode is used for premltiplied alpha blending, however, if used with a non premultiplied source, the color component would wrap. unfortunately, this costs 6 extra cycles per pixels, however... "correctness" prevails. this should not impact the UI since it's using h/w acceleration most of the time it also doesn't impact games which should be using h/w GL. This change will slow the emulator down a bit. 2009-08-18 10:07:35 +02:00			`cmp lr, #0x1F`
			`orrhs \FB, \FB, #0x1F`
			`orrlo \FB, \FB, lr`
auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00
			`.endif`

			`.endm`


			`// r0: dst ptr`
			`// r1: src ptr`
			`// r2: count`
			`// r3: d`
			`// r4: s0`
			`// r5: s1`
			`// r6: pixel`
			`// r7: pixel`
			`// r8: free`
			`// r9: free`
			`// r10: free`
			`// r11: free`
			`// r12: scratch`
			`// r14: pixel`

			`scanline_t32cb16blend_arm:`
			`stmfd sp!, {r4-r7, lr}`

			`pld [r0]`
			`pld [r1]`

			`// align DST to 32 bits`
			`tst r0, #0x3`
			`beq aligned`
			`subs r2, r2, #1`
To compile with llvm integrated assembler. * Explicitly specify default .align 0. * Use standard ldmfdlo instruction. * Before and after gas outputs are identical, with align 0 sections. * Objdump showed .text/.data/.bss section alignment attributes are 2^0 from gas and 2^2 from llvm assembler. These .S files might be working when compiled by gas, but llvm assembler's output should be more correct or conservative. Change-Id: I4e578dbc8155c0d06d1bbc1c33ec4cc851a18479 2015-08-21 21:17:43 +02:00			`ldmfdlo sp!, {r4-r7, lr} // return`
auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00			`bxlo lr`

			`last:`
			`ldr r4, [r1], #4`
			`ldrh r3, [r0]`
			`pixel r3, r4, r12, 0`
			`strh r12, [r0], #2`

			`aligned:`
			`subs r2, r2, #2`
			`blo 9f`

fix part of [2017702] OpenGL bugs with alpha values of 1.0 in the source during blending into 8888 buffers when ONE / ONE_MINUS_SRC_ALPHA blending mode was used, the code wasn't saturating the color component. the reason was that this mode is used for premltiplied alpha blending, however, if used with a non premultiplied source, the color component would wrap. unfortunately, this costs 6 extra cycles per pixels, however... "correctness" prevails. this should not impact the UI since it's using h/w acceleration most of the time it also doesn't impact games which should be using h/w GL. This change will slow the emulator down a bit. 2009-08-18 10:07:35 +02:00			`// The main loop is unrolled twice and processes 4 pixels`
auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00			`8: ldmia r1!, {r4, r5}`
			`// stream the source`
			`pld [r1, #32]`
			`add r0, r0, #4`
			`// it's all zero, skip this pixel`
			`orrs r3, r4, r5`
			`beq 7f`

			`// load the destination`
			`ldr r3, [r0, #-4]`
			`// stream the destination`
			`pld [r0, #32]`
			`pixel r3, r4, r12, 0`
fix part of [2017702] OpenGL bugs with alpha values of 1.0 in the source during blending into 8888 buffers when ONE / ONE_MINUS_SRC_ALPHA blending mode was used, the code wasn't saturating the color component. the reason was that this mode is used for premltiplied alpha blending, however, if used with a non premultiplied source, the color component would wrap. unfortunately, this costs 6 extra cycles per pixels, however... "correctness" prevails. this should not impact the UI since it's using h/w acceleration most of the time it also doesn't impact games which should be using h/w GL. This change will slow the emulator down a bit. 2009-08-18 10:07:35 +02:00			`pixel r3, r5, r12, 1`
auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00			`// effectively, we're getting write-combining by virtue of the`
			`// cpu's write-back cache.`
			`str r12, [r0, #-4]`

			`// 2nd iterration of the loop, don't stream anything`
			`subs r2, r2, #2`
			`movlt r4, r5`
			`blt 9f`
			`ldmia r1!, {r4, r5}`
			`add r0, r0, #4`
			`orrs r3, r4, r5`
			`beq 7f`
			`ldr r3, [r0, #-4]`
			`pixel r3, r4, r12, 0`
			`pixel r3, r5, r12, 16`
			`str r12, [r0, #-4]`


			`7: subs r2, r2, #2`
			`bhs 8b`
			`mov r4, r5`

			`9: adds r2, r2, #1`
To compile with llvm integrated assembler. * Explicitly specify default .align 0. * Use standard ldmfdlo instruction. * Before and after gas outputs are identical, with align 0 sections. * Objdump showed .text/.data/.bss section alignment attributes are 2^0 from gas and 2^2 from llvm assembler. These .S files might be working when compiled by gas, but llvm assembler's output should be more correct or conservative. Change-Id: I4e578dbc8155c0d06d1bbc1c33ec4cc851a18479 2015-08-21 21:17:43 +02:00			`ldmfdlo sp!, {r4-r7, lr} // return`
auto import from //depot/cupcake/@135843 2009-03-04 04:32:55 +01:00			`bxlo lr`
			`b last`