/* libs/pixelflinger/codeflinger/texturing.cpp
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
** http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <cutils/log.h>
#include "GGLAssembler.h"
namespace android {
// ---------------------------------------------------------------------------
// iterators are initialized like this:
// (intToFixedCenter(x) * dx)>>16 + x0
// ((x<<16 + 0x8000) * dx)>>16 + x0
// ((x<<16)*dx + (0x8000*dx))>>16 + x0
// ( (x*dx) + dx>>1 ) + x0
// (x*dx) + (dx>>1 + x0)
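// So per pixel the generated code only has to compute x*dx plus a
// constant; in this generator the iterators.ydady/ydrdy/ydgdy/ydbdy
// context fields are assumed to already hold that constant
// (dx>>1 + x0, evaluated for the current scanline).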
void GGLAssembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x)
{
context_t const* c = mBuilderContext.c;
const needs_t& needs = mBuilderContext.needs;
if (mSmooth) {
// NOTE: we could also take this path in the mDithering + !mSmooth case,
// but that would use up to 4 more registers for the color components
// for only a little added quality.
// Currently, this causes the system to run out of registers in
// some cases (see issue #719496)
comment("compute initial iterated color (smooth and/or dither case)");
parts.iterated_packed = 0;
parts.packed = 0;
// parts.reload bit 0x1: reload the color components
// parts.reload bit 0x2: reload the iterators
const int optReload = mOptLevel >> 1;
if (optReload >= 3) parts.reload = 0; // reload nothing
else if (optReload == 2) parts.reload = 2; // reload iterators
else if (optReload == 1) parts.reload = 1; // reload colors
else if (optReload <= 0) parts.reload = 3; // reload both
if (!mSmooth) {
// we're not smoothing (just dithering), we never have to
// reload the iterators
parts.reload &= ~2;
}
Scratch scratches(registerFile());
const int t0 = (parts.reload & 1) ? scratches.obtain() : 0;
const int t1 = (parts.reload & 2) ? scratches.obtain() : 0;
for (int i=0 ; i<4 ; i++) {
if (!mInfo[i].iterated)
continue;
// this component exists in the destination and is not replaced
// by a texture unit.
const int c = (parts.reload & 1) ? t0 : obtainReg();
if (i==0) CONTEXT_LOAD(c, iterators.ydady);
if (i==1) CONTEXT_LOAD(c, iterators.ydrdy);
if (i==2) CONTEXT_LOAD(c, iterators.ydgdy);
if (i==3) CONTEXT_LOAD(c, iterators.ydbdy);
parts.argb[i].reg = c;
if (mInfo[i].smooth) {
parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg();
const int dvdx = parts.argb_dx[i].reg;
CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx);
MLA(AL, 0, c, x.reg, dvdx, c);
// adjust the color iterator to make sure it won't overflow
if (!mAA) {
// this is not needed when we're using anti-aliasing
// because we will (have to) clamp the components
// anyway.
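// The adjustment below computes the iterated value at the last pixel
// of the span (end = c + dvdx*(count>>16)); if that value would go
// negative, the start is moved so the last pixel lands exactly at 0,
// and the start itself is then unconditionally clamped to be
// non-negative.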
int end = scratches.obtain();
MOV(AL, 0, end, reg_imm(parts.count.reg, LSR, 16));
MLA(AL, 1, end, dvdx, end, c);
SUB(MI, 0, c, c, end);
BIC(AL, 0, c, c, reg_imm(c, ASR, 31));
scratches.recycle(end);
}
}
if (parts.reload & 1) {
CONTEXT_STORE(c, generated_vars.argb[i].c);
}
}
} else {
// We're not smoothing, so we can
// just use a packed version of the color and extract the
// components as needed (or not at all if we don't blend)
// figure out if we need the iterated color
int load = 0;
for (int i=0 ; i<4 ; i++) {
component_info_t& info = mInfo[i];
if ((info.inDest || info.needed) && !info.replaced)
load |= 1;
}
parts.iterated_packed = 1;
parts.packed = (!mTextureMachine.mask && !mBlending
&& !mFog && !mDithering);
parts.reload = 0;
if (load || parts.packed) {
if (mBlending || mDithering || mInfo[GGLFormat::ALPHA].needed) {
comment("load initial iterated color (8888 packed)");
parts.iterated.setTo(obtainReg(),
&(c->formats[GGL_PIXEL_FORMAT_RGBA_8888]));
CONTEXT_LOAD(parts.iterated.reg, packed8888);
} else {
comment("load initial iterated color (dest format packed)");
parts.iterated.setTo(obtainReg(), &mCbFormat);
// pre-mask the iterated color
const int bits = parts.iterated.size();
const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
uint32_t mask = 0;
if (mMasking) {
for (int i=0 ; i<4 ; i++) {
const int component_mask = 1<<i;
const int h = parts.iterated.format.c[i].h;
const int l = parts.iterated.format.c[i].l;
if (h && (!(mMasking & component_mask))) {
mask |= ((1<<(h-l))-1) << l;
}
}
}
if (mMasking && ((mask & size)==0)) {
// none of the components are present in the mask
} else {
CONTEXT_LOAD(parts.iterated.reg, packed);
if (mCbFormat.size == 1) {
AND(AL, 0, parts.iterated.reg,
parts.iterated.reg, imm(0xFF));
} else if (mCbFormat.size == 2) {
MOV(AL, 0, parts.iterated.reg,
reg_imm(parts.iterated.reg, LSR, 16));
}
}
// pre-mask the iterated color
if (mMasking) {
build_and_immediate(parts.iterated.reg, parts.iterated.reg,
mask, bits);
}
}
}
}
}
void GGLAssembler::build_iterated_color(
component_t& fragment,
const fragment_parts_t& parts,
int component,
Scratch& regs)
{
fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE);
if (!mInfo[component].iterated)
return;
if (parts.iterated_packed) {
// iterated colors are packed, extract the one we need
extract(fragment, parts.iterated, component);
} else {
fragment.h = GGL_COLOR_BITS;
fragment.l = GGL_COLOR_BITS - 8;
fragment.flags |= CLEAR_LO;
// iterated colors are held in their own register,
// (smooth and/or dithering case)
if (parts.reload==3) {
// this implies mSmooth
Scratch scratches(registerFile());
int dx = scratches.obtain();
CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
CONTEXT_LOAD(dx, generated_vars.argb[component].dx);
ADD(AL, 0, dx, fragment.reg, dx);
CONTEXT_STORE(dx, generated_vars.argb[component].c);
} else if (parts.reload & 1) {
CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
} else {
// we don't reload, so simply rename the register and mark as
// non CORRUPTIBLE so that the texture env or blending code
// won't modify this (renamed) register
regs.recycle(fragment.reg);
fragment.reg = parts.argb[component].reg;
fragment.flags &= ~CORRUPTIBLE;
}
if (mInfo[component].smooth && mAA) {
// when using smooth shading AND anti-aliasing, we need to clamp
// the iterators because there is always an extra pixel on the
// edges, which most of the time will cause an overflow
// (since technically it's outside of the domain).
BIC(AL, 0, fragment.reg, fragment.reg,
reg_imm(fragment.reg, ASR, 31));
component_sat(fragment);
}
}
}
// ---------------------------------------------------------------------------
void GGLAssembler::decodeLogicOpNeeds(const needs_t& needs)
{
// gather some information about the components we need to process...
const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
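// mLogicOp records whether a logic op is needed at all (LOGIC_OP) and
// whether that op reads the source (LOGIC_OP_SRC) and/or destination
// (LOGIC_OP_DST) pixel, presumably so later stages can skip fetching
// values the op ignores.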
switch(opcode) {
case GGL_COPY:
mLogicOp = 0;
break;
case GGL_CLEAR:
case GGL_SET:
mLogicOp = LOGIC_OP;
break;
case GGL_AND:
case GGL_AND_REVERSE:
case GGL_AND_INVERTED:
case GGL_XOR:
case GGL_OR:
case GGL_NOR:
case GGL_EQUIV:
case GGL_OR_REVERSE:
case GGL_OR_INVERTED:
case GGL_NAND:
mLogicOp = LOGIC_OP|LOGIC_OP_SRC|LOGIC_OP_DST;
break;
case GGL_NOOP:
case GGL_INVERT:
mLogicOp = LOGIC_OP|LOGIC_OP_DST;
break;
case GGL_COPY_INVERTED:
mLogicOp = LOGIC_OP|LOGIC_OP_SRC;
break;
};
}
void GGLAssembler::decodeTMUNeeds(const needs_t& needs, context_t const* c)
{
uint8_t replaced=0;
mTextureMachine.mask = 0;
mTextureMachine.activeUnits = 0;
for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) {
texture_unit_t& tmu = mTextureMachine.tmu[i];
if (replaced == 0xF) {
// all components are replaced, skip this TMU.
tmu.format_idx = 0;
tmu.mask = 0;
tmu.replaced = replaced;
continue;
}
tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]);
tmu.format = c->formats[tmu.format_idx];
tmu.bits = tmu.format.size*8;
tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]);
tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]);
tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i]));
tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]);
tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i])
&& tmu.format.size!=3; // XXX: only 8, 16 and 32 modes for now
// 5551 linear filtering is not supported
if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551)
tmu.linear = 0;
tmu.mask = 0;
tmu.replaced = replaced;
if (tmu.format_idx) {
mTextureMachine.activeUnits++;
if (tmu.format.c[0].h) tmu.mask |= 0x1;
if (tmu.format.c[1].h) tmu.mask |= 0x2;
if (tmu.format.c[2].h) tmu.mask |= 0x4;
if (tmu.format.c[3].h) tmu.mask |= 0x8;
if (tmu.env == GGL_REPLACE) {
replaced |= tmu.mask;
} else if (tmu.env == GGL_DECAL) {
if (!tmu.format.c[GGLFormat::ALPHA].h) {
// if we don't have alpha, decal does nothing
tmu.mask = 0;
} else {
// decal always ignores At
tmu.mask &= ~(1<<GGLFormat::ALPHA);
}
}
}
mTextureMachine.mask |= tmu.mask;
//printf("%d: mask=%08lx, replaced=%08lx\n",
// i, int(tmu.mask), int(tmu.replaced));
}
mTextureMachine.replaced = replaced;
mTextureMachine.directTexture = 0;
//printf("replaced=%08lx\n", mTextureMachine.replaced);
}
void GGLAssembler::init_textures(
tex_coord_t* coords,
const reg_t& x, const reg_t& y)
{
context_t const* c = mBuilderContext.c;
const needs_t& needs = mBuilderContext.needs;
int Rctx = mBuilderContext.Rctx;
int Rx = x.reg;
int Ry = y.reg;
if (mTextureMachine.mask) {
comment("compute texture coordinates");
}
// init texture coordinates for each tmu
const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n);
const bool multiTexture = mTextureMachine.activeUnits > 1;
for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
const texture_unit_t& tmu = mTextureMachine.tmu[i];
if (tmu.format_idx == 0)
continue;
if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
(tmu.twrap == GGL_NEEDS_WRAP_11))
{
// 1:1 texture
pointer_t& txPtr = coords[i].ptr;
txPtr.setTo(obtainReg(), tmu.bits);
CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy);
ADD(AL, 0, Rx, Rx, reg_imm(txPtr.reg, ASR, 16)); // x += (s>>16)
CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy);
ADD(AL, 0, Ry, Ry, reg_imm(txPtr.reg, ASR, 16)); // y += (t>>16)
// merge base & offset
CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride);
SMLABB(AL, Rx, Ry, txPtr.reg, Rx); // x+y*stride
CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
base_offset(txPtr, txPtr, Rx);
} else {
Scratch scratches(registerFile());
reg_t& s = coords[i].s;
reg_t& t = coords[i].t;
// s = (x * dsdx)>>16 + ydsdy
// s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0
// t = (x * dtdx)>>16 + ydtdy
// t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0
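// i.e. ydsdy/ydtdy are assumed to already fold in the y term and the
// start value ((y*dsdy)>>16 + s0), so only the x term is added here.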
s.setTo(obtainReg());
t.setTo(obtainReg());
const int need_w = GGL_READ_NEEDS(W, needs.n);
if (need_w) {
CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy);
CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy);
} else {
int ydsdy = scratches.obtain();
int ydtdy = scratches.obtain();
CONTEXT_LOAD(s.reg, generated_vars.texture[i].dsdx);
CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy);
CONTEXT_LOAD(t.reg, generated_vars.texture[i].dtdx);
CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy);
MLA(AL, 0, s.reg, Rx, s.reg, ydsdy);
MLA(AL, 0, t.reg, Rx, t.reg, ydtdy);
}
if ((mOptLevel&1)==0) {
CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
recycleReg(s.reg);
recycleReg(t.reg);
}
}
// direct texture?
if (!multiTexture && !mBlending && !mDithering && !mFog &&
cb_format_idx == tmu.format_idx && !tmu.linear &&
mTextureMachine.replaced == tmu.mask)
{
mTextureMachine.directTexture = i + 1;
}
}
}
void GGLAssembler::build_textures( fragment_parts_t& parts,
Scratch& regs)
{
context_t const* c = mBuilderContext.c;
const needs_t& needs = mBuilderContext.needs;
int Rctx = mBuilderContext.Rctx;
// We don't have a way to spill registers automatically, so spill
// the depth and AA registers up front when we know we may have to.
// build the spill list...
uint32_t spill_list = 0;
for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
const texture_unit_t& tmu = mTextureMachine.tmu[i];
if (tmu.format_idx == 0)
continue;
if (tmu.linear) {
// we may run out of registers if we have linear filtering
// at 1 or 4 bytes / pixel on any texture unit.
if (tmu.format.size == 1) {
// if depth and AA are enabled, we'll be short one register
if (parts.z.reg > 0 && parts.covPtr.reg > 0)
spill_list |= 1<<parts.covPtr.reg;
}
if (tmu.format.size == 4) {
// if depth or AA is enabled, we'll be short one or two registers
if (parts.z.reg > 0)
spill_list |= 1<<parts.z.reg;
if (parts.covPtr.reg > 0)
spill_list |= 1<<parts.covPtr.reg;
}
}
}
Spill spill(registerFile(), *this, spill_list);
const bool multiTexture = mTextureMachine.activeUnits > 1;
for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
const texture_unit_t& tmu = mTextureMachine.tmu[i];
if (tmu.format_idx == 0)
continue;
pointer_t& txPtr = parts.coords[i].ptr;
pixel_t& texel = parts.texel[i];
// repeat...
if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
(tmu.twrap == GGL_NEEDS_WRAP_11))
{ // 1:1 textures
comment("fetch texel");
texel.setTo(regs.obtain(), &tmu.format);
load(txPtr, texel, WRITE_BACK);
} else {
Scratch scratches(registerFile());
reg_t& s = parts.coords[i].s;
reg_t& t = parts.coords[i].t;
if ((mOptLevel&1)==0) {
comment("reload s/t (multitexture or linear filtering)");
s.reg = scratches.obtain();
t.reg = scratches.obtain();
CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]);
CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]);
}
if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
return;
comment("compute repeat/clamp");
int u = scratches.obtain();
int v = scratches.obtain();
int width = scratches.obtain();
int height = scratches.obtain();
int U = 0;
int V = 0;
if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
return;
CONTEXT_LOAD(width, generated_vars.texture[i].width);
CONTEXT_LOAD(height, generated_vars.texture[i].height);
int FRAC_BITS = 0;
if (tmu.linear) {
// linear interpolation
if (tmu.format.size == 1) {
// for 8-bit textures, we can afford
// 7 bits of fractional precision at no
// additional cost (we can't do 8 bits
// because filter8 uses signed 16-bit multiplies)
FRAC_BITS = 7;
} else if (tmu.format.size == 2) {
// filter16() is internally limited to 4 bits, so:
// FRAC_BITS=2 generates fewer instructions,
// FRAC_BITS=3,4,5 creates unpleasant artifacts,
// FRAC_BITS=6+ looks good
FRAC_BITS = 6;
} else if (tmu.format.size == 4) {
// filter32() is internally limited to 8 bits, so:
// FRAC_BITS=4 looks good
// FRAC_BITS=5+ looks better, but generates 3 extra instructions per pixel
FRAC_BITS = 6;
} else {
// for all other cases we use 4 bits.
FRAC_BITS = 4;
}
}
wrapping(u, s.reg, width, tmu.swrap, FRAC_BITS);
wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS);
if (tmu.linear) {
comment("compute linear filtering offsets");
// pixel size scale
const int shift = 31 - gglClz(tmu.format.size);
U = scratches.obtain();
V = scratches.obtain();
if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
return;
// sample the texel center
SUB(AL, 0, u, u, imm(1<<(FRAC_BITS-1)));
SUB(AL, 0, v, v, imm(1<<(FRAC_BITS-1)));
// get the fractional part of U,V
AND(AL, 0, U, u, imm((1<<FRAC_BITS)-1));
AND(AL, 0, V, v, imm((1<<FRAC_BITS)-1));
// compute width-1 and height-1
SUB(AL, 0, width, width, imm(1));
SUB(AL, 0, height, height, imm(1));
// get the integer part of U,V and clamp/wrap
// and compute offset to the next texel
if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) {
// u has already been REPEATed
MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
MOV(MI, 0, u, width);
CMP(AL, u, width);
MOV(LT, 0, width, imm(1 << shift));
if (shift)
MOV(GE, 0, width, reg_imm(width, LSL, shift));
RSB(GE, 0, width, width, imm(0));
} else {
// u has not been CLAMPed yet
// algorithm:
// if ((u>>4) >= width)
// u = width<<4
// width = 0
// else
// width = 1<<shift
// u = u>>4; // get integer part
// if (u<0)
// u = 0
// width = 0
// generated_vars.rt = width
CMP(AL, width, reg_imm(u, ASR, FRAC_BITS));
MOV(LE, 0, u, reg_imm(width, LSL, FRAC_BITS));
MOV(LE, 0, width, imm(0));
MOV(GT, 0, width, imm(1 << shift));
MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
MOV(MI, 0, u, imm(0));
MOV(MI, 0, width, imm(0));
}
CONTEXT_STORE(width, generated_vars.rt);
const int stride = width;
CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) {
// v has already been REPEATed
MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
MOV(MI, 0, v, height);
CMP(AL, v, height);
MOV(LT, 0, height, imm(1 << shift));
if (shift)
MOV(GE, 0, height, reg_imm(height, LSL, shift));
RSB(GE, 0, height, height, imm(0));
MUL(AL, 0, height, stride, height);
} else {
// v has not been CLAMPed yet
CMP(AL, height, reg_imm(v, ASR, FRAC_BITS));
MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS));
MOV(LE, 0, height, imm(0));
if (shift) {
MOV(GT, 0, height, reg_imm(stride, LSL, shift));
} else {
MOV(GT, 0, height, stride);
}
MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
MOV(MI, 0, v, imm(0));
MOV(MI, 0, height, imm(0));
}
CONTEXT_STORE(height, generated_vars.lb);
}
scratches.recycle(width);
scratches.recycle(height);
// iterate texture coordinates...
comment("iterate s,t");
int dsdx = scratches.obtain();
int dtdx = scratches.obtain();
if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
return;
CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
ADD(AL, 0, s.reg, s.reg, dsdx);
ADD(AL, 0, t.reg, t.reg, dtdx);
if ((mOptLevel&1)==0) {
CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
scratches.recycle(s.reg);
scratches.recycle(t.reg);
}
scratches.recycle(dsdx);
scratches.recycle(dtdx);
// merge base & offset...
comment("merge base & offset");
texel.setTo(regs.obtain(), &tmu.format);
txPtr.setTo(texel.reg, tmu.bits);
int stride = scratches.obtain();
if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
return;
CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
SMLABB(AL, u, v, stride, u); // u+v*stride
base_offset(txPtr, txPtr, u);
// load texel
if (!tmu.linear) {
comment("fetch texel");
load(txPtr, texel, 0);
} else {
// recycle registers we don't need anymore
scratches.recycle(u);
scratches.recycle(v);
scratches.recycle(stride);
comment("fetch texel, bilinear");
switch (tmu.format.size) {
case 1: filter8(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
case 2: filter16(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
case 3: filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
case 4: filter32(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
}
}
}
}
}
void GGLAssembler::build_iterate_texture_coordinates(
const fragment_parts_t& parts)
{
const bool multiTexture = mTextureMachine.activeUnits > 1;
for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
const texture_unit_t& tmu = mTextureMachine.tmu[i];
if (tmu.format_idx == 0)
continue;
if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
(tmu.twrap == GGL_NEEDS_WRAP_11))
{ // 1:1 textures
const pointer_t& txPtr = parts.coords[i].ptr;
ADD(AL, 0, txPtr.reg, txPtr.reg, imm(txPtr.size>>3));
} else {
Scratch scratches(registerFile());
int s = parts.coords[i].s.reg;
int t = parts.coords[i].t.reg;
if ((mOptLevel&1)==0) {
s = scratches.obtain();
t = scratches.obtain();
CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]);
CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]);
}
int dsdx = scratches.obtain();
int dtdx = scratches.obtain();
CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
ADD(AL, 0, s, s, dsdx);
ADD(AL, 0, t, t, dtdx);
if ((mOptLevel&1)==0) {
CONTEXT_STORE(s, generated_vars.texture[i].spill[0]);
CONTEXT_STORE(t, generated_vars.texture[i].spill[1]);
}
}
}
}
void GGLAssembler::filter8(
const fragment_parts_t& /*parts*/,
pixel_t& texel, const texture_unit_t& tmu,
int U, int V, pointer_t& txPtr,
int FRAC_BITS)
{
if (tmu.format.components != GGL_ALPHA &&
tmu.format.components != GGL_LUMINANCE)
{
// this is a packed format, and we don't support
// linear filtering (it's probably RGB 332)
// Should not happen with OpenGL|ES
LDRB(AL, texel.reg, txPtr.reg);
return;
}
// ------------------------
// about ~22 cycles / pixel
Scratch scratches(registerFile());
int pixel= scratches.obtain();
int d = scratches.obtain();
int u = scratches.obtain();
int k = scratches.obtain();
int rt = scratches.obtain();
int lb = scratches.obtain();
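// Bilinear scheme: the four texels (RB, LB, LT, RT) are accumulated
// into d with weights U*V, (1-U)*V, (1-U)*(1-V) and the remainder.
// k tracks 1<<(2*FRAC_BITS) minus the weights used so far, so the
// last (RT) weight is taken as that remainder and the four weights
// sum exactly to 1<<(2*FRAC_BITS).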
// RB -> U * V
CONTEXT_LOAD(rt, generated_vars.rt);
CONTEXT_LOAD(lb, generated_vars.lb);
int offset = pixel;
ADD(AL, 0, offset, lb, rt);
LDRB(AL, pixel, txPtr.reg, reg_scale_pre(offset));
SMULBB(AL, u, U, V);
SMULBB(AL, d, pixel, u);
RSB(AL, 0, k, u, imm(1<<(FRAC_BITS*2)));
// LB -> (1-U) * V
RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
LDRB(AL, pixel, txPtr.reg, reg_scale_pre(lb));
SMULBB(AL, u, U, V);
SMLABB(AL, d, pixel, u, d);
SUB(AL, 0, k, k, u);
// LT -> (1-U)*(1-V)
RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
LDRB(AL, pixel, txPtr.reg);
SMULBB(AL, u, U, V);
SMLABB(AL, d, pixel, u, d);
// RT -> U*(1-V)
LDRB(AL, pixel, txPtr.reg, reg_scale_pre(rt));
SUB(AL, 0, u, k, u);
SMLABB(AL, texel.reg, pixel, u, d);
for (int i=0 ; i<4 ; i++) {
if (!texel.format.c[i].h) continue;
texel.format.c[i].h = FRAC_BITS*2+8;
texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits is enough
}
texel.format.size = 4;
texel.format.bitsPerPixel = 32;
texel.flags |= CLEAR_LO;
}
void GGLAssembler::filter16(
const fragment_parts_t& /*parts*/,
pixel_t& texel, const texture_unit_t& tmu,
int U, int V, pointer_t& txPtr,
int FRAC_BITS)
{
// compute the mask
// XXX: it would be nice if the mask below could be computed
// automatically.
uint32_t mask = 0;
int shift = 0;
int prec = 0;
switch (tmu.format_idx) {
case GGL_PIXEL_FORMAT_RGB_565:
// source: 00000ggg.ggg00000 | rrrrr000.000bbbbb
// result: gggggggg.gggrrrrr | rrrrr0bb.bbbbbbbb
mask = 0x07E0F81F;
shift = 16;
prec = 5;
break;
case GGL_PIXEL_FORMAT_RGBA_4444:
// 0000,1111,0000,1111 | 0000,1111,0000,1111
mask = 0x0F0F0F0F;
shift = 12;
prec = 4;
break;
case GGL_PIXEL_FORMAT_LA_88:
// 0000,0000,1111,1111 | 0000,0000,1111,1111
// AALL -> 00AA | 00LL
mask = 0x00FF00FF;
shift = 8;
prec = 8;
break;
default:
// unsupported format, do something sensible...
ALOGE("Unsupported 16-bit texture format (%d)", tmu.format_idx);
LDRH(AL, texel.reg, txPtr.reg);
return;
}
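// The ORR(pixel, pixel<<shift) followed by AND with 'mask' below
// spreads the components of the 16-bit source into disjoint fields of
// one 32-bit word, each with 'prec' bits of headroom above it, so a
// single MUL/MLA by the (at most prec-bit) weight scales and
// accumulates all components at once without carries between fields.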
const int adjust = FRAC_BITS*2 - prec;
const int round = 0;
// update the texel format
texel.format.size = 4;
texel.format.bitsPerPixel = 32;
texel.flags |= CLEAR_HI|CLEAR_LO;
for (int i=0 ; i<4 ; i++) {
if (!texel.format.c[i].h) continue;
const uint32_t offset = (mask & tmu.format.mask(i)) ? 0 : shift;
texel.format.c[i].h = tmu.format.c[i].h + offset + prec;
texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec);
}
// ------------------------
// about ~40 cycles / pixel
Scratch scratches(registerFile());
int pixel= scratches.obtain();
int d = scratches.obtain();
int u = scratches.obtain();
int k = scratches.obtain();
// RB -> U * V
int offset = pixel;
CONTEXT_LOAD(offset, generated_vars.rt);
CONTEXT_LOAD(u, generated_vars.lb);
ADD(AL, 0, offset, offset, u);
LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
SMULBB(AL, u, U, V);
ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
build_and_immediate(pixel, pixel, mask, 32);
if (adjust) {
if (round)
ADD(AL, 0, u, u, imm(1<<(adjust-1)));
MOV(AL, 0, u, reg_imm(u, LSR, adjust));
}
MUL(AL, 0, d, pixel, u);
RSB(AL, 0, k, u, imm(1<<prec));
// LB -> (1-U) * V
CONTEXT_LOAD(offset, generated_vars.lb);
RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
SMULBB(AL, u, U, V);
ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
build_and_immediate(pixel, pixel, mask, 32);
if (adjust) {
if (round)
ADD(AL, 0, u, u, imm(1<<(adjust-1)));
MOV(AL, 0, u, reg_imm(u, LSR, adjust));
}
MLA(AL, 0, d, pixel, u, d);
SUB(AL, 0, k, k, u);
// LT -> (1-U)*(1-V)
RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
LDRH(AL, pixel, txPtr.reg);
SMULBB(AL, u, U, V);
ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
build_and_immediate(pixel, pixel, mask, 32);
if (adjust) {
if (round)
ADD(AL, 0, u, u, imm(1<<(adjust-1)));
MOV(AL, 0, u, reg_imm(u, LSR, adjust));
}
MLA(AL, 0, d, pixel, u, d);
// RT -> U*(1-V)
CONTEXT_LOAD(offset, generated_vars.rt);
LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
SUB(AL, 0, u, k, u);
ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
build_and_immediate(pixel, pixel, mask, 32);
MLA(AL, 0, texel.reg, pixel, u, d);
}
void GGLAssembler::filter24(
const fragment_parts_t& /*parts*/,
pixel_t& texel, const texture_unit_t& /*tmu*/,
int /*U*/, int /*V*/, pointer_t& txPtr,
int /*FRAC_BITS*/)
{
// not supported yet (currently disabled)
load(txPtr, texel, 0);
}
void GGLAssembler::filter32(
const fragment_parts_t& /*parts*/,
pixel_t& texel, const texture_unit_t& /*tmu*/,
int U, int V, pointer_t& txPtr,
int FRAC_BITS)
{
const int adjust = FRAC_BITS*2 - 8;
const int round = 0;
// ------------------------
// about ~38 cycles / pixel
Scratch scratches(registerFile());
int pixel= scratches.obtain();
int dh = scratches.obtain();
int u = scratches.obtain();
int k = scratches.obtain();
int temp = scratches.obtain();
int dl = scratches.obtain();
int mask = scratches.obtain();
MOV(AL, 0, mask, imm(0xFF));
ORR(AL, 0, mask, mask, imm(0xFF0000));
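// mask = 0x00FF00FF: ANDing the pixel keeps bytes 0 and 2, ANDing
// pixel>>8 keeps bytes 1 and 3. Each pair of components then has
// 8 bits of headroom, so it can be scaled by the 8-bit weight and
// accumulated (dh holds one pair, dl the other); the two halves are
// recombined into texel.reg at the end.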
// RB -> U * V
int offset = pixel;
CONTEXT_LOAD(offset, generated_vars.rt);
CONTEXT_LOAD(u, generated_vars.lb);
ADD(AL, 0, offset, offset, u);
LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
SMULBB(AL, u, U, V);
AND(AL, 0, temp, mask, pixel);
if (adjust) {
if (round)
ADD(AL, 0, u, u, imm(1<<(adjust-1)));
MOV(AL, 0, u, reg_imm(u, LSR, adjust));
}
MUL(AL, 0, dh, temp, u);
AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
MUL(AL, 0, dl, temp, u);
RSB(AL, 0, k, u, imm(0x100));
// LB -> (1-U) * V
CONTEXT_LOAD(offset, generated_vars.lb);
RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
SMULBB(AL, u, U, V);
AND(AL, 0, temp, mask, pixel);
if (adjust) {
if (round)
ADD(AL, 0, u, u, imm(1<<(adjust-1)));
MOV(AL, 0, u, reg_imm(u, LSR, adjust));
}
MLA(AL, 0, dh, temp, u, dh);
AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
MLA(AL, 0, dl, temp, u, dl);
SUB(AL, 0, k, k, u);
// LT -> (1-U)*(1-V)
RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
LDR(AL, pixel, txPtr.reg);
SMULBB(AL, u, U, V);
AND(AL, 0, temp, mask, pixel);
if (adjust) {
if (round)
ADD(AL, 0, u, u, imm(1<<(adjust-1)));
MOV(AL, 0, u, reg_imm(u, LSR, adjust));
}
MLA(AL, 0, dh, temp, u, dh);
AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
MLA(AL, 0, dl, temp, u, dl);
// RT -> U*(1-V)
CONTEXT_LOAD(offset, generated_vars.rt);
LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
SUB(AL, 0, u, k, u);
AND(AL, 0, temp, mask, pixel);
MLA(AL, 0, dh, temp, u, dh);
AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
MLA(AL, 0, dl, temp, u, dl);
AND(AL, 0, dh, mask, reg_imm(dh, LSR, 8));
AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8));
ORR(AL, 0, texel.reg, dh, dl);
}
void GGLAssembler::build_texture_environment(
component_t& fragment,
const fragment_parts_t& parts,
int component,
Scratch& regs)
{
const uint32_t component_mask = 1<<component;
const bool multiTexture = mTextureMachine.activeUnits > 1;
for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
texture_unit_t& tmu = mTextureMachine.tmu[i];
if (tmu.mask & component_mask) {
// replace or modulate with this texture
if ((tmu.replaced & component_mask) == 0) {
// not replaced by a later tmu...
Scratch scratches(registerFile());
pixel_t texel(parts.texel[i]);
if (multiTexture &&
tmu.swrap == GGL_NEEDS_WRAP_11 &&
tmu.twrap == GGL_NEEDS_WRAP_11)
{
texel.reg = scratches.obtain();
texel.flags |= CORRUPTIBLE;
comment("fetch texel (multitexture 1:1)");
load(parts.coords[i].ptr, texel, WRITE_BACK);
}
component_t incoming(fragment);
modify(fragment, regs);
switch (tmu.env) {
case GGL_REPLACE:
extract(fragment, texel, component);
break;
case GGL_MODULATE:
modulate(fragment, incoming, texel, component);
break;
case GGL_DECAL:
decal(fragment, incoming, texel, component);
break;
case GGL_BLEND:
blend(fragment, incoming, texel, component, i);
break;
case GGL_ADD:
add(fragment, incoming, texel, component);
break;
}
}
}
}
}
// ---------------------------------------------------------------------------
void GGLAssembler::wrapping(
int d,
int coord, int size,
int tx_wrap, int tx_linear)
{
// notes:
// if tx_linear is set, we need 4 extra bits of precision on the result
// SMULL/UMULL is 3 cycles
Scratch scratches(registerFile());
int c = coord;
if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) {
// UMULL takes 4 cycles (interlocked), and we can get away with
// 2 cycles using SMULWB, but we're losing 16 bits of precision
// out of 32 (this is not a problem because the iterator keeps
// its full precision)
// UMULL(AL, 0, size, d, c, size);
// note: we can't use SMULTB because it's signed.
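// Here the iterator is treated as a 0.32 fixed-point fraction of the
// texture (so wrapping comes for free from 32-bit overflow): the shift
// keeps its top 16+tx_linear bits and SMULWB then yields frac*size
// with tx_linear fractional bits.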
MOV(AL, 0, d, reg_imm(c, LSR, 16-tx_linear));
SMULWB(AL, d, d, size);
} else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) {
if (tx_linear) {
// 1 cycle
MOV(AL, 0, d, reg_imm(coord, ASR, 16-tx_linear));
} else {
// 4 cycles (common case)
MOV(AL, 0, d, reg_imm(coord, ASR, 16));
BIC(AL, 0, d, d, reg_imm(d, ASR, 31));
CMP(AL, d, size);
SUB(GE, 0, d, size, imm(1));
}
}
}
// ---------------------------------------------------------------------------
void GGLAssembler::modulate(
component_t& dest,
const component_t& incoming,
const pixel_t& incomingTexel, int component)
{
Scratch locals(registerFile());
integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
extract(texel, incomingTexel, component);
const int Nt = texel.size();
// Nt should always be less than 10 bits because it comes
// from the TMU.
int Ni = incoming.size();
// Ni could be big because it comes from previous MODULATEs
if (Nt == 1) {
// texel acts as a bit-mask
// dest = incoming & ((texel << incoming.h)-texel)
RSB(AL, 0, dest.reg, texel.reg, reg_imm(texel.reg, LSL, incoming.h));
AND(AL, 0, dest.reg, dest.reg, incoming.reg);
dest.l = incoming.l;
dest.h = incoming.h;
dest.flags |= (incoming.flags & CLEAR_LO);
} else if (Ni == 1) {
MOV(AL, 0, dest.reg, reg_imm(incoming.reg, LSL, 31-incoming.h));
AND(AL, 0, dest.reg, texel.reg, reg_imm(dest.reg, ASR, 31));
dest.l = 0;
dest.h = Nt;
} else {
int inReg = incoming.reg;
int shift = incoming.l;
if ((Nt + Ni) > 32) {
// we would overflow, so reduce the precision of Ni to 8 bits
// (note: Nt is never more than 10 bits; that maximum occurs with
// 565 textures and GGL_LINEAR)
shift += Ni-8;
Ni = 8;
}
// modulate by the component with the lowest precision
if (Nt >= Ni) {
if (shift) {
// XXX: we should be able to avoid this shift
// when shift==16 && Nt<16 && Ni<16, in which case
// we could use SMULBT below.
MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
inReg = dest.reg;
shift = 0;
}
// operation: (Cf*Ct)/((1<<Ni)-1)
// approximated with: Cf*(Ct + Ct>>(Ni-1))>>Ni
// this operation doesn't change texel's size
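// e.g. with Ni=8 a full-scale value expands to 255 + (255>>7) = 256,
// so modulating by 1.0 reproduces the other factor exactly; mid-range
// values come out within about one LSB of the exact divide by 255.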
ADD(AL, 0, dest.reg, inReg, reg_imm(inReg, LSR, Ni-1));
if (Nt<16 && Ni<16) SMULBB(AL, dest.reg, texel.reg, dest.reg);
else MUL(AL, 0, dest.reg, texel.reg, dest.reg);
dest.l = Ni;
dest.h = Nt + Ni;
} else {
if (shift && (shift != 16)) {
// if shift==16, we can use 16-bits mul instructions later
MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
inReg = dest.reg;
shift = 0;
}
// operation: (Cf*Ct)/((1<<Nt)-1)
// approximated with: Ct*(Cf + Cf>>(Nt-1))>>Nt
// this operation doesn't change incoming's size
Scratch scratches(registerFile());
int t = (texel.flags & CORRUPTIBLE) ? texel.reg : dest.reg;
if (t == inReg)
t = scratches.obtain();
ADD(AL, 0, t, texel.reg, reg_imm(texel.reg, LSR, Nt-1));
if (Nt<16 && Ni<16) {
if (shift==16) SMULBT(AL, dest.reg, t, inReg);
else SMULBB(AL, dest.reg, t, inReg);
} else MUL(AL, 0, dest.reg, t, inReg);
dest.l = Nt;
dest.h = Nt + Ni;
}
// low bits are not valid
dest.flags |= CLEAR_LO;
// no need to keep more than 8 bits/component
if (dest.size() > 8)
dest.l = dest.h-8;
}
}
void GGLAssembler::decal(
component_t& dest,
const component_t& incoming,
const pixel_t& incomingTexel, int component)
{
// RGBA:
// Cv = Cf*(1 - At) + Ct*At = Cf + (Ct - Cf)*At
// Av = Af
Scratch locals(registerFile());
integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
extract(texel, incomingTexel, component);
extract(factor, incomingTexel, GGLFormat::ALPHA);
// no need to keep more than 8 bits for decal
int Ni = incoming.size();
int shift = incoming.l;
if (Ni > 8) {
shift += Ni-8;
Ni = 8;
}
integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
if (shift) {
MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
incomingNorm.reg = dest.reg;
incomingNorm.flags |= CORRUPTIBLE;
}
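// Expand the alpha factor so that full alpha maps to exactly
// 1<<factor.s (i.e. 1.0) before blending; blend() below uses the
// same expansion for its factor.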
ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
build_blendOneMinusFF(dest, factor, incomingNorm, texel);
}
void GGLAssembler::blend(
component_t& dest,
const component_t& incoming,
const pixel_t& incomingTexel, int component, int tmu)
{
// RGBA:
// Cv = (1 - Ct)*Cf + Ct*Cc = Cf + (Cc - Cf)*Ct
// Av = At*Af
if (component == GGLFormat::ALPHA) {
modulate(dest, incoming, incomingTexel, component);
return;
}
Scratch locals(registerFile());
integer_t color(locals.obtain(), 8, CORRUPTIBLE);
integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
LDRB(AL, color.reg, mBuilderContext.Rctx,
immed12_pre(GGL_OFFSETOF(state.texture[tmu].env_color[component])));
extract(factor, incomingTexel, component);
// no need to keep more than 8 bits for blend
int Ni = incoming.size();
int shift = incoming.l;
if (Ni > 8) {
shift += Ni-8;
Ni = 8;
}
integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
if (shift) {
MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
incomingNorm.reg = dest.reg;
incomingNorm.flags |= CORRUPTIBLE;
}
ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
build_blendOneMinusFF(dest, factor, incomingNorm, color);
}
void GGLAssembler::add(
component_t& dest,
const component_t& incoming,
const pixel_t& incomingTexel, int component)
{
// RGBA:
// Cv = Cf + Ct;
Scratch locals(registerFile());
component_t incomingTemp(incoming);
// use "dest" as a temporary for extracting the texel, unless "dest"
// overlaps "incoming".
integer_t texel(dest.reg, 32, CORRUPTIBLE);
if (dest.reg == incomingTemp.reg)
texel.reg = locals.obtain();
extract(texel, incomingTexel, component);
if (texel.s < incomingTemp.size()) {
expand(texel, texel, incomingTemp.size());
} else if (texel.s > incomingTemp.size()) {
if (incomingTemp.flags & CORRUPTIBLE) {
expand(incomingTemp, incomingTemp, texel.s);
} else {
incomingTemp.reg = locals.obtain();
expand(incomingTemp, incoming, texel.s);
}
}
if (incomingTemp.l) {
ADD(AL, 0, dest.reg, texel.reg,
reg_imm(incomingTemp.reg, LSR, incomingTemp.l));
} else {
ADD(AL, 0, dest.reg, texel.reg, incomingTemp.reg);
}
dest.l = 0;
dest.h = texel.size();
component_sat(dest);
}
// ----------------------------------------------------------------------------
}; // namespace android