-
Notifications
You must be signed in to change notification settings - Fork 70
Shader patching
The Broadcom VideoCore IV GPU requires several operations to be performed in shader code that other platforms typically handle with fixed-function hardware.
These are:
- writing stencil state setup register
- writing depth value to depth buffer
- performing blending in software
- writing vertex parameter memory read and write setup registers
Since the project does not include a compiler, but rather works with an assembly based shader setup, I decided not to patch shaders based on the state provided to the driver, but rather let the developer have full control. This means that regardless of what
- depth write state
- blending state
- stencil state
- vertex attribute state
is passed to the driver, this will not be reflected in the final behaviour unless the developer adds it to the assembly shaders. This will enable developers to take full control and optimise shaders to the last cycle.
The following helper functions are provided:
- blending is implemented in software on the Broadcom Videocore IV GPU. Various blending modes can be implemented in the following way:
// Illustrative snippet: selects the QPU assembly lines that implement
// software blending for a given blend state `bas`.
// Register usage, as shown by the assembly below:
//   r0 - source color (sRGBA), later multiplied by the source factor
//   r1 - destination color (dRGBA) loaded from the tile buffer via r4,
//        later multiplied by the destination factor
//   r2 - blend factor selected by the switch
//   r3 - scratch register (used by SRC_ALPHA_SATURATE only)

/// r0 contains sRGBA (in BGRA form)
"sig_none ; r0 = or.always(a, a, uni, nop) ; nop = nop(r0, r0) ;"
/// load dRGBA to r1 (in BGRA form)
/// load tlb color dRGBA from r4
"sig_color_load ; nop = nop(r0, r0) ; nop = nop(r0, r0) ;"
"sig_none ; r1 = or.always(r4, r4) ; nop = nop(r0, r0) ;"
//if factors are not separate
if(bas->srcAlphaBlendFactor == bas->srcColorBlendFactor &&
   bas->dstAlphaBlendFactor == bas->dstColorBlendFactor)
{
    // Load the blend factor into r2.
    switch(bas->srcAlphaBlendFactor)
    {
    case VK_BLEND_FACTOR_ZERO:
        "sig_small_imm ; r2 = or.always(b, b, nop, 0) ; nop = nop(r0, r0) ;"
        break;
    case VK_BLEND_FACTOR_ONE:
        "sig_small_imm ; r2 = or.always(b, b, nop, -1) ; nop = nop(r0, r0) ;"
        break;
    case VK_BLEND_FACTOR_SRC_COLOR:
        "sig_none ; r2 = or.always(r0, r0) ; nop = nop(r0, r0) ;"
        break;
    case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
        "sig_none ; r2 = not.always(r0, r0) ; nop = nop(r0, r0) ;"
        break;
    case VK_BLEND_FACTOR_DST_COLOR:
        "sig_none ; r2 = or.always(r1, r1) ; nop = nop(r0, r0) ;"
        break;
    case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
        "sig_none ; r2 = not.always(r1, r1) ; nop = nop(r0, r0) ;"
        break;
    case VK_BLEND_FACTOR_SRC_ALPHA:
        "sig_none ; r2.8888 = or.always.8d(r0, r0) ; nop = nop(r0, r0) ;"
        break;
    case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
        "sig_none ; r2.8888 = or.always.8d(r0, r0) ; nop = nop(r0, r0) ;"
        "sig_none ; r2 = not.always(r2, r2) ; nop = nop(r0, r0) ;"
        // must not fall through: the DST_ALPHA case would overwrite r2
        break;
    case VK_BLEND_FACTOR_DST_ALPHA:
        "sig_none ; r2.8888 = or.always.8d(r1, r1) ; nop = nop(r0, r0) ;"
        break;
    case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
        "sig_none ; r2.8888 = or.always.8d(r1, r1) ; nop = nop(r0, r0) ;"
        "sig_none ; r2 = not.always(r2, r2) ; nop = nop(r0, r0) ;"
        break;
    case VK_BLEND_FACTOR_CONSTANT_COLOR:
    case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
    case VK_BLEND_FACTOR_CONSTANT_ALPHA:
    case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
        // all constant-based factors load the same all-ones immediate here
        "sig_load_imm ; r2 = load32.always(0xffffffff) ; nop = load32() ;"
        break;
    case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
        "sig_none ; r2.8888 = or.always.8d(r0, r0) ; nop = nop(r0, r0) ;" //sAAAA
        "sig_none ; r3.8888 = or.always.8d(r1, r1) ; nop = nop(r0, r0) ;" //dAAAA
        "sig_none ; r3 = not.always(r3, r3) ; nop = nop(r0, r0) ;" //1-dAAAA
        "sig_none ; nop = nop(r0, r0) ; r2 = v8min.always(r2, r3) ;" //min(sAAAA, 1-dAAAA)
        "sig_load_imm ; r3 = load32.always(0xff000000) ; nop = load32() ;" //load alpha = 1
        "sig_small_imm ; r2 = or.always(r2, r3) ; nop = nop(r0, r0) ;" //set alpha to 1
        break;
    }
    /// Multiply sRGBA and source factor
    "sig_none ; nop = nop(r0, r0) ; r0 = v8muld.always(r0, r2) ;"
    ///repeat for
    //bas->dstAlphaBlendFactor
    /// Multiply dRGBA and destination factor
    "sig_none ; nop = nop(r0, r0) ; r1 = v8muld.always(r1, r2) ;"
}
else //separate factors
{
    //
}
// Combine the weighted source (r0) and destination (r1) and write the
// result to the tile buffer.
switch(bas->alphaBlendOp)
{
case VK_BLEND_OP_ADD:
    "sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8adds.always(r0, r1) ;"
    break;
case VK_BLEND_OP_SUBTRACT:
    "sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8subs.always(r0, r1) ;"
    break;
case VK_BLEND_OP_REVERSE_SUBTRACT:
    "sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8subs.always(r1, r0) ;"
    break;
case VK_BLEND_OP_MIN:
    "sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8min.always(r0, r1) ;"
    break;
case VK_BLEND_OP_MAX:
    "sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8max.always(r0, r1) ;"
    break;
}
- depth writes are executed using the following
"sig_none ; tlb_z = or.always(r0, r0) ; nop = nop(r0, r0) ;"
- stencil setup state can be set in a shader using:
"sig_none ; tlb_stencil_setup = or.always(r0, r0) ; nop = nop(r0, r0) ;"
-
Please note that a depth write must be executed for stencil and depth testing to occur.
-
stencil setup state can be encoded using the following function.
/*
 * Maps a stencil write mask to the 2-bit write-mask mode stored in
 * bits 28-29 of a stencil config word.
 * Returns -1 when the mask is not one of the four directly encodable
 * values (0x1, 0x3, 0xf, 0xff).
 */
static int stencilWriteMaskMode(uint32_t writeMask)
{
    switch(writeMask)
    {
    case 0x1:
        return 0;
    case 0x3:
        return 1;
    case 0xf:
        return 2;
    case 0xff:
        return 3;
    default:
        return -1;
    }
}

/*
 * Packs one stencil state into a config word (write-mask mode bits left 0):
 *   bits  0-7   compare mask
 *   bits  8-15  reference value
 *   bits 16-18  compare op
 *   bits 19-21  fail op
 *   bits 22-24  pass op
 *   bits 25-27  depth fail op
 *   bits 30-31  face select (1 front, 2 back, 3 front and back)
 */
static uint32_t stencilConfigWord(VkStencilOpState state, uint32_t faceSelect)
{
    /* All shifts are done on unsigned operands: shifting a signed int
     * constant into bit 31 (e.g. 3 << 30) is undefined behaviour in C. */
    return (state.compareMask & 0xffu)
         | (state.reference & 0xffu) << 8
         | ((uint32_t)getCompareOp(state.compareOp) & 0x7u) << 16
         | ((uint32_t)getStencilOp(state.failOp) & 0x7u) << 19
         | ((uint32_t)getStencilOp(state.passOp) & 0x7u) << 22
         | ((uint32_t)getStencilOp(state.depthFailOp) & 0x7u) << 25
         | (faceSelect & 0x3u) << 30;
}

/*
 * Encodes the front/back stencil state into the values to be written to
 * the stencil setup register from a shader.
 *
 * values            output array; must have room for up to 3 entries
 * numValues         receives the number of entries written (1, 2 or 3)
 * front, back       stencil state for front and back facing primitives
 * stencilTestEnable when 0, both compare ops are forced to
 *                   VK_COMPARE_OP_ALWAYS
 *
 * If front and back state are identical a single shared config word is
 * emitted; otherwise one word per face. When a write mask cannot be
 * expressed as a 2-bit mode inside the config word, an extra word holding
 * the explicit masks (front in bits 0-7, back in bits 8-15) is appended.
 */
void encodeStencilValue(uint32_t *values, uint32_t* numValues, VkStencilOpState front, VkStencilOpState back, uint8_t stencilTestEnable)
{
    assert(values);
    assert(numValues);

    if(!stencilTestEnable)
    {
        front.compareOp = back.compareOp = VK_COMPARE_OP_ALWAYS;
    }

    const int frontMode = stencilWriteMaskMode(front.writeMask);
    const int backMode = stencilWriteMaskMode(back.writeMask);

    if(front.compareMask == back.compareMask &&
       front.compareOp == back.compareOp &&
       front.depthFailOp == back.depthFailOp &&
       front.failOp == back.failOp &&
       front.passOp == back.passOp &&
       front.reference == back.reference &&
       front.writeMask == back.writeMask)
    {
        values[0] = stencilConfigWord(front, 3u); //front and back

        if(frontMode >= 0)
        {
            values[0] |= (uint32_t)frontMode << 28;
            *numValues = 1;
        }
        else
        {
            //same explicit mask for both faces
            values[1] = (front.writeMask & 0xffu)
                      | (front.writeMask & 0xffu) << 8;
            *numValues = 2;
        }
        return;
    }

    values[0] = stencilConfigWord(front, 1u); //front
    values[1] = stencilConfigWord(back, 2u);  //back

    if(frontMode >= 0 && backMode >= 0)
    {
        values[0] |= (uint32_t)frontMode << 28;
        values[1] |= (uint32_t)backMode << 28;
        *numValues = 2;
    }
    else
    {
        values[2] = (front.writeMask & 0xffu)
                  | (back.writeMask & 0xffu) << 8;
        *numValues = 3;
    }
}
- Vertex attribute setup can be encoded using
/*
 * Packs the VPM (vertex parameter memory) setup fields into one 32-bit word.
 * Each argument is masked to its field width before being placed:
 *   bits 20-23  vectorComponentsToRead
 *   bits 12-17  stride
 *   bit  11     direction
 *   bit  10     isLaned
 *   bits  8-9   size
 *   bits  0-7   address
 * Returns the combined setup word.
 */
uint32_t encodeVPMSetup(uint8_t stride,
                        uint8_t direction, //0 vertical, 1 horizontal
                        uint8_t isLaned, //0 packed, 1 laned
                        uint8_t size, //0 8bit, 1 16bit, 2 32bit
                        uint8_t address, //see doc
                        uint8_t vectorComponentsToRead //only used for VPM read setup
                        )
{
    const uint32_t componentsField = ((uint32_t)vectorComponentsToRead & 0xf) << 20;
    const uint32_t strideField     = ((uint32_t)stride & 0x3f) << 12;
    const uint32_t directionField  = ((uint32_t)direction & 0x1) << 11;
    const uint32_t lanedField      = ((uint32_t)isLaned & 0x1) << 10;
    const uint32_t sizeField       = ((uint32_t)size & 0x3) << 8;
    const uint32_t addressField    = (uint32_t)address & 0xff;

    return componentsField
         | strideField
         | directionField
         | lanedField
         | sizeField
         | addressField;
}