Shader patching

The Broadcom Videocore IV GPU needs a couple of operations to happen in shader code that might have fixed function hardware on other platforms.
These are:

writing stencil state setup register
writing depth value to depth buffer
performing blending in software
writing vertex parameter memory read and write setup registers

Since the project does not include a compiler, but rather works with an assembly based shader setup, I decided not to patch shaders based on the state provided to the driver, but rather let the developer have full control. This means that regardless of what

depth write state
blending state
stencil state
vertex attribute state

is passed to the driver, this will not be reflected in the final behaviour unless the developer adds it to the assembly shaders. This will enable developers to take full control and optimise shaders to the last cycle.

The following helper functions are provided:

Blending is implemented in software on the Broadcom Videocore IV GPU. Various blending modes can be implemented in the following way:

/// r0 contains sRGBA (in BGRA form)
"sig_none ; r0 = or.always(a, a, uni, nop) ; nop = nop(r0, r0) ;"

/// load dRGBA to r1 (in BGRA form)
/// load TLB color dRGBA from r4
"sig_color_load ; nop = nop(r0, r0) ; nop = nop(r0, r0) ;"
"sig_none ; r1 = or.always(r4, r4) ; nop = nop(r0, r0) ;"

/// Select the blend factor into r2 and multiply it with the colour.
/// On entry r0 holds sRGBA and r1 holds dRGBA (both in BGRA form).
/// r2 (and r3 as scratch) are overwritten.
//if factors are not separate
if(bas->srcAlphaBlendFactor == bas->srcColorBlendFactor &&
   bas->dstAlphaBlendFactor == bas->dstColorBlendFactor)
{
	switch(bas->srcAlphaBlendFactor)
	{
	case VK_BLEND_FACTOR_ZERO:
		"sig_small_imm ; r2 = or.always(b, b, nop, 0) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_ONE:
		"sig_small_imm ; r2 = or.always(b, b, nop, -1) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_SRC_COLOR:
		"sig_none ; r2 = or.always(r0, r0) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
		"sig_none ; r2 = not.always(r0, r0) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_DST_COLOR:
		"sig_none ; r2 = or.always(r1, r1) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
		"sig_none ; r2 = not.always(r1, r1) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_SRC_ALPHA:
		"sig_none ; r2.8888 = or.always.8d(r0, r0) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
		"sig_none ; r2.8888 = or.always.8d(r0, r0) ; nop = nop(r0, r0) ;"
		"sig_none ; r2 = not.always(r2, r2) ; nop = nop(r0, r0) ;"
		//without this break the DST_ALPHA case below would overwrite r2
		break;
	case VK_BLEND_FACTOR_DST_ALPHA:
		"sig_none ; r2.8888 = or.always.8d(r1, r1) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
		"sig_none ; r2.8888 = or.always.8d(r1, r1) ; nop = nop(r0, r0) ;"
		"sig_none ; r2 = not.always(r2, r2) ; nop = nop(r0, r0) ;"
		break;
	//all constant factors load 0xffffffff into r2 in this example;
	//no blend constant is read here
	case VK_BLEND_FACTOR_CONSTANT_COLOR:
	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
		"sig_load_imm ; r2 = load32.always(0xffffffff) ; nop = load32() ;"
		break;
	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
		"sig_none ; r2.8888 = or.always.8d(r0, r0) ; nop = nop(r0, r0) ;" //sAAAA
		"sig_none ; r3.8888 = or.always.8d(r1, r1) ; nop = nop(r0, r0) ;" //dAAAA
		"sig_none ; r3 = not.always(r3, r3) ; nop = nop(r0, r0) ;" //1-dAAAA
		"sig_none ; nop = nop(r0, r0) ; r2 = v8min.always(r2, r3) ;" //min(sAAAA, 1-dAAAA)
		"sig_load_imm ; r3 = load32.always(0xff000000) ; nop = load32() ;" //load alpha = 1
		"sig_small_imm ; r2 = or.always(r2, r3) ; nop = nop(r0, r0) ;" //set alpha to 1
		break;
	}

	/// Multiply sRGBA and source factor
	"sig_none ; nop = nop(r0, r0) ; r0 = v8muld.always(r0, r2) ;"

	///repeat for
	//bas->dstAlphaBlendFactor

	/// Multiply dRGBA and destination factor
	"sig_none ; nop = nop(r0, r0) ; r1 = v8muld.always(r1, r2) ;"
}
else //separate factors
{
	//
}

/// Combine the weighted source (r0) and the weighted destination (r1) with
/// the selected blend operation and write the result to tlb_color_all.
/// NOTE(review): only bas->alphaBlendOp is consulted here; presumably the
/// colour blend op is expected to be the same -- confirm.
switch(bas->alphaBlendOp)
{
case VK_BLEND_OP_ADD:
	"sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8adds.always(r0, r1) ;"
	break;
case VK_BLEND_OP_SUBTRACT:
	"sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8subs.always(r0, r1) ;"
	break;
case VK_BLEND_OP_REVERSE_SUBTRACT:
	//operands swapped: destination minus source
	"sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8subs.always(r1, r0) ;"
	break;
case VK_BLEND_OP_MIN:
	"sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8min.always(r0, r1) ;"
	break;
case VK_BLEND_OP_MAX:
	"sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8max.always(r0, r1) ;"
	break;
}

Depth writes are executed using the following:

"sig_none ; tlb_z = or.always(r0, r0) ; nop = nop(r0, r0) ;"

stencil setup state can be set in a shader using:

"sig_none ; tlb_stencil_setup = or.always(r0, r0) ; nop = nop(r0, r0) ;"

Please note that a depth write must be executed for stencil and depth testing to occur.
Stencil setup state can be encoded using the following function:

/*
 * Maps a stencil write mask to the 2-bit selector stored at bits 28-29 of a
 * stencil configuration word.
 * Returns 0, 1, 2 or 3 for the masks 0x1, 0x3, 0xf and 0xff respectively,
 * or -1 when the mask has no compact encoding and needs a separate word.
 */
static int stencilWriteMaskSelect(uint32_t writeMask)
{
	switch(writeMask)
	{
	case 0x1:
		return 0;
	case 0x3:
		return 1;
	case 0xf:
		return 2;
	case 0xff:
		return 3;
	default:
		return -1;
	}
}

/*
 * Encodes the front/back stencil state into the words that have to be written
 * to tlb_stencil_setup.
 *
 * values            output array; must have room for up to 3 words
 * numValues         receives the number of words written (1, 2 or 3)
 * front, back       stencil state for front and back facing primitives
 * stencilTestEnable when 0 the compare op of both faces is forced to
 *                   VK_COMPARE_OP_ALWAYS
 *
 * Layout of a configuration word as built below:
 *   bits  0-7   compare mask
 *   bits  8-15  reference value
 *   bits 16-18  compare op
 *   bits 19-21  fail op
 *   bits 22-24  pass op
 *   bits 25-27  depth fail op
 *   bits 28-29  write mask selector (see stencilWriteMaskSelect)
 *   bits 30-31  face select: 1 = front, 2 = back, 3 = front and back
 * If a write mask has no 2-bit selector, an additional word is emitted with
 * the front write mask in bits 0-7 and the back write mask in bits 8-15.
 */
void encodeStencilValue(uint32_t *values, uint32_t* numValues, VkStencilOpState front, VkStencilOpState back, uint8_t stencilTestEnable)
{
	assert(values);
	assert(numValues);

	if(!stencilTestEnable)
	{
		front.compareOp = back.compareOp = VK_COMPARE_OP_ALWAYS;
	}

	int frontSelect = stencilWriteMaskSelect(front.writeMask);
	int backSelect = stencilWriteMaskSelect(back.writeMask);

	if(front.compareMask == back.compareMask &&
	   front.compareOp == back.compareOp &&
	   front.depthFailOp == back.depthFailOp &&
	   front.failOp == back.failOp &&
	   front.passOp == back.passOp &&
	   front.reference == back.reference &&
	   front.writeMask == back.writeMask
	   )
	{
		//identical state for both faces: one configuration word is enough
		*numValues = 1;

		//shifts are done on unsigned operands: 3 << 30 would overflow a signed int
		values[0] = 0
				| (front.compareMask & 0xff)
				| (front.reference & 0xff) << 0x8
				| ((uint32_t)getCompareOp(front.compareOp) & 0x7u) << 16
				| ((uint32_t)getStencilOp(front.failOp) & 0x7u) << 19
				| ((uint32_t)getStencilOp(front.passOp) & 0x7u) << 22
				| ((uint32_t)getStencilOp(front.depthFailOp) & 0x7u) << 25
				| 3u << 30; //front and back

		if(frontSelect >= 0)
		{
			values[0] |= (uint32_t)frontSelect << 28;
		}
		else
		{
			//arbitrary write mask: same mask for both faces in an extra word
			values[1] = 0
					| (front.writeMask & 0xff)
					| (front.writeMask & 0xff) << 8;
			*numValues = 2;
		}
	}
	else
	{
		*numValues = 2;

		values[0] = 0
				| (front.compareMask & 0xff)
				| (front.reference & 0xff) << 0x8
				| ((uint32_t)getCompareOp(front.compareOp) & 0x7u) << 16
				| ((uint32_t)getStencilOp(front.failOp) & 0x7u) << 19
				| ((uint32_t)getStencilOp(front.passOp) & 0x7u) << 22
				| ((uint32_t)getStencilOp(front.depthFailOp) & 0x7u) << 25
				| 1u << 30; //front

		values[1] = 0
				| (back.compareMask & 0xff)
				| (back.reference & 0xff) << 0x8
				| ((uint32_t)getCompareOp(back.compareOp) & 0x7u) << 16
				| ((uint32_t)getStencilOp(back.failOp) & 0x7u) << 19
				| ((uint32_t)getStencilOp(back.passOp) & 0x7u) << 22
				| ((uint32_t)getStencilOp(back.depthFailOp) & 0x7u) << 25
				| 2u << 30; //back

		if(frontSelect >= 0 && backSelect >= 0)
		{
			//both masks have a compact encoding, store them in place
			values[0] |= (uint32_t)frontSelect << 28;
			values[1] |= (uint32_t)backSelect << 28;
		}
		else
		{
			//at least one arbitrary mask: emit both masks in a third word
			values[2] = 0
					| (front.writeMask & 0xff)
					| (back.writeMask & 0xff) << 8;
			*numValues = 3;
		}
	}
}

Vertex attribute setup can be encoded using:

/*
 * Packs the given fields into a single 32-bit VPM setup word.
 * Every argument is truncated to the width of its field:
 *   bits 20-23  vectorComponentsToRead
 *   bits 12-17  stride
 *   bit  11     direction
 *   bit  10     isLaned
 *   bits  8-9   size
 *   bits  0-7   address
 * Returns the assembled word.
 */
uint32_t encodeVPMSetup(uint8_t stride,
			uint8_t direction, //0 vertical, 1 horizontal
			uint8_t isLaned, //0 packed, 1 laned
			uint8_t size, //0 8bit, 1 16bit, 2 32bit
			uint8_t address, //see doc
			uint8_t vectorComponentsToRead //only used for VPM read setup
			)
{
	const uint32_t componentsField = ((uint32_t)vectorComponentsToRead & 0xf) << 20;
	const uint32_t strideField = ((uint32_t)stride & 0x3f) << 12;
	const uint32_t directionField = ((uint32_t)direction & 0x1) << 11;
	const uint32_t lanedField = ((uint32_t)isLaned & 0x1) << 10;
	const uint32_t sizeField = ((uint32_t)size & 0x3) << 8;
	const uint32_t addressField = (uint32_t)address & 0xff;

	return componentsField
	     | strideField
	     | directionField
	     | lanedField
	     | sizeField
	     | addressField;
}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Shader patching

Clone this wiki locally