Skip to content

Shader patching

Yours3lf edited this page Jun 18, 2020 · 1 revision

The Broadcom Videocore IV GPU requires several operations to be performed in shader code that other platforms may handle with fixed-function hardware.
These are:

  • writing stencil state setup register
  • writing depth value to depth buffer
  • performing blending in software
  • writing vertex parameter memory read and write setup registers

Since the project does not include a compiler, but rather works with an assembly-based shader setup, I decided not to patch shaders based on the state provided to the driver, but rather to let the developer have full control. This means that regardless of what

  • depth write state
  • blending state
  • stencil state
  • vertex attribute state

is passed to the driver, this will not be reflected in the final behaviour unless the developer adds it to the assembly shaders. This will enable developers to take full control and optimise shaders to the last cycle.

The following helper functions are provided:

  • blending is implemented in software on the Broadcom Videocore IV GPU. Various blending modes can be implemented in the following way:
/// Illustrative blending sequence (pseudo-code: the quoted strings are the QPU
/// assembly lines to place in the fragment shader for the selected state).
/// Register usage in this snippet:
///   r0 = source colour, r1 = destination colour, r2 = blend factor, r3 = scratch

/// r0 contains sRGBA (in BGRA form)
"sig_none ; r0 = or.always(a, a, uni, nop) ; nop = nop(r0, r0) ;"

/// load dRGBA to r1 (in BGRA form)
/// load tlb color dRGBA from r4
"sig_color_load ; nop = nop(r0, r0) ; nop = nop(r0, r0) ;"
"sig_none ; r1 = or.always(r4, r4) ; nop = nop(r0, r0) ;"

//if factors are not separate
if(bas->srcAlphaBlendFactor == bas->srcColorBlendFactor &&
   bas->dstAlphaBlendFactor == bas->dstColorBlendFactor)
{
	/// Build the source blend factor in r2
	switch(bas->srcAlphaBlendFactor)
	{
	case VK_BLEND_FACTOR_ZERO:
		"sig_small_imm ; r2 = or.always(b, b, nop, 0) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_ONE:
		"sig_small_imm ; r2 = or.always(b, b, nop, -1) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_SRC_COLOR:
		"sig_none ; r2 = or.always(r0, r0) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
		"sig_none ; r2 = not.always(r0, r0) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_DST_COLOR:
		"sig_none ; r2 = or.always(r1, r1) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
		"sig_none ; r2 = not.always(r1, r1) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_SRC_ALPHA:
		"sig_none ; r2.8888 = or.always.8d(r0, r0) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
		"sig_none ; r2.8888 = or.always.8d(r0, r0) ; nop = nop(r0, r0) ;"
		"sig_none ; r2 = not.always(r2, r2) ; nop = nop(r0, r0) ;"
		break; //without this break r2 would be overwritten by the DST_ALPHA case
	case VK_BLEND_FACTOR_DST_ALPHA:
		"sig_none ; r2.8888 = or.always.8d(r1, r1) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
		"sig_none ; r2.8888 = or.always.8d(r1, r1) ; nop = nop(r0, r0) ;"
		"sig_none ; r2 = not.always(r2, r2) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_CONSTANT_COLOR:
	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
		//NOTE(review): all four constant factors load 0xffffffff here;
		//presumably a placeholder for the actual blend constant - confirm
		"sig_load_imm ; r2 = load32.always(0xffffffff) ; nop = load32() ;"
		break;
	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
		"sig_none ; r2.8888 = or.always.8d(r0, r0) ; nop = nop(r0, r0) ;" //sAAAA
		"sig_none ; r3.8888 = or.always.8d(r1, r1) ; nop = nop(r0, r0) ;" //dAAAA
		"sig_none ; r3 = not.always(r3, r3) ; nop = nop(r0, r0) ;" //1-dAAAA
		"sig_none ; nop = nop(r0, r0) ; r2 = v8min.always(r2, r3) ;" //min(sAAAA, 1-dAAAA)
		"sig_load_imm ; r3 = load32.always(0xff000000) ; nop = load32() ;" //load alpha = 1
		"sig_small_imm ; r2 = or.always(r2, r3) ; nop = nop(r0, r0) ;" //set alpha to 1
		break;
	}

	/// Multiply sRGBA and source factor
	"sig_none ; nop = nop(r0, r0) ; r0 = v8muld.always(r0, r2) ;"

	///repeat for
	//bas->dstAlphaBlendFactor

	/// Multiply dRGBA and destination factor
	"sig_none ; nop = nop(r0, r0) ; r1 = v8muld.always(r1, r2) ;"
}
else //separate factors
{
	//
}

/// Combine the weighted source (r0) and destination (r1) and write the result
/// to the tile buffer colour output
switch(bas->alphaBlendOp)
{
case VK_BLEND_OP_ADD:
	"sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8adds.always(r0, r1) ;"
	break;
case VK_BLEND_OP_SUBTRACT:
	"sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8subs.always(r0, r1) ;"
	break;
case VK_BLEND_OP_REVERSE_SUBTRACT:
	"sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8subs.always(r1, r0) ;"
	break;
case VK_BLEND_OP_MIN:
	"sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8min.always(r0, r1) ;"
	break;
case VK_BLEND_OP_MAX:
	"sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8max.always(r0, r1) ;"
	break;
}
  • depth writes are executed using the following
"sig_none ; tlb_z = or.always(r0, r0) ; nop = nop(r0, r0) ;"
  • stencil setup state can be set in a shader using:
"sig_none ; tlb_stencil_setup = or.always(r0, r0) ; nop = nop(r0, r0) ;"
  • please note that a depth write must be executed for stencil and depth testing to occur

  • stencil setup state can be encoded using the following function.

/// Maps a stencil write mask to the 2-bit write mask select value
/// (0x1 -> 0, 0x3 -> 1, 0xf -> 2, 0xff -> 3).
/// Returns -1 when the mask is not one of these four values and therefore
/// has to be supplied through a separate write mask word.
static int32_t encodeStencilWriteMaskSelect(uint32_t writeMask)
{
	switch(writeMask)
	{
	case 0x1:
		return 0;
	case 0x3:
		return 1;
	case 0xf:
		return 2;
	case 0xff:
		return 3;
	default:
		return -1;
	}
}

/// Packs the compare mask, reference, compare op and stencil ops of one
/// stencil state into a setup word. mode goes into bits 30-31
/// (1 = front, 2 = back, 3 = front and back). Bits 28-29 (write mask select)
/// are left at 0 for the caller to fill in.
static uint32_t encodeStencilConfigWord(VkStencilOpState state, uint32_t mode)
{
	//everything is done on uint32_t: shifting the signed literals 2 or 3
	//left by 30 would overflow int, which is undefined behaviour
	return (uint32_t)(state.compareMask & 0xff)
			| (uint32_t)(state.reference & 0xff) << 8
			| ((uint32_t)getCompareOp(state.compareOp) & 0x7) << 16
			| ((uint32_t)getStencilOp(state.failOp) & 0x7) << 19
			| ((uint32_t)getStencilOp(state.passOp) & 0x7) << 22
			| ((uint32_t)getStencilOp(state.depthFailOp) & 0x7) << 25
			| (mode & 0x3) << 30;
}

/// Encodes the stencil state into the words to be written to the stencil
/// setup register (tlb_stencil_setup) from the shader.
///
/// values            output array, must have room for up to 3 words
/// numValues         receives the number of words written (1, 2 or 3)
/// front, back       stencil state for front and back facing primitives
/// stencilTestEnable if 0, both compare ops are forced to VK_COMPARE_OP_ALWAYS
///
/// Layout of the result:
///  - front == back, write mask is 0x1/0x3/0xf/0xff: 1 word (front and back)
///  - front == back, any other write mask: 2 words (config, write masks)
///  - front != back, both write masks are 0x1/0x3/0xf/0xff: 2 words (front, back)
///  - front != back, any other write mask: 3 words (front, back, write masks)
/// The write mask word carries the front mask in bits 0-7 and the back mask
/// in bits 8-15, with the mode bits (30-31) left at 0.
void encodeStencilValue(uint32_t *values, uint32_t* numValues, VkStencilOpState front, VkStencilOpState back, uint8_t stencilTestEnable)
{
	assert(values);
	assert(numValues);

	if(!stencilTestEnable)
	{
		front.compareOp = back.compareOp = VK_COMPARE_OP_ALWAYS;
	}

	int32_t frontSelect = encodeStencilWriteMaskSelect(front.writeMask);
	int32_t backSelect = encodeStencilWriteMaskSelect(back.writeMask);

	if(front.compareMask == back.compareMask &&
	   front.compareOp == back.compareOp &&
	   front.depthFailOp == back.depthFailOp &&
	   front.failOp == back.failOp &&
	   front.passOp == back.passOp &&
	   front.reference == back.reference &&
	   front.writeMask == back.writeMask
	   )
	{
		//identical state: a single config word covers both faces
		values[0] = encodeStencilConfigWord(front, 3); //front and back

		if(frontSelect >= 0)
		{
			values[0] |= (uint32_t)frontSelect << 28;
			*numValues = 1;
		}
		else
		{
			//write mask cannot be expressed by the 2-bit select
			values[1] = (uint32_t)(front.writeMask & 0xff)
					| (uint32_t)(front.writeMask & 0xff) << 8;
			*numValues = 2;
		}

		return;
	}

	values[0] = encodeStencilConfigWord(front, 1); //front
	values[1] = encodeStencilConfigWord(back, 2); //back

	if(frontSelect >= 0 && backSelect >= 0)
	{
		values[0] |= (uint32_t)frontSelect << 28;
		values[1] |= (uint32_t)backSelect << 28;
		*numValues = 2;
	}
	else
	{
		//at least one write mask needs the explicit write mask word
		values[2] = (uint32_t)(front.writeMask & 0xff)
				| (uint32_t)(back.writeMask & 0xff) << 8;
		*numValues = 3;
	}
}
  • Vertex attribute setup can be encoded using
/// Builds a vertex parameter memory (VPM) setup word from its fields.
/// Each argument is truncated to the width of its field before being placed:
///   bits 20-23  vectorComponentsToRead (4 bits)
///   bits 12-17  stride                 (6 bits)
///   bit  11     direction              (1 bit)
///   bit  10     isLaned                (1 bit)
///   bits 8-9    size                   (2 bits)
///   bits 0-7    address                (8 bits)
uint32_t encodeVPMSetup(uint8_t stride,
			uint8_t direction, //0 = vertical, 1 = horizontal
			uint8_t isLaned, //0 = packed, 1 = laned
			uint8_t size, //0 = 8bit, 1 = 16bit, 2 = 32bit
			uint8_t address, //see doc
			uint8_t vectorComponentsToRead //only meaningful for VPM read setup
			)
{
	const uint32_t numField = ((uint32_t)vectorComponentsToRead & 0xf) << 20;
	const uint32_t strideField = ((uint32_t)stride & 0x3f) << 12;
	const uint32_t directionField = ((uint32_t)direction & 0x1) << 11;
	const uint32_t lanedField = ((uint32_t)isLaned & 0x1) << 10;
	const uint32_t sizeField = ((uint32_t)size & 0x3) << 8;
	const uint32_t addressField = (uint32_t)address & 0xff;

	return numField
			| strideField
			| directionField
			| lanedField
			| sizeField
			| addressField;
}
Clone this wiki locally