Skip to content
Yours3lf edited this page Jun 18, 2020 · 13 revisions

Welcome to the rpi-vk-driver wiki!

Driver quirks

  • can only submit 7920 draw calls per renderpass currently due to a hardware limitation
  • can only submit 65535 vertices per draw call
  • unsupported Vulkan® functionality is denoted by the UNSUPPORTED() macro which prints an error message to stderr
  • if VkSamplerCreateInfo's mipLodBias is greater than 0.0, it signals to disable automatic LOD calculation within shaders. Then it's up to the developer to supply an LOD level.
  • semaphores currently don't function as expected, see sync.c

Driver debugging

  • the driver is full of asserts that come in handy when trying to debug a driver bug or hardware limitation (they only work in debug mode though)
  • the driver also has a lot of TODOs, so if something doesn't quite behave the way it should according to the spec, it's useful to have a look at the source code
  • kernel side error messages can be retrieved by adding drm.debug=0xf to /boot/cmdline.txt
  • Vulkan functions can be exposed (so one doesn't need the Vulkan-Loader) by setting the EXPOSE_DRIVER define to 1 during compilation
  • the driver's CPU overhead can be profiled by setting the RPI_PROFILE define to 1 during compilation
  • command lists submitted to the kernel side can be printed by setting the RPI_PRINT_COMMAND_LISTS define to 1 during compilation.
  • information about the compiled assembly shaders (including disassembly) can be dumped by setting the RPI_DUMP_SHADER_INFO define to 1 during compilation
  • hardware information can be printed by setting RPI_PRINT_HARDWARE_INFO to 1 during compilation

Shader patching

The Broadcom Videocore IV GPU needs a couple of operations to happen in shader code that might have fixed function hardware on other platforms.
These are:

  • writing stencil state setup register
  • writing depth value to depth buffer
  • performing blending in software
  • writing vertex parameter memory read and write setup registers

Since the project does not include a compiler, but instead works with an assembly-based shader setup, I decided not to patch shaders based on the state provided to the driver, and to let the developer have full control instead. This means that regardless of what

  • depth write state
  • blending state
  • stencil state
  • vertex attribute state

is passed to the driver, this will not be reflected in the final behaviour unless the developer adds it to the assembly shaders. This will enable developers to take full control and optimise shaders to the last cycle.

The following helper functions are provided:

  • blending is implemented in software on the Broadcom Videocore IV GPU. Various blending modes can be implemented in the following way:
/// Software blending reference. The quoted strings are the QPU assembly
/// instructions to emit; the C-like control flow around them only shows
/// which instructions to pick for a given blend state (`bas`).
/// Register usage in this example: r0 = source colour, r1 = destination
/// colour, r2 = blend factor, r3 = scratch.

/// r0 contains sRGBA (in BGRA form)
"sig_none ; r0 = or.always(a, a, uni, nop) ; nop = nop(r0, r0) ;"

/// load dRGBA to r1 (in BGRA form)
/// load tlb color dRGBA from r4
"sig_color_load ; nop = nop(r0, r0) ; nop = nop(r0, r0) ;"
"sig_none ; r1 = or.always(r4, r4) ; nop = nop(r0, r0) ;"

//if factors are not separate
if(bas->srcAlphaBlendFactor == bas->srcColorBlendFactor &&
   bas->dstAlphaBlendFactor == bas->dstColorBlendFactor)
{
	/// Load the source blend factor into r2
	switch(bas->srcAlphaBlendFactor)
	{
	case VK_BLEND_FACTOR_ZERO:
		"sig_small_imm ; r2 = or.always(b, b, nop, 0) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_ONE:
		"sig_small_imm ; r2 = or.always(b, b, nop, -1) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_SRC_COLOR:
		"sig_none ; r2 = or.always(r0, r0) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
		"sig_none ; r2 = not.always(r0, r0) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_DST_COLOR:
		"sig_none ; r2 = or.always(r1, r1) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
		"sig_none ; r2 = not.always(r1, r1) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_SRC_ALPHA:
		"sig_none ; r2.8888 = or.always.8d(r0, r0) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
		"sig_none ; r2.8888 = or.always.8d(r0, r0) ; nop = nop(r0, r0) ;"
		"sig_none ; r2 = not.always(r2, r2) ; nop = nop(r0, r0) ;"
		//without this break the DST_ALPHA case below would overwrite r2
		break;
	case VK_BLEND_FACTOR_DST_ALPHA:
		"sig_none ; r2.8888 = or.always.8d(r1, r1) ; nop = nop(r0, r0) ;"
		break;
	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
		"sig_none ; r2.8888 = or.always.8d(r1, r1) ; nop = nop(r0, r0) ;"
		"sig_none ; r2 = not.always(r2, r2) ; nop = nop(r0, r0) ;"
		break;
	//NOTE(review): all four constant factors (including the ONE_MINUS
	//variants) load the same all-ones value; this looks like a placeholder
	//for the real blend constant - confirm before relying on it
	case VK_BLEND_FACTOR_CONSTANT_COLOR:
	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
		"sig_load_imm ; r2 = load32.always(0xffffffff) ; nop = load32() ;"
		break;
	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
		"sig_none ; r2.8888 = or.always.8d(r0, r0) ; nop = nop(r0, r0) ;" //sAAAA
		"sig_none ; r3.8888 = or.always.8d(r1, r1) ; nop = nop(r0, r0) ;" //dAAAA
		"sig_none ; r3 = not.always(r3, r3) ; nop = nop(r0, r0) ;" //1-dAAAA
		"sig_none ; nop = nop(r0, r0) ; r2 = v8min.always(r2, r3) ;" //min(sAAAA, 1-dAAAA)
		"sig_load_imm ; r3 = load32.always(0xff000000) ; nop = load32() ;" //load alpha = 1
		"sig_small_imm ; r2 = or.always(r2, r3) ; nop = nop(r0, r0) ;" //set alpha to 1
		break;
	}

	/// Multiply sRGBA and source factor
	"sig_none ; nop = nop(r0, r0) ; r0 = v8muld.always(r0, r2) ;"

	///repeat the factor selection above for
	//bas->dstAlphaBlendFactor
	///so that r2 holds the destination factor before the multiply below

	/// Multiply dRGBA and destination factor
	"sig_none ; nop = nop(r0, r0) ; r1 = v8muld.always(r1, r2) ;"
}
else //separate factors
{
	//not covered by this example
}

/// Combine the weighted source (r0) and destination (r1) and write the
/// result to the tile buffer
switch(bas->alphaBlendOp)
{
case VK_BLEND_OP_ADD:
	"sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8adds.always(r0, r1) ;"
	break;
case VK_BLEND_OP_SUBTRACT:
	"sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8subs.always(r0, r1) ;"
	break;
case VK_BLEND_OP_REVERSE_SUBTRACT:
	"sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8subs.always(r1, r0) ;"
	break;
case VK_BLEND_OP_MIN:
	"sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8min.always(r0, r1) ;"
	break;
case VK_BLEND_OP_MAX:
	"sig_none ; nop = nop(r0, r0) ; tlb_color_all = v8max.always(r0, r1) ;"
	break;
}
  • depth writes are executed using the following
"sig_none ; tlb_z = or.always(r0, r0) ; nop = nop(r0, r0) ;"
  • stencil setup state can be set in a shader using:
"sig_none ; tlb_stencil_setup = or.always(r0, r0) ; nop = nop(r0, r0) ;"
  • please note that a depth write must be executed for stencil and depth testing to occur

  • stencil setup state can be encoded using the following function.

/**
 * Encodes Vulkan stencil state into the 32-bit word(s) that a shader writes
 * to the stencil setup register.
 *
 * Layout of a face configuration word as built here:
 *   bits  0-7   compareMask
 *   bits  8-15  reference
 *   bits 16-18  getCompareOp(compareOp)
 *   bits 19-21  getStencilOp(failOp)
 *   bits 22-24  getStencilOp(passOp)
 *   bits 25-27  getStencilOp(depthFailOp)
 *   bits 28-29  write mask mode (0: 0x1, 1: 0x3, 2: 0xf, 3: 0xff)
 *   bits 30-31  face selector (1: front, 2: back, 3: front and back)
 *
 * A write mask that is not one of 0x1/0x3/0xf/0xff cannot be expressed in
 * bits 28-29; in that case one extra word is appended that carries the front
 * mask in bits 0-7 and the back mask in bits 8-15 (selector bits left clear).
 *
 * @param values            output array; must have room for 3 words
 * @param numValues         receives the number of words written (1, 2 or 3)
 * @param front             front-face stencil state
 * @param back              back-face stencil state
 * @param stencilTestEnable when 0, both compare ops are forced to
 *                          VK_COMPARE_OP_ALWAYS
 */
void encodeStencilValue(uint32_t *values, uint32_t* numValues, VkStencilOpState front, VkStencilOpState back, uint8_t stencilTestEnable)
{
	assert(values);
	assert(numValues);

	if(!stencilTestEnable)
	{
		front.compareOp = back.compareOp = VK_COMPARE_OP_ALWAYS;
	}

	if(front.compareMask == back.compareMask &&
	   front.compareOp == back.compareOp &&
	   front.depthFailOp == back.depthFailOp &&
	   front.failOp == back.failOp &&
	   front.passOp == back.passOp &&
	   front.reference == back.reference &&
	   front.writeMask == back.writeMask
	   )
	{
		//identical faces: a single word configures both
		*numValues = 1;

		//selector uses an unsigned operand: 3 << 30 overflows a signed int
		values[0] = 0
				| (front.compareMask & 0xff)
				| (front.reference & 0xff) << 0x8
				| (getCompareOp(front.compareOp) & 0x7) << 16
				| (getStencilOp(front.failOp) & 0x7) << 19
				| (getStencilOp(front.passOp) & 0x7) << 22
				| (getStencilOp(front.depthFailOp) & 0x7) << 25
				| 3u << 30; //front and back

		switch(front.writeMask)
		{
		case 0x1:
			values[0] |= 0u << 28;
			break;
		case 0x3:
			values[0] |= 1u << 28;
			break;
		case 0xf:
			values[0] |= 2u << 28;
			break;
		case 0xff:
			values[0] |= 3u << 28;
			break;
		default:
			//mask not expressible as a mode: append an explicit mask word
			//(same mask for front and back)
			values[1] = 0
					| (front.writeMask & 0xff)
					| (front.writeMask & 0xff) << 8;
			*numValues = 2;
			break;
		}
	}
	else
	{
		//faces differ: one word per face
		*numValues = 2;

		values[0] = 0
				| (front.compareMask & 0xff)
				| (front.reference & 0xff) << 0x8
				| (getCompareOp(front.compareOp) & 0x7) << 16
				| (getStencilOp(front.failOp) & 0x7) << 19
				| (getStencilOp(front.passOp) & 0x7) << 22
				| (getStencilOp(front.depthFailOp) & 0x7) << 25
				| 1u << 30; //front

		//selector uses an unsigned operand: 2 << 30 overflows a signed int
		values[1] = 0
				| (back.compareMask & 0xff)
				| (back.reference & 0xff) << 0x8
				| (getCompareOp(back.compareOp) & 0x7) << 16
				| (getStencilOp(back.failOp) & 0x7) << 19
				| (getStencilOp(back.passOp) & 0x7) << 22
				| (getStencilOp(back.depthFailOp) & 0x7) << 25
				| 2u << 30; //back

		if((front.writeMask == 0x1 ||
		   front.writeMask == 0x3 ||
		   front.writeMask == 0xf ||
		   front.writeMask == 0xff) &&
		   (back.writeMask == 0x1 ||
		   back.writeMask == 0x3 ||
		   back.writeMask == 0xf ||
		   back.writeMask == 0xff))
		{
			//both masks fit the 2-bit mode field of their own word
			switch(front.writeMask)
			{
			case 0x1:
				values[0] |= 0u << 28;
				break;
			case 0x3:
				values[0] |= 1u << 28;
				break;
			case 0xf:
				values[0] |= 2u << 28;
				break;
			case 0xff:
				values[0] |= 3u << 28;
				break;
			}

			switch(back.writeMask)
			{
			case 0x1:
				values[1] |= 0u << 28;
				break;
			case 0x3:
				values[1] |= 1u << 28;
				break;
			case 0xf:
				values[1] |= 2u << 28;
				break;
			case 0xff:
				values[1] |= 3u << 28;
				break;
			}
		}
		else
		{
			//at least one mask needs the explicit form: append a third
			//word with the front mask in bits 0-7 and the back mask in 8-15
			values[2] = 0
					| (front.writeMask & 0xff)
					| (back.writeMask & 0xff) << 8;
			*numValues = 3;
		}
	}
}
  • Vertex attribute setup can be encoded using
/*
 * Packs the fields of a VPM read/write setup word.
 *
 * Each argument is masked to its field width and placed as follows:
 *   bits 20-23  vectorComponentsToRead (only used for VPM read setup)
 *   bits 12-17  stride
 *   bit  11     direction (0 vertical, 1 horizontal)
 *   bit  10     isLaned   (0 packed, 1 laned)
 *   bits  8-9   size      (0 8bit, 1 16bit, 2 32bit)
 *   bits  0-7   address   (see doc)
 *
 * Returns the combined 32-bit setup word.
 */
uint32_t encodeVPMSetup(uint8_t stride,
			uint8_t direction,
			uint8_t isLaned,
			uint8_t size,
			uint8_t address,
			uint8_t vectorComponentsToRead
			)
{
	const uint32_t componentBits = ((uint32_t)vectorComponentsToRead & 0xf) << 20;
	const uint32_t strideBits = ((uint32_t)stride & 0x3f) << 12;
	const uint32_t directionBits = ((uint32_t)direction & 0x1) << 11;
	const uint32_t lanedBits = ((uint32_t)isLaned & 0x1) << 10;
	const uint32_t sizeBits = ((uint32_t)size & 0x3) << 8;
	const uint32_t addressBits = (uint32_t)address & 0xff;

	return componentBits
		| strideBits
		| directionBits
		| lanedBits
		| sizeBits
		| addressBits;
}

Shader assembly loading

Shader assembly and the corresponding assembly-to-descriptor (and push constant) mapping can be passed to the driver in the following way. Pay attention to the SPIR-V magic constants, as those signal to the driver that it is receiving assembly data. You need to include driver/vkExt.h for the structure definitions. The QPU assembler included with the driver (QPUassembler/qpu_assembler.h/c) can be used to write assembly code in human-readable form and convert it to binary form.

///Vertex shader QPU assembly source text, one instruction per string.
///Per the inline comments it reads two values from the VPM, scales them by
///the viewport uniforms, packs them as 16-bit integers into ra0 and then
///writes Ys/Xs, Zs and 1/Wc back to the VPM.
///Uniforms are consumed in this order: 1.0, viewportXScale, viewportYScale, 0.5.
///The array is not const because assemble_qpu_asm() modifies the string it
///is given.
char vs_asm_code[] =
///0x40000000 = 2.0
///uni = 1.0
///rb0 = 2 - 1 = 1
"sig_small_imm ; rx0 = fsub.ws.always(b, a, uni, 0x40000000) ; nop = nop(r0, r0) ;\n"
///set up VPM read for subsequent reads
///0x00201a00: 0000 0000 0010 0000 0001 1010 0000 0000
///addr: 0
///size: 32bit
///packed
///horizontal
///stride=1
///vectors to read = 2 (how many components)
"sig_load_imm ; vr_setup = load32.always(0x00201a00) ; nop = load32.always() ;\n"
///uni = viewportXScale
///r0 = vpm * uni
"sig_none ; nop = nop(r0, r0, vpm_read, uni) ; r0 = fmul.always(a, b) ;\n"
///r1 = r0 * rb0 (1)
"sig_none ; nop = nop(r0, r0, nop, rb0) ; r1 = fmul.always(r0, b) ;\n"
///uni = viewportYScale
///ra0.16a = int(r1), r2 = vpm * uni
"sig_none ; rx0.16a = ftoi.always(r1, r1, vpm_read, uni) ; r2 = fmul.always(a, b) ;\n"
///r3 = r2 * rb0
"sig_none ; nop = nop(r0, r0, nop, rb0) ; r3 = fmul.always(r2, b) ;\n"
///ra0.16b = int(r3)
"sig_none ; rx0.16b = ftoi.always(r3, r3) ; nop = nop(r0, r0) ;\n"
///set up VPM write for subsequent writes
///0x00001a00: 0000 0000 0000 0000 0001 1010 0000 0000
///addr: 0
///size: 32bit
///horizontal
///stride = 1
"sig_load_imm ; vw_setup = load32.always.ws(0x00001a00) ; nop = load32.always() ;\n"
///shaded vertex format for PSE
/// Ys and Xs
///vpm = ra0
"sig_none ; vpm = or.always(a, a, ra0, nop) ; nop = nop(r0, r0);\n"
/// Zs
///uni = 0.5
///vpm = uni
"sig_none ; vpm = or.always(a, a, uni, nop) ; nop = nop(r0, r0);\n"
/// 1.0 / Wc
///vpm = rb0 (1)
"sig_none ; vpm = or.always(b, b, nop, rb0) ; nop = nop(r0, r0);\n"
///END
///sig_end is followed by two more nop instructions
"sig_end ; nop = nop(r0, r0) ; nop = nop(r0, r0) ;\n"
"sig_none ; nop = nop(r0, r0) ; nop = nop(r0, r0) ;\n"
"sig_none ; nop = nop(r0, r0) ; nop = nop(r0, r0) ;\n"
	"\0";

///Coordinate shader QPU assembly source text, one instruction per string.
///Per the inline comments it reads two values from the VPM, writes the
///clip-space Xc, Yc, Zc, Wc values followed by the screen-space Ys/Xs, Zs
///and 1/Wc values to the VPM.
///Uniforms are consumed in this order: 1.0, viewportXscale, viewportYscale, 0.5.
///The array is not const because assemble_qpu_asm() modifies the string it
///is given.
char cs_asm_code[] =
///uni = 1.0
///r3 = 2.0 - uni
"sig_small_imm ; r3 = fsub.always(b, a, uni, 0x40000000) ; nop = nop(r0, r0);\n"
///set up VPM read (same setup word as the vertex shader: 2 components, 32bit, horizontal)
"sig_load_imm ; vr_setup = load32.always(0x00201a00) ; nop = load32.always() ;\n"
///r2 = vpm
"sig_none ; r2 = or.always(a, a, vpm_read, nop) ; nop = nop(r0, r0);\n"
///set up VPM write for subsequent writes
"sig_load_imm ; vw_setup = load32.always.ws(0x00001a00) ; nop = load32.always() ;\n"
///shaded coordinates format for PTB
/// write Xc
///r1 = vpm, vpm = r2
"sig_none ; r1 = or.always(a, a, vpm_read, nop) ; vpm = v8min.always(r2, r2);\n"
/// write Yc
///uni = viewportXscale
///vpm = r1, r2 = r2 * uni
"sig_none ; vpm = or.always(r1, r1, uni, nop) ; r2 = fmul.always(r2, a);\n"
///uni = viewportYscale
///r1 = r1 * uni
"sig_none ; nop = nop(r0, r0, uni, nop) ; r1 = fmul.always(r1, a);\n"
///r0 = r2 * r3
"sig_none ; nop = nop(r0, r0) ; r0 = fmul.always(r2, r3);\n"
///ra0.16a = r0, r1 = r1 * r3
"sig_none ; rx0.16a = ftoi.always(r0, r0) ; r1 = fmul.always(r1, r3) ;\n"
///ra0.16b = r1
"sig_none ; rx0.16b = ftoi.always(r1, r1) ; nop = nop(r0, r0) ;\n"
///write Zc
///vpm = 0
"sig_small_imm ; vpm = or.always(b, b, nop, 0) ; nop = nop(r0, r0) ;\n"
///write Wc
///vpm = 1.0
"sig_small_imm ; vpm = or.always(b, b, nop, 0x3f800000) ; nop = nop(r0, r0) ;\n"
///write Ys and Xs
///vpm = ra0
"sig_none ; vpm = or.always(a, a, ra0, nop) ; nop = nop(r0, r0) ;\n"
///write Zs
///uni = 0.5
///vpm = uni
"sig_none ; vpm = or.always(a, a, uni, nop) ; nop = nop(r0, r0) ;\n"
///write 1/Wc
///vpm = r3
"sig_none ; vpm = or.always(r3, r3) ; nop = nop(r0, r0) ;\n"
///END
///sig_end is followed by two more nop instructions
"sig_end ; nop = nop(r0, r0) ; nop = nop(r0, r0) ;\n"
"sig_none ; nop = nop(r0, r0) ; nop = nop(r0, r0) ;\n"
"sig_none ; nop = nop(r0, r0) ; nop = nop(r0, r0) ;\n"
	"\0";

//display a color
///Fragment shader QPU assembly source text: loads one constant colour and
///writes it to the tile buffer for every fragment.
///NOTE(review): unlike vs_asm_code and cs_asm_code, the instructions here
///are not separated by "\n"; presumably the assembler does not need the
///newline - confirm against qpu_assembler.c.
char fs_asm_code[] =
"sig_none ; nop = nop(r0, r0) ; nop = nop(r0, r0) ;"
///BGRA
///r0 = 0xffa14ccc (the colour to display)
"sig_load_imm ; r0 = load32.always(0xffa14ccc) ; nop = load32() ;"
///write r0 to the tile buffer colour
"sig_none ; tlb_color_all = or.always(r0, r0) ; nop = nop(r0, r0) ;"
///END; the last instruction carries the scoreboard unlock signal
"sig_end ; nop = nop(r0, r0) ; nop = nop(r0, r0) ;"
"sig_none ; nop = nop(r0, r0) ; nop = nop(r0, r0) ;"
"sig_unlock_score ; nop = nop(r0, r0) ; nop = nop(r0, r0) ;"
	"\0";

//Assembly-to-resource mappings for the vertex stage: four push constant
//entries at byte offsets 0, 4, 8 and 12.
//NOTE(review): presumably one entry per `uni` read in the shader, in the
//order the reads occur (the vertex shader reads four uniforms) - confirm
//against driver/vkExt.h.
VkRpiAssemblyMappingEXT vertexMappings[] = {
	//vertex shader uniforms
	{
		VK_RPI_ASSEMBLY_MAPPING_TYPE_PUSH_CONSTANT,
		VK_DESCRIPTOR_TYPE_MAX_ENUM, //descriptor type (placeholder value; this entry is a push constant)
		0, //descriptor set #
		0, //descriptor binding #
		0, //descriptor array element #
		0, //resource offset
	},
	{
		VK_RPI_ASSEMBLY_MAPPING_TYPE_PUSH_CONSTANT,
		VK_DESCRIPTOR_TYPE_MAX_ENUM, //descriptor type
		0, //descriptor set #
		0, //descriptor binding #
		0, //descriptor array element #
		4, //resource offset
	},
	{
		VK_RPI_ASSEMBLY_MAPPING_TYPE_PUSH_CONSTANT,
		VK_DESCRIPTOR_TYPE_MAX_ENUM, //descriptor type
		0, //descriptor set #
		0, //descriptor binding #
		0, //descriptor array element #
		8, //resource offset
	},
	{
		VK_RPI_ASSEMBLY_MAPPING_TYPE_PUSH_CONSTANT,
		VK_DESCRIPTOR_TYPE_MAX_ENUM, //descriptor type
		0, //descriptor set #
		0, //descriptor binding #
		0, //descriptor array element #
		12, //resource offset
	}
};

//Creates a shader module from QPU assembly instead of real SPIR-V: the
//assembled instructions and mappings are described by a
//VkRpiShaderModuleAssemblyCreateInfoEXT, and a 6-word fake SPIR-V header
//carries a pointer to that structure to the driver.
uint32_t spirv[6];

//per-stage instruction buffers and instruction counts; this example fills
//slots 0 (cs), 1 (vs) and 2 (fs) and leaves slot 3 untouched
uint64_t* asm_ptrs[4] = {};
uint32_t asm_sizes[4] = {};

//per-stage mapping tables and their element counts
VkRpiAssemblyMappingEXT* asm_mappings[4] = {};
uint32_t asm_mappings_sizes[4] = {};

VkRpiShaderModuleAssemblyCreateInfoEXT shaderModuleCreateInfo = {};
shaderModuleCreateInfo.instructions = asm_ptrs;
shaderModuleCreateInfo.numInstructions = asm_sizes;
shaderModuleCreateInfo.mappings = asm_mappings;
shaderModuleCreateInfo.numMappings = asm_mappings_sizes;

//only the vertex stage gets mappings in this example
asm_mappings[VK_RPI_ASSEMBLY_TYPE_VERTEX] = vertexMappings;
asm_mappings_sizes[VK_RPI_ASSEMBLY_TYPE_VERTEX] = sizeof(vertexMappings) / sizeof(VkRpiAssemblyMappingEXT);

//NOTE(review): none of the malloc() results below are checked before being
//handed to assemble_qpu_asm()
{ //assemble cs code
	asm_sizes[0] = get_num_instructions(cs_asm_code);
	//each assembled instruction occupies one uint64_t
	uint32_t size = sizeof(uint64_t)*asm_sizes[0];
	asm_ptrs[0] = (uint64_t*)malloc(size);
	//modifies the passed string's contents
	assemble_qpu_asm(cs_asm_code, asm_ptrs[0]);
}

{ //assemble vs code
	asm_sizes[1] = get_num_instructions(vs_asm_code);
	uint32_t size = sizeof(uint64_t)*asm_sizes[1];
	asm_ptrs[1] = (uint64_t*)malloc(size);
	assemble_qpu_asm(vs_asm_code, asm_ptrs[1]);
}

{ //assemble fs code
	asm_sizes[2] = get_num_instructions(fs_asm_code);
	uint32_t size = sizeof(uint64_t)*asm_sizes[2];
	asm_ptrs[2] = (uint64_t*)malloc(size);
	assemble_qpu_asm(fs_asm_code, asm_ptrs[2]);
}

//fake SPIR-V header; the constants signal to the driver that it is
//receiving assembly data
spirv[0] = 0x07230203; //SPIR-V magic number
spirv[1] = 0x00010000;
spirv[2] = 0x14E45250;
spirv[3] = 1;
//NOTE(review): the pointer is squeezed into a 32-bit word, so this only
//round-trips where pointers are 32 bits wide - confirm for 64-bit builds
spirv[4] = (uint32_t)&shaderModuleCreateInfo;
//words start here
spirv[5] = 1 << 16;

VkShaderModuleCreateInfo smci = {};
smci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
smci.codeSize = sizeof(uint32_t)*6; //byte size of the spirv[] array above
smci.pCode = spirv;
//NOTE(review): the VkResult returned here is ignored
vkCreateShaderModule(device, &smci, 0, &shaderModule);

//the instruction buffers are released right after module creation;
//asm_ptrs[3] was never assigned and still holds its initial value
for(uint32_t c = 0; c < 4; ++c)
{
	free(asm_ptrs[c]);
}
Clone this wiki locally