diff --git a/Documentation/devicetree/bindings/gpu/arm,mali-valhall-csf.yaml b/Documentation/devicetree/bindings/gpu/arm,mali-valhall-csf.yaml
new file mode 100644
index 0000000000000..f9e69e86b033a
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpu/arm,mali-valhall-csf.yaml
@@ -0,0 +1,147 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpu/arm,mali-valhall-csf.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ARM Mali Valhall GPU
+
+maintainers:
+  - Liviu Dudau <liviu.dudau@arm.com>
+  - Boris Brezillon <boris.brezillon@collabora.com>
+
+properties:
+  $nodename:
+    pattern: '^gpu@[a-f0-9]+$'
+
+  compatible:
+    oneOf:
+      - items:
+          - enum:
+              - rockchip,rk3588-mali
+          - const: arm,mali-valhall-csf   # Mali Valhall GPU model/revision is fully discoverable
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    items:
+      - description: Job interrupt
+      - description: MMU interrupt
+      - description: GPU interrupt
+
+  interrupt-names:
+    items:
+      - const: job
+      - const: mmu
+      - const: gpu
+
+  clocks:
+    minItems: 1
+    maxItems: 3
+
+  clock-names:
+    minItems: 1
+    items:
+      - const: core
+      - const: coregroup
+      - const: stacks
+
+  mali-supply: true
+
+  operating-points-v2: true
+  opp-table:
+    type: object
+
+  power-domains:
+    minItems: 1
+    maxItems: 5
+
+  power-domain-names:
+    minItems: 1
+    maxItems: 5
+
+  sram-supply: true
+
+  "#cooling-cells":
+    const: 2
+
+  dynamic-power-coefficient:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description:
+      A u32 value that represents the running time dynamic
+      power coefficient in units of uW/MHz/V^2. The
+      coefficient can either be calculated from power
+      measurements or derived by analysis.
+
+      The dynamic power consumption of the GPU is
+      proportional to the square of the Voltage (V) and
+      the clock frequency (f). The coefficient is used to
+      calculate the dynamic power as below -
+
+      Pdyn = dynamic-power-coefficient * V^2 * f
+
+      where voltage is in V, frequency is in MHz.
+
+  dma-coherent: true
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - interrupt-names
+  - clocks
+  - mali-supply
+
+additionalProperties: false
+
+allOf:
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: rockchip,rk3588-mali
+    then:
+      properties:
+        clocks:
+          minItems: 3
+        power-domains:
+          maxItems: 1
+        power-domain-names: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/rockchip,rk3588-cru.h>
+    #include <dt-bindings/interrupt-controller/irq.h>
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/power/rk3588-power.h>
+
+    gpu: gpu@fb000000 {
+        compatible = "rockchip,rk3588-mali", "arm,mali-valhall-csf";
+        reg = <0xfb000000 0x200000>;
+        interrupts = <GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH 0>,
+                     <GIC_SPI 93 IRQ_TYPE_LEVEL_HIGH 0>,
+                     <GIC_SPI 94 IRQ_TYPE_LEVEL_HIGH 0>;
+        interrupt-names = "job", "mmu", "gpu";
+        clock-names = "core", "coregroup", "stacks";
+        clocks = <&cru CLK_GPU>, <&cru CLK_GPU_COREGROUP>,
+                 <&cru CLK_GPU_STACKS>;
+        power-domains = <&power RK3588_PD_GPU>;
+        operating-points-v2 = <&gpu_opp_table>;
+        mali-supply = <&vdd_gpu_s0>;
+        sram-supply = <&vdd_gpu_mem_s0>;
+
+        gpu_opp_table: opp-table {
+            compatible = "operating-points-v2";
+            opp-300000000 {
+                opp-hz = /bits/ 64 <300000000>;
+                opp-microvolt = <675000 675000 850000>;
+            };
+            opp-400000000 {
+                opp-hz = /bits/ 64 <400000000>;
+                opp-microvolt = <675000 675000 850000>;
+            };
+        };
+    };
+
+...
\ No newline at end of file
diff --git a/Documentation/driver-api/dma-buf.rst b/Documentation/driver-api/dma-buf.rst
index 36a76cbe90954..c010ad76f001a 100644
--- a/Documentation/driver-api/dma-buf.rst
+++ b/Documentation/driver-api/dma-buf.rst
@@ -158,6 +158,12 @@ DMA Fence Signalling Annotations
 .. kernel-doc:: drivers/dma-buf/dma-fence.c
    :doc: fence signalling annotation
 
+DMA Fence Deadline Hints
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. kernel-doc:: drivers/dma-buf/dma-fence.c
+   :doc: deadline hints
+
 DMA Fences Functions Reference
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/Documentation/gpu/driver-uapi.rst b/Documentation/gpu/driver-uapi.rst
index 4411e6919a3df..7b6d9006306cf 100644
--- a/Documentation/gpu/driver-uapi.rst
+++ b/Documentation/gpu/driver-uapi.rst
@@ -6,3 +6,8 @@ drm/i915 uAPI
 =============
 
 .. kernel-doc:: include/uapi/drm/i915_drm.h
+
+drm/panthor uAPI
+================
+
+.. kernel-doc:: i
\ No newline at end of file
diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst
index a79fd3549ff8c..709c4f149f6ec 100644
--- a/Documentation/gpu/drm-mm.rst
+++ b/Documentation/gpu/drm-mm.rst
@@ -466,6 +466,42 @@ DRM MM Range Allocator Function References
 .. kernel-doc:: drivers/gpu/drm/drm_mm.c
    :export:
 
+DRM GPU VA Manager
+==================
+
+Overview
+--------
+
+.. kernel-doc:: drivers/gpu/drm/drm_gpuva_mgr.c
+   :doc: Overview
+
+Split and Merge
+---------------
+
+.. kernel-doc:: drivers/gpu/drm/drm_gpuva_mgr.c
+   :doc: Split and Merge
+
+Locking
+-------
+
+.. kernel-doc:: drivers/gpu/drm/drm_gpuva_mgr.c
+   :doc: Locking
+
+Examples
+--------
+
+.. kernel-doc:: drivers/gpu/drm/drm_gpuva_mgr.c
+   :doc: Examples
+
+DRM GPU VA Manager Function References
+--------------------------------------
+
+.. kernel-doc:: include/drm/drm_gpuva_mgr.h
+   :internal:
+
+.. kernel-doc:: drivers/gpu/drm/drm_gpuva_mgr.c
+   :export:
+
 DRM Buddy Allocator
 ===================
 
@@ -493,6 +529,18 @@ DRM Sync Objects
 .. kernel-doc:: drivers/gpu/drm/drm_syncobj.c
    :export:
 
+DRM Execution context
+=====================
+
+.. kernel-doc:: drivers/gpu/drm/drm_exec.c
+   :doc: Overview
+
+.. kernel-doc:: include/drm/drm_exec.h
+   :internal:
+
+.. kernel-doc:: drivers/gpu/drm/drm_exec.c
+   :export:
+
 GPU Scheduler
 =============
 
@@ -502,6 +550,12 @@ Overview
 .. kernel-doc:: drivers/gpu/drm/scheduler/sched_main.c
    :doc: Overview
 
+Flow Control
+------------
+
+.. kernel-doc:: drivers/gpu/drm/scheduler/sched_main.c
+   :doc: Flow Control
+
 Scheduler Function References
 -----------------------------
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 379387e20a96d..ca4395fa654b1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1622,6 +1622,17 @@ T:	git git://anongit.freedesktop.org/drm/drm-misc
 F:	drivers/gpu/drm/panfrost/
 F:	include/uapi/drm/panfrost_drm.h
 
+ARM MALI PANTHOR DRM DRIVER
+M:	Boris Brezillon <boris.brezillon@collabora.com>
+M:	Steven Price <steven.price@arm.com>
+M:	Liviu Dudau <liviu.dudau@arm.com>
+L:	dri-devel@lists.freedesktop.org
+S:	Supported
+T:	git git://anongit.freedesktop.org/drm/drm-misc
+F:	Documentation/devicetree/bindings/gpu/arm,mali-valhall-csf.yaml
+F:	drivers/gpu/drm/panthor/
+F:	include/uapi/drm/panthor_drm.h
+
 ARM MALI-DP DRM DRIVER
 M:	Liviu Dudau <liviu.dudau@arm.com>
 M:	Brian Starkey <brian.starkey@arm.com>
diff --git a/arch/arm64/boot/dts/rockchip/overlay/Makefile b/arch/arm64/boot/dts/rockchip/overlay/Makefile
index 1d41690a889d1..1cd1565c51d7b 100644
--- a/arch/arm64/boot/dts/rockchip/overlay/Makefile
+++ b/arch/arm64/boot/dts/rockchip/overlay/Makefile
@@ -63,6 +63,7 @@ dtbo-$(CONFIG_ARCH_ROCKCHIP) += \
 	khadas-edge2-cam2.dtbo \
 	khadas-edge2-cam3.dtbo \
 	rockchip-rk3588-opp-oc-24ghz.dtbo \
+	rockchip-rk3588-panthor-gpu.dtbo \
 	rk3588-can0-m0.dtbo \
 	rk3588-can1-m0.dtbo \
 	rk3588-can1-m1.dtbo \
diff --git a/arch/arm64/boot/dts/rockchip/overlay/rockchip-rk3588-panthor-gpu.dts b/arch/arm64/boot/dts/rockchip/overlay/rockchip-rk3588-panthor-gpu.dts
new file mode 100644
index 0000000000000..2da62dbb4a931
--- /dev/null
+++ b/arch/arm64/boot/dts/rockchip/overlay/rockchip-rk3588-panthor-gpu.dts
@@ -0,0 +1,19 @@
+/dts-v1/;
+/plugin/;
+
+/ {
+	fragment@0 {
+		target = <&gpu>;
+		__overlay__ {
+			status = "disabled";
+		};
+	};
+
+	fragment@1 {
+		target = <&gpu_panthor>;
+		__overlay__ {
+			status = "okay";
+			mali-supply = <&vdd_gpu_s0>;
+		};
+	};
+};
diff --git a/arch/arm64/boot/dts/rockchip/rk3588s.dtsi b/arch/arm64/boot/dts/rockchip/rk3588s.dtsi
index 17cf2e55bb170..8ed8fa1690e34 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588s.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588s.dtsi
@@ -2381,6 +2381,62 @@
 		status = "disabled";
 	};
 
+	gpu_panthor: gpu-panthor@fb000000 {
+		compatible = "rockchip,rk3588-mali", "arm,mali-valhall-csf";
+		reg = <0x0 0xfb000000 0x0 0x200000>;
+		#cooling-cells = <2>;
+		assigned-clocks = <&scmi_clk SCMI_CLK_GPU>;
+		assigned-clock-rates = <200000000>;
+		clocks = <&cru CLK_GPU>, <&cru CLK_GPU_COREGROUP>,
+			 <&cru CLK_GPU_STACKS>;
+		clock-names = "core", "coregroup", "stacks";
+		dynamic-power-coefficient = <2982>;
+		interrupts = <GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>,
+			     <GIC_SPI 93 IRQ_TYPE_LEVEL_HIGH>,
+			     <GIC_SPI 94 IRQ_TYPE_LEVEL_HIGH>;
+		interrupt-names = "job", "mmu", "gpu";
+		operating-points-v2 = <&gpu_opp_table_panthor>;
+		power-domains = <&power RK3588_PD_GPU>;
+		status = "disabled";
+
+		gpu_opp_table_panthor: opp-table {
+			compatible = "operating-points-v2";
+
+			opp-300000000 {
+				opp-hz = /bits/ 64 <300000000>;
+				opp-microvolt = <675000 675000 850000>;
+			};
+			opp-400000000 {
+				opp-hz = /bits/ 64 <400000000>;
+				opp-microvolt = <675000 675000 850000>;
+			};
+			opp-500000000 {
+				opp-hz = /bits/ 64 <500000000>;
+				opp-microvolt = <675000 675000 850000>;
+			};
+			opp-600000000 {
+				opp-hz = /bits/ 64 <600000000>;
+				opp-microvolt = <675000 675000 850000>;
+			};
+			opp-700000000 {
+				opp-hz = /bits/ 64 <700000000>;
+				opp-microvolt = <700000 700000 850000>;
+			};
+			opp-800000000 {
+				opp-hz = /bits/ 64 <800000000>;
+				opp-microvolt = <750000 750000 850000>;
+			};
+			opp-900000000 {
+				opp-hz = /bits/ 64 <900000000>;
+				opp-microvolt = <800000 800000 850000>;
+			};
+			opp-1000000000 {
+				opp-hz = /bits/ 64 <1000000000>;
+				opp-microvolt = <850000 850000 850000>;
+			};
+		};
+	};
+
 	gpu_opp_table: gpu-opp-table {
 		compatible = "operating-points-v2";
 
diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 1e730bd6af322..1c0deb1e08242 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -1278,6 +1278,34 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach,
 }
 EXPORT_SYMBOL_NS_GPL(dma_buf_map_attachment, DMA_BUF);
 
+/**
+ * dma_buf_map_attachment_unlocked - Returns the scatterlist table of the attachment;
+ * mapped into _device_ address space. Is a wrapper for map_dma_buf() of the
+ * dma_buf_ops.
+ * @attach:	[in]	attachment whose scatterlist is to be returned
+ * @direction:	[in]	direction of DMA transfer
+ *
+ * Unlocked variant of dma_buf_map_attachment().
+ */
+struct sg_table *
+dma_buf_map_attachment_unlocked(struct dma_buf_attachment *attach,
+				enum dma_data_direction direction)
+{
+	struct sg_table *sg_table;
+
+	might_sleep();
+
+	if (WARN_ON(!attach || !attach->dmabuf))
+		return ERR_PTR(-EINVAL);
+
+	dma_resv_lock(attach->dmabuf->resv, NULL);
+	sg_table = dma_buf_map_attachment(attach, direction);
+	dma_resv_unlock(attach->dmabuf->resv);
+
+	return sg_table;
+}
+EXPORT_SYMBOL_NS_GPL(dma_buf_map_attachment_unlocked, DMA_BUF);
+
 /**
  * dma_buf_unmap_attachment - unmaps and decreases usecount of the buffer;might
  * deallocate the scatterlist associated. Is a wrapper for unmap_dma_buf() of
@@ -1314,6 +1342,31 @@ void dma_buf_unmap_attachment(struct dma_buf_attachment *attach,
 }
 EXPORT_SYMBOL_NS_GPL(dma_buf_unmap_attachment, DMA_BUF);
 
+/**
+ * dma_buf_unmap_attachment_unlocked - unmaps and decreases usecount of the buffer;might
+ * deallocate the scatterlist associated. Is a wrapper for unmap_dma_buf() of
+ * dma_buf_ops.
+ * @attach:	[in]	attachment to unmap buffer from
+ * @sg_table:	[in]	scatterlist info of the buffer to unmap
+ * @direction:	[in]	direction of DMA transfer
+ *
+ * Unlocked variant of dma_buf_unmap_attachment().
+ */
+void dma_buf_unmap_attachment_unlocked(struct dma_buf_attachment *attach,
+				       struct sg_table *sg_table,
+				       enum dma_data_direction direction)
+{
+	might_sleep();
+
+	if (WARN_ON(!attach || !attach->dmabuf || !sg_table))
+		return;
+
+	dma_resv_lock(attach->dmabuf->resv, NULL);
+	dma_buf_unmap_attachment(attach, sg_table, direction);
+	dma_resv_unlock(attach->dmabuf->resv);
+}
+EXPORT_SYMBOL_NS_GPL(dma_buf_unmap_attachment_unlocked, DMA_BUF);
+
 /**
  * dma_buf_move_notify - notify attachments that DMA-buf is moving
  *
@@ -1575,6 +1628,8 @@ int dma_buf_vmap(struct dma_buf *dmabuf, struct iosys_map *map)
 	if (WARN_ON(!dmabuf))
 		return -EINVAL;
 
+	dma_resv_assert_held(dmabuf->resv);
+
 	if (!dmabuf->ops->vmap)
 		return -EINVAL;
 
@@ -1603,6 +1658,33 @@ int dma_buf_vmap(struct dma_buf *dmabuf, struct iosys_map *map)
 }
 EXPORT_SYMBOL_NS_GPL(dma_buf_vmap, DMA_BUF);
 
+/**
+ * dma_buf_vmap_unlocked - Create virtual mapping for the buffer object into kernel
+ * address space. Same restrictions as for vmap and friends apply.
+ * @dmabuf:	[in]	buffer to vmap
+ * @map:	[out]	returns the vmap pointer
+ *
+ * Unlocked version of dma_buf_vmap()
+ *
+ * Returns 0 on success, or a negative errno code otherwise.
+ */
+int dma_buf_vmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map)
+{
+	int ret;
+
+	iosys_map_clear(map);
+
+	if (WARN_ON(!dmabuf))
+		return -EINVAL;
+
+	dma_resv_lock(dmabuf->resv, NULL);
+	ret = dma_buf_vmap(dmabuf, map);
+	dma_resv_unlock(dmabuf->resv);
+
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(dma_buf_vmap_unlocked, DMA_BUF);
+
 /**
  * dma_buf_vunmap - Unmap a vmap obtained by dma_buf_vmap.
  * @dmabuf:	[in]	buffer to vunmap
@@ -1613,6 +1695,8 @@ void dma_buf_vunmap(struct dma_buf *dmabuf, struct iosys_map *map)
 	if (WARN_ON(!dmabuf))
 		return;
 
+	dma_resv_assert_held(dmabuf->resv);
+
 	BUG_ON(iosys_map_is_null(&dmabuf->vmap_ptr));
 	BUG_ON(dmabuf->vmapping_counter == 0);
 	BUG_ON(!iosys_map_is_equal(&dmabuf->vmap_ptr, map));
@@ -1627,6 +1711,22 @@ void dma_buf_vunmap(struct dma_buf *dmabuf, struct iosys_map *map)
 }
 EXPORT_SYMBOL_NS_GPL(dma_buf_vunmap, DMA_BUF);
 
+/**
+ * dma_buf_vunmap_unlocked - Unmap a vmap obtained by dma_buf_vmap.
+ * @dmabuf:	[in]	buffer to vunmap
+ * @map:	[in]	vmap pointer to vunmap
+ */
+void dma_buf_vunmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map)
+{
+	if (WARN_ON(!dmabuf))
+		return;
+
+	dma_resv_lock(dmabuf->resv, NULL);
+	dma_buf_vunmap(dmabuf, map);
+	dma_resv_unlock(dmabuf->resv);
+}
+EXPORT_SYMBOL_NS_GPL(dma_buf_vunmap_unlocked, DMA_BUF);
+
 #ifdef CONFIG_DEBUG_FS
 static int dma_buf_debug_show(struct seq_file *s, void *unused)
 {
diff --git a/drivers/dma-buf/dma-fence-unwrap.c b/drivers/dma-buf/dma-fence-unwrap.c
index c625bb2b5d563..628af51c81af3 100644
--- a/drivers/dma-buf/dma-fence-unwrap.c
+++ b/drivers/dma-buf/dma-fence-unwrap.c
@@ -76,16 +76,11 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
 		dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) {
 			if (!dma_fence_is_signaled(tmp)) {
 				++count;
-			} else if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT,
-					    &tmp->flags)) {
-				if (ktime_after(tmp->timestamp, timestamp))
-					timestamp = tmp->timestamp;
 			} else {
-				/*
-				 * Use the current time if the fence is
-				 * currently signaling.
-				 */
-				timestamp = ktime_get();
+				ktime_t t = dma_fence_timestamp(tmp);
+
+				if (ktime_after(t, timestamp))
+					timestamp = t;
 			}
 		}
 	}
diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
index eef4786aaf862..8aa8f8cb7071e 100644
--- a/drivers/dma-buf/dma-fence.c
+++ b/drivers/dma-buf/dma-fence.c
@@ -913,6 +913,65 @@ dma_fence_wait_any_timeout(struct dma_fence **fences, uint32_t count,
 }
 EXPORT_SYMBOL(dma_fence_wait_any_timeout);
 
+/**
+ * DOC: deadline hints
+ *
+ * In an ideal world, it would be possible to pipeline a workload sufficiently
+ * that a utilization based device frequency governor could arrive at a minimum
+ * frequency that meets the requirements of the use-case, in order to minimize
+ * power consumption.  But in the real world there are many workloads which
+ * defy this ideal.  For example, but not limited to:
+ *
+ * * Workloads that ping-pong between device and CPU, with alternating periods
+ *   of CPU waiting for device, and device waiting on CPU.  This can result in
+ *   devfreq and cpufreq seeing idle time in their respective domains and in
+ *   result reduce frequency.
+ *
+ * * Workloads that interact with a periodic time based deadline, such as double
+ *   buffered GPU rendering vs vblank sync'd page flipping.  In this scenario,
+ *   missing a vblank deadline results in an *increase* in idle time on the GPU
+ *   (since it has to wait an additional vblank period), sending a signal to
+ *   the GPU's devfreq to reduce frequency, when in fact the opposite is what is
+ *   needed.
+ *
+ * To this end, deadline hint(s) can be set on a &dma_fence via &dma_fence_set_deadline.
+ * The deadline hint provides a way for the waiting driver, or userspace, to
+ * convey an appropriate sense of urgency to the signaling driver.
+ *
+ * A deadline hint is given in absolute ktime (CLOCK_MONOTONIC for userspace
+ * facing APIs).  The time could either be some point in the future (such as
+ * the vblank based deadline for page-flipping, or the start of a compositor's
+ * composition cycle), or the current time to indicate an immediate deadline
+ * hint (Ie. forward progress cannot be made until this fence is signaled).
+ *
+ * Multiple deadlines may be set on a given fence, even in parallel.  See the
+ * documentation for &dma_fence_ops.set_deadline.
+ *
+ * The deadline hint is just that, a hint.  The driver that created the fence
+ * may react by increasing frequency, making different scheduling choices, etc.
+ * Or doing nothing at all.
+ */
+
+/**
+ * dma_fence_set_deadline - set desired fence-wait deadline hint
+ * @fence:    the fence that is to be waited on
+ * @deadline: the time by which the waiter hopes for the fence to be
+ *            signaled
+ *
+ * Give the fence signaler a hint about an upcoming deadline, such as
+ * vblank, by which point the waiter would prefer the fence to be
+ * signaled by.  This is intended to give feedback to the fence signaler
+ * to aid in power management decisions, such as boosting GPU frequency
+ * if a periodic vblank deadline is approaching but the fence is not
+ * yet signaled..
+ */
+void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline)
+{
+	if (fence->ops->set_deadline && !dma_fence_is_signaled(fence))
+		fence->ops->set_deadline(fence, deadline);
+}
+EXPORT_SYMBOL(dma_fence_set_deadline);
+
 /**
  * dma_fence_describe - Dump fence describtion into seq_file
  * @fence: the 6fence to describe
diff --git a/drivers/dma-buf/sync_file.c b/drivers/dma-buf/sync_file.c
index af57799c86cee..2e9a316c596a3 100644
--- a/drivers/dma-buf/sync_file.c
+++ b/drivers/dma-buf/sync_file.c
@@ -268,13 +268,10 @@ static int sync_fill_fence_info(struct dma_fence *fence,
 		sizeof(info->driver_name));
 
 	info->status = dma_fence_get_status(fence);
-	while (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) &&
-	       !test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags))
-		cpu_relax();
 	info->timestamp_ns =
-		test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags) ?
-		ktime_to_ns(fence->timestamp) :
-		ktime_set(0, 0);
+		dma_fence_is_signaled(fence) ?
+			ktime_to_ns(dma_fence_timestamp(fence)) :
+			ktime_set(0, 0);
 
 	return info->status;
 }
diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index cd641b1bce7b7..ad3d8284419ad 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -207,6 +207,19 @@ config DRM_TTM
 	  GPU memory types. Will be enabled automatically if a device driver
 	  uses it.
 
+config DRM_EXEC
+	tristate
+	depends on DRM
+	help
+	  Execution context for command submissions
+
+config DRM_GPUVM
+	tristate
+	depends on DRM
+	help
+	  GPU-VM representation providing helpers to manage a GPUs virtual
+	  address space
+
 config DRM_BUDDY
 	tristate
 	depends on DRM
@@ -415,6 +428,8 @@ source "drivers/gpu/drm/lima/Kconfig"
 
 source "drivers/gpu/drm/panfrost/Kconfig"
 
+source "drivers/gpu/drm/panthor/Kconfig"
+
 source "drivers/gpu/drm/aspeed/Kconfig"
 
 source "drivers/gpu/drm/mcde/Kconfig"
diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index c611154667c20..dbb6173fc10b3 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -40,6 +40,9 @@ obj-$(CONFIG_DRM_PANEL_ORIENTATION_QUIRKS) += drm_panel_orientation_quirks.o
 #
 # Memory-management helpers
 #
+#
+obj-$(CONFIG_DRM_EXEC) += drm_exec.o
+obj-$(CONFIG_DRM_GPUVM) += drm_gpuvm.o
 
 obj-$(CONFIG_DRM_BUDDY) += drm_buddy.o
 
@@ -141,6 +144,7 @@ obj-$(CONFIG_DRM_XEN) += xen/
 obj-$(CONFIG_DRM_VBOXVIDEO) += vboxvideo/
 obj-$(CONFIG_DRM_LIMA)  += lima/
 obj-$(CONFIG_DRM_PANFROST) += panfrost/
+obj-$(CONFIG_DRM_PANTHOR) += panthor/
 obj-$(CONFIG_DRM_ASPEED_GFX) += aspeed/
 obj-$(CONFIG_DRM_MCDE) += mcde/
 obj-$(CONFIG_DRM_TIDSS) += tidss/
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index d2139ac121595..7f4df2929b1a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -67,10 +67,10 @@ amdgpu_ctx_to_drm_sched_prio(int32_t ctx_prio)
 		return DRM_SCHED_PRIORITY_UNSET;
 
 	case AMDGPU_CTX_PRIORITY_VERY_LOW:
-		return DRM_SCHED_PRIORITY_MIN;
+		return DRM_SCHED_PRIORITY_LOW;
 
 	case AMDGPU_CTX_PRIORITY_LOW:
-		return DRM_SCHED_PRIORITY_MIN;
+		return DRM_SCHED_PRIORITY_LOW;
 
 	case AMDGPU_CTX_PRIORITY_NORMAL:
 		return DRM_SCHED_PRIORITY_NORMAL;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index de61a85c4b022..4a9efe55e06dc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1489,9 +1489,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
 	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
 		struct amdgpu_ring *ring = adev->rings[i];
 
-		if (!ring || !ring->sched.thread)
+		if (!ring || !drm_sched_wqueue_ready(&ring->sched))
 			continue;
-		kthread_park(ring->sched.thread);
+		drm_sched_wqueue_stop(&ring->sched);
 	}
 
 	seq_printf(m, "run ib test:\n");
@@ -1505,9 +1505,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
 	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
 		struct amdgpu_ring *ring = adev->rings[i];
 
-		if (!ring || !ring->sched.thread)
+		if (!ring || !drm_sched_wqueue_ready(&ring->sched))
 			continue;
-		kthread_unpark(ring->sched.thread);
+		drm_sched_wqueue_start(&ring->sched);
 	}
 
 	up_write(&adev->reset_domain->sem);
@@ -1727,7 +1727,8 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
 
 	ring = adev->rings[val];
 
-	if (!ring || !ring->funcs->preempt_ib || !ring->sched.thread)
+	if (!ring || !ring->funcs->preempt_ib ||
+	    !drm_sched_wqueue_ready(&ring->sched))
 		return -EINVAL;
 
 	/* the last preemption failed */
@@ -1745,7 +1746,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
 		goto pro_end;
 
 	/* stop the scheduler */
-	kthread_park(ring->sched.thread);
+	drm_sched_wqueue_stop(&ring->sched);
 
 	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
 
@@ -1781,7 +1782,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
 
 failure:
 	/* restart the scheduler */
-	kthread_unpark(ring->sched.thread);
+	drm_sched_wqueue_start(&ring->sched);
 
 	up_read(&adev->reset_domain->sem);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 30c97ee375636..3fbf974a95435 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2357,7 +2357,8 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
 			break;
 		}
 
-		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
+		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
+				   DRM_SCHED_PRIORITY_COUNT,
 				   ring->num_hw_submission, amdgpu_job_hang_limit,
 				   timeout, adev->reset_domain->wq,
 				   ring->sched_score, ring->name,
@@ -4653,7 +4654,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
 
-		if (!ring || !ring->sched.thread)
+		if (!ring || !drm_sched_wqueue_ready(&ring->sched))
 			continue;
 
 		spin_lock(&ring->sched.job_list_lock);
@@ -4780,7 +4781,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
 
-		if (!ring || !ring->sched.thread)
+		if (!ring || !drm_sched_wqueue_ready(&ring->sched))
 			continue;
 
 		/*clear job fence from fence drv to avoid force_completion
@@ -5179,94 +5180,6 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
 	return 0;
 }
 
-static void amdgpu_device_recheck_guilty_jobs(
-	struct amdgpu_device *adev, struct list_head *device_list_handle,
-	struct amdgpu_reset_context *reset_context)
-{
-	int i, r = 0;
-
-	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-		struct amdgpu_ring *ring = adev->rings[i];
-		int ret = 0;
-		struct drm_sched_job *s_job;
-
-		if (!ring || !ring->sched.thread)
-			continue;
-
-		s_job = list_first_entry_or_null(&ring->sched.pending_list,
-				struct drm_sched_job, list);
-		if (s_job == NULL)
-			continue;
-
-		/* clear job's guilty and depend the folowing step to decide the real one */
-		drm_sched_reset_karma(s_job);
-		drm_sched_resubmit_jobs_ext(&ring->sched, 1);
-
-		if (!s_job->s_fence->parent) {
-			DRM_WARN("Failed to get a HW fence for job!");
-			continue;
-		}
-
-		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
-		if (ret == 0) { /* timeout */
-			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
-						ring->sched.name, s_job->id);
-
-
-			amdgpu_fence_driver_isr_toggle(adev, true);
-
-			/* Clear this failed job from fence array */
-			amdgpu_fence_driver_clear_job_fences(ring);
-
-			amdgpu_fence_driver_isr_toggle(adev, false);
-
-			/* Since the job won't signal and we go for
-			 * another resubmit drop this parent pointer
-			 */
-			dma_fence_put(s_job->s_fence->parent);
-			s_job->s_fence->parent = NULL;
-
-			/* set guilty */
-			drm_sched_increase_karma(s_job);
-			amdgpu_reset_prepare_hwcontext(adev, reset_context);
-retry:
-			/* do hw reset */
-			if (amdgpu_sriov_vf(adev)) {
-				amdgpu_virt_fini_data_exchange(adev);
-				r = amdgpu_device_reset_sriov(adev, false);
-				if (r)
-					adev->asic_reset_res = r;
-			} else {
-				clear_bit(AMDGPU_SKIP_HW_RESET,
-					  &reset_context->flags);
-				r = amdgpu_do_asic_reset(device_list_handle,
-							 reset_context);
-				if (r && r == -EAGAIN)
-					goto retry;
-			}
-
-			/*
-			 * add reset counter so that the following
-			 * resubmitted job could flush vmid
-			 */
-			atomic_inc(&adev->gpu_reset_counter);
-			continue;
-		}
-
-		/* got the hw fence, signal finished fence */
-		atomic_dec(ring->sched.score);
-		dma_fence_get(&s_job->s_fence->finished);
-		dma_fence_signal(&s_job->s_fence->finished);
-		dma_fence_put(&s_job->s_fence->finished);
-
-		/* remove node from list and free the job */
-		spin_lock(&ring->sched.job_list_lock);
-		list_del_init(&s_job->list);
-		spin_unlock(&ring->sched.job_list_lock);
-		ring->sched.ops->free_job(s_job);
-	}
-}
-
 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -5287,7 +5200,6 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 
 }
 
-
 /**
  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
  *
@@ -5310,7 +5222,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_emergency_restart = false;
 	bool audio_suspended = false;
-	int tmp_vram_lost_counter;
 	bool gpu_reset_for_dev_remove = false;
 
 	gpu_reset_for_dev_remove =
@@ -5409,7 +5320,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
 
-			if (!ring || !ring->sched.thread)
+			if (!ring || !drm_sched_wqueue_ready(&ring->sched))
 				continue;
 
 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
@@ -5456,7 +5367,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		amdgpu_device_stop_pending_resets(tmp_adev);
 	}
 
-	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
 	/* Actual ASIC resets if needed.*/
 	/* Host driver will handle XGMI hive reset for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
@@ -5481,22 +5391,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	/* Post ASIC reset for all devs .*/
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 
-		/*
-		 * Sometimes a later bad compute job can block a good gfx job as gfx
-		 * and compute ring share internal GC HW mutually. We add an additional
-		 * guilty jobs recheck step to find the real guilty job, it synchronously
-		 * submits and pends for the first job being signaled. If it gets timeout,
-		 * we identify it as a real guilty job.
-		 */
-		if (amdgpu_gpu_recovery == 2 &&
-			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
-			amdgpu_device_recheck_guilty_jobs(
-				tmp_adev, device_list_handle, reset_context);
-
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
 
-			if (!ring || !ring->sched.thread)
+			if (!ring || !drm_sched_wqueue_ready(&ring->sched))
 				continue;
 
 			/* No point to resubmit jobs if we didn't HW reset*/
@@ -5825,7 +5723,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = adev->rings[i];
 
-			if (!ring || !ring->sched.thread)
+			if (!ring || !drm_sched_wqueue_ready(&ring->sched))
 				continue;
 
 			drm_sched_stop(&ring->sched, NULL);
@@ -5953,7 +5851,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
 
-		if (!ring || !ring->sched.thread)
+		if (!ring || !drm_sched_wqueue_ready(&ring->sched))
 			continue;
 
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 49a023f59b2fc..94b0f62073e61 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -533,7 +533,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
  * DOC: gpu_recovery (int)
  * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
  */
-MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
+MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto)");
 module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
 
 /**
@@ -2795,8 +2795,6 @@ static const struct drm_driver amdgpu_kms_driver = {
 	.fops = &amdgpu_driver_kms_fops,
 	.release = &amdgpu_driver_release_kms,
 
-	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
 	.gem_prime_import = amdgpu_gem_prime_import,
 	.gem_prime_mmap = drm_gem_prime_mmap,
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 3bf0e893c07df..8043d9b92fac8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -111,6 +111,8 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
 	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
 
 	return 0;
+
+	return drm_sched_job_init(&(*job)->base, entity, 1, owner);
 }
 
 int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
@@ -248,7 +250,8 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
 	return 0;
 }
 
-static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
+static struct dma_fence *
+amdgpu_job_prepare_job(struct drm_sched_job *sched_job,
 					       struct drm_sched_entity *s_entity)
 {
 	struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
@@ -326,8 +329,8 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
 	int i;
 
 	/* Signal all jobs not yet scheduled */
-	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
-		struct drm_sched_rq *rq = &sched->sched_rq[i];
+	for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
+		struct drm_sched_rq *rq = sched->sched_rq[i];
 		spin_lock(&rq->lock);
 		list_for_each_entry(s_entity, &rq->entities, list) {
 			while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
@@ -351,7 +354,7 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
 }
 
 const struct drm_sched_backend_ops amdgpu_sched_ops = {
-	.dependency = amdgpu_job_dependency,
+	.prepare_job = amdgpu_job_prepare_job,
 	.run_job = amdgpu_job_run,
 	.timedout_job = amdgpu_job_timedout,
 	.free_job = amdgpu_job_free_cb
diff --git a/drivers/gpu/drm/armada/armada_drv.c b/drivers/gpu/drm/armada/armada_drv.c
index 142668cd6d7cd..a9e6f1647a32c 100644
--- a/drivers/gpu/drm/armada/armada_drv.c
+++ b/drivers/gpu/drm/armada/armada_drv.c
@@ -38,8 +38,6 @@ DEFINE_DRM_GEM_FOPS(armada_drm_fops);
 
 static const struct drm_driver armada_drm_driver = {
 	.lastclose		= drm_fb_helper_lastclose,
-	.prime_handle_to_fd	= drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle	= drm_gem_prime_fd_to_handle,
 	.gem_prime_import	= armada_gem_prime_import,
 	.dumb_create		= armada_gem_dumb_create,
 	.major			= 1,
diff --git a/drivers/gpu/drm/drm_client.c b/drivers/gpu/drm/drm_client.c
index dcbeeb68ca641..11de319ac6d90 100644
--- a/drivers/gpu/drm/drm_client.c
+++ b/drivers/gpu/drm/drm_client.c
@@ -344,7 +344,7 @@ drm_client_buffer_vmap(struct drm_client_buffer *buffer,
 	 * fd_install step out of the driver backend hooks, to make that
 	 * final step optional for internal users.
 	 */
-	ret = drm_gem_vmap(buffer->gem, map);
+	ret = drm_gem_vmap_unlocked(buffer->gem, map);
 	if (ret)
 		return ret;
 
@@ -366,7 +366,7 @@ void drm_client_buffer_vunmap(struct drm_client_buffer *buffer)
 {
 	struct iosys_map *map = &buffer->map;
 
-	drm_gem_vunmap(buffer->gem, map);
+	drm_gem_vunmap_unlocked(buffer->gem, map);
 }
 EXPORT_SYMBOL(drm_client_buffer_vunmap);
 
diff --git a/drivers/gpu/drm/drm_exec.c b/drivers/gpu/drm/drm_exec.c
new file mode 100644
index 0000000000000..48ee851b61d90
--- /dev/null
+++ b/drivers/gpu/drm/drm_exec.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+
+#include <drm/drm_exec.h>
+#include <drm/drm_gem.h>
+#include <linux/dma-resv.h>
+
+/**
+ * DOC: Overview
+ *
+ * This component mainly abstracts the retry loop necessary for locking
+ * multiple GEM objects while preparing hardware operations (e.g. command
+ * submissions, page table updates etc..).
+ *
+ * If a contention is detected while locking a GEM object the cleanup procedure
+ * unlocks all previously locked GEM objects and locks the contended one first
+ * before locking any further objects.
+ *
+ * After an object is locked fences slots can optionally be reserved on the
+ * dma_resv object inside the GEM object.
+ *
+ * A typical usage pattern should look like this::
+ *
+ *	struct drm_gem_object *obj;
+ *	struct drm_exec exec;
+ *	unsigned long index;
+ *	int ret;
+ *
+ *	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
+ *	drm_exec_until_all_locked(&exec) {
+ *		ret = drm_exec_prepare_obj(&exec, boA, 1);
+ *		drm_exec_retry_on_contention(&exec);
+ *		if (ret)
+ *			goto error;
+ *
+ *		ret = drm_exec_prepare_obj(&exec, boB, 1);
+ *		drm_exec_retry_on_contention(&exec);
+ *		if (ret)
+ *			goto error;
+ *	}
+ *
+ *	drm_exec_for_each_locked_object(&exec, index, obj) {
+ *		dma_resv_add_fence(obj->resv, fence, DMA_RESV_USAGE_READ);
+ *		...
+ *	}
+ *	drm_exec_fini(&exec);
+ *
+ * See struct dma_exec for more details.
+ */
+
+/* Dummy value used to initially enter the retry loop */
+#define DRM_EXEC_DUMMY ((void *)~0)
+
+/* Unlock all objects and drop references */
+static void drm_exec_unlock_all(struct drm_exec *exec)
+{
+	struct drm_gem_object *obj;
+	unsigned long index;
+
+	drm_exec_for_each_locked_object_reverse(exec, index, obj) {
+		dma_resv_unlock(obj->resv);
+		drm_gem_object_put(obj);
+	}
+
+	drm_gem_object_put(exec->prelocked);
+	exec->prelocked = NULL;
+}
+
+/**
+ * drm_exec_init - initialize a drm_exec object
+ * @exec: the drm_exec object to initialize
+ * @flags: controls locking behavior, see DRM_EXEC_* defines
+ * @nr: the initial # of objects
+ *
+ * Initialize the object and make sure that we can track locked objects.
+ *
+ * If nr is non-zero then it is used as the initial objects table size.
+ * In either case, the table will grow (be re-allocated) on demand.
+ */
+void drm_exec_init(struct drm_exec *exec, uint32_t flags, unsigned nr)
+{
+	if (!nr)
+		nr = PAGE_SIZE / sizeof(void *);
+
+	exec->flags = flags;
+	exec->objects = kvmalloc_array(nr, sizeof(void *), GFP_KERNEL);
+
+	/* If allocation here fails, just delay that till the first use */
+	exec->max_objects = exec->objects ? nr : 0;
+	exec->num_objects = 0;
+	exec->contended = DRM_EXEC_DUMMY;
+	exec->prelocked = NULL;
+}
+EXPORT_SYMBOL(drm_exec_init);
+
+/**
+ * drm_exec_fini - finalize a drm_exec object
+ * @exec: the drm_exec object to finalize
+ *
+ * Unlock all locked objects, drop the references to objects and free all memory
+ * used for tracking the state.
+ */
+void drm_exec_fini(struct drm_exec *exec)
+{
+	drm_exec_unlock_all(exec);
+	kvfree(exec->objects);
+	if (exec->contended != DRM_EXEC_DUMMY) {
+		drm_gem_object_put(exec->contended);
+		ww_acquire_fini(&exec->ticket);
+	}
+}
+EXPORT_SYMBOL(drm_exec_fini);
+
+/**
+ * drm_exec_cleanup - cleanup when contention is detected
+ * @exec: the drm_exec object to cleanup
+ *
+ * Cleanup the current state and return true if we should stay inside the retry
+ * loop, false if there wasn't any contention detected and we can keep the
+ * objects locked.
+ */
+bool drm_exec_cleanup(struct drm_exec *exec)
+{
+	if (likely(!exec->contended)) {
+		ww_acquire_done(&exec->ticket);
+		return false;
+	}
+
+	if (likely(exec->contended == DRM_EXEC_DUMMY)) {
+		exec->contended = NULL;
+		ww_acquire_init(&exec->ticket, &reservation_ww_class);
+		return true;
+	}
+
+	drm_exec_unlock_all(exec);
+	exec->num_objects = 0;
+	return true;
+}
+EXPORT_SYMBOL(drm_exec_cleanup);
+
+/* Track the locked object in the array */
+static int drm_exec_obj_locked(struct drm_exec *exec,
+			       struct drm_gem_object *obj)
+{
+	if (unlikely(exec->num_objects == exec->max_objects)) {
+		size_t size = exec->max_objects * sizeof(void *);
+		void *tmp;
+
+		tmp = kvrealloc(exec->objects, size, size + PAGE_SIZE,
+				GFP_KERNEL);
+		if (!tmp)
+			return -ENOMEM;
+
+		exec->objects = tmp;
+		exec->max_objects += PAGE_SIZE / sizeof(void *);
+	}
+	drm_gem_object_get(obj);
+	exec->objects[exec->num_objects++] = obj;
+
+	return 0;
+}
+
+/* Make sure the contended object is locked first */
+static int drm_exec_lock_contended(struct drm_exec *exec)
+{
+	struct drm_gem_object *obj = exec->contended;
+	int ret;
+
+	if (likely(!obj))
+		return 0;
+
+	/* Always cleanup the contention so that error handling can kick in */
+	exec->contended = NULL;
+	if (exec->flags & DRM_EXEC_INTERRUPTIBLE_WAIT) {
+		ret = dma_resv_lock_slow_interruptible(obj->resv,
+						       &exec->ticket);
+		if (unlikely(ret))
+			goto error_dropref;
+	} else {
+		dma_resv_lock_slow(obj->resv, &exec->ticket);
+	}
+
+	ret = drm_exec_obj_locked(exec, obj);
+	if (unlikely(ret))
+		goto error_unlock;
+
+	exec->prelocked = obj;
+	return 0;
+
+error_unlock:
+	dma_resv_unlock(obj->resv);
+
+error_dropref:
+	drm_gem_object_put(obj);
+	return ret;
+}
+
+/**
+ * drm_exec_lock_obj - lock a GEM object for use
+ * @exec: the drm_exec object with the state
+ * @obj: the GEM object to lock
+ *
+ * Lock a GEM object for use and grab a reference to it.
+ *
+ * Returns: -EDEADLK if a contention is detected, -EALREADY when object is
+ * already locked (can be suppressed by setting the DRM_EXEC_IGNORE_DUPLICATES
+ * flag), -ENOMEM when memory allocation failed and zero for success.
+ */
+int drm_exec_lock_obj(struct drm_exec *exec, struct drm_gem_object *obj)
+{
+	int ret;
+
+	ret = drm_exec_lock_contended(exec);
+	if (unlikely(ret))
+		return ret;
+
+	if (exec->prelocked == obj) {
+		drm_gem_object_put(exec->prelocked);
+		exec->prelocked = NULL;
+		return 0;
+	}
+
+	if (exec->flags & DRM_EXEC_INTERRUPTIBLE_WAIT)
+		ret = dma_resv_lock_interruptible(obj->resv, &exec->ticket);
+	else
+		ret = dma_resv_lock(obj->resv, &exec->ticket);
+
+	if (unlikely(ret == -EDEADLK)) {
+		drm_gem_object_get(obj);
+		exec->contended = obj;
+		return -EDEADLK;
+	}
+
+	if (unlikely(ret == -EALREADY) &&
+	    exec->flags & DRM_EXEC_IGNORE_DUPLICATES)
+		return 0;
+
+	if (unlikely(ret))
+		return ret;
+
+	ret = drm_exec_obj_locked(exec, obj);
+	if (ret)
+		goto error_unlock;
+
+	return 0;
+
+error_unlock:
+	dma_resv_unlock(obj->resv);
+	return ret;
+}
+EXPORT_SYMBOL(drm_exec_lock_obj);
+
+/**
+ * drm_exec_unlock_obj - unlock a GEM object in this exec context
+ * @exec: the drm_exec object with the state
+ * @obj: the GEM object to unlock
+ *
+ * Unlock the GEM object and remove it from the collection of locked objects.
+ * Should only be used to unlock the most recently locked objects. It's not time
+ * efficient to unlock objects locked long ago.
+ */
+void drm_exec_unlock_obj(struct drm_exec *exec, struct drm_gem_object *obj)
+{
+	unsigned int i;
+
+	for (i = exec->num_objects; i--;) {
+		if (exec->objects[i] == obj) {
+			dma_resv_unlock(obj->resv);
+			for (++i; i < exec->num_objects; ++i)
+				exec->objects[i - 1] = exec->objects[i];
+			--exec->num_objects;
+			drm_gem_object_put(obj);
+			return;
+		}
+
+	}
+}
+EXPORT_SYMBOL(drm_exec_unlock_obj);
+
+/**
+ * drm_exec_prepare_obj - prepare a GEM object for use
+ * @exec: the drm_exec object with the state
+ * @obj: the GEM object to prepare
+ * @num_fences: how many fences to reserve
+ *
+ * Prepare a GEM object for use by locking it and reserving fence slots.
+ *
+ * Returns: -EDEADLK if a contention is detected, -EALREADY when object is
+ * already locked, -ENOMEM when memory allocation failed and zero for success.
+ */
+int drm_exec_prepare_obj(struct drm_exec *exec, struct drm_gem_object *obj,
+			 unsigned int num_fences)
+{
+	int ret;
+
+	ret = drm_exec_lock_obj(exec, obj);
+	if (ret)
+		return ret;
+
+	ret = dma_resv_reserve_fences(obj->resv, num_fences);
+	if (ret) {
+		drm_exec_unlock_obj(exec, obj);
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_exec_prepare_obj);
+
+/**
+ * drm_exec_prepare_array - helper to prepare an array of objects
+ * @exec: the drm_exec object with the state
+ * @objects: array of GEM object to prepare
+ * @num_objects: number of GEM objects in the array
+ * @num_fences: number of fences to reserve on each GEM object
+ *
+ * Prepares all GEM objects in an array, aborts on first error.
+ * Reserves @num_fences on each GEM object after locking it.
+ *
+ * Returns: -EDEADLOCK on contention, -EALREADY when object is already locked,
+ * -ENOMEM when memory allocation failed and zero for success.
+ */
+int drm_exec_prepare_array(struct drm_exec *exec,
+			   struct drm_gem_object **objects,
+			   unsigned int num_objects,
+			   unsigned int num_fences)
+{
+	int ret;
+
+	for (unsigned int i = 0; i < num_objects; ++i) {
+		ret = drm_exec_prepare_obj(exec, objects[i], num_fences);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_exec_prepare_array);
+
+MODULE_DESCRIPTION("DRM execution context");
+MODULE_LICENSE("Dual MIT/GPL");
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index b87ed4238fc83..482a7964b851c 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -164,11 +164,28 @@ void drm_gem_private_object_init(struct drm_device *dev,
 	if (!obj->resv)
 		obj->resv = &obj->_resv;
 
+	if (drm_core_check_feature(dev, DRIVER_GEM_GPUVA))
+		drm_gem_gpuva_init(obj);
+
 	drm_vma_node_reset(&obj->vma_node);
 	INIT_LIST_HEAD(&obj->lru_node);
 }
 EXPORT_SYMBOL(drm_gem_private_object_init);
 
+/**
+ * drm_gem_private_object_fini - Finalize a failed drm_gem_object
+ * @obj: drm_gem_object
+ *
+ * Uninitialize an already allocated GEM object when it initialized failed
+ */
+void drm_gem_private_object_fini(struct drm_gem_object *obj)
+{
+	WARN_ON(obj->dma_buf);
+
+	dma_resv_fini(&obj->_resv);
+}
+EXPORT_SYMBOL(drm_gem_private_object_fini);
+
 /**
  * drm_gem_object_handle_free - release resources bound to userspace handles
  * @obj: GEM object to clean up.
@@ -930,12 +947,11 @@ drm_gem_release(struct drm_device *dev, struct drm_file *file_private)
 void
 drm_gem_object_release(struct drm_gem_object *obj)
 {
-	WARN_ON(obj->dma_buf);
-
 	if (obj->filp)
 		fput(obj->filp);
 
-	dma_resv_fini(&obj->_resv);
+	drm_gem_private_object_fini(obj);
+
 	drm_gem_free_mmap_offset(obj);
 	drm_gem_lru_remove(obj);
 }
@@ -1158,6 +1174,8 @@ int drm_gem_vmap(struct drm_gem_object *obj, struct iosys_map *map)
 {
 	int ret;
 
+	dma_resv_assert_held(obj->resv);
+
 	if (!obj->funcs->vmap)
 		return -EOPNOTSUPP;
 
@@ -1173,6 +1191,8 @@ EXPORT_SYMBOL(drm_gem_vmap);
 
 void drm_gem_vunmap(struct drm_gem_object *obj, struct iosys_map *map)
 {
+	dma_resv_assert_held(obj->resv);
+
 	if (iosys_map_is_null(map))
 		return;
 
@@ -1184,6 +1204,26 @@ void drm_gem_vunmap(struct drm_gem_object *obj, struct iosys_map *map)
 }
 EXPORT_SYMBOL(drm_gem_vunmap);
 
+int drm_gem_vmap_unlocked(struct drm_gem_object *obj, struct iosys_map *map)
+{
+	int ret;
+
+	dma_resv_lock(obj->resv, NULL);
+	ret = drm_gem_vmap(obj, map);
+	dma_resv_unlock(obj->resv);
+
+	return ret;
+}
+EXPORT_SYMBOL(drm_gem_vmap_unlocked);
+
+void drm_gem_vunmap_unlocked(struct drm_gem_object *obj, struct iosys_map *map)
+{
+	dma_resv_lock(obj->resv, NULL);
+	drm_gem_vunmap(obj, map);
+	dma_resv_unlock(obj->resv);
+}
+EXPORT_SYMBOL(drm_gem_vunmap_unlocked);
+
 /**
  * drm_gem_lock_reservations - Sets up the ww context and acquires
  * the lock on an array of GEM objects.
diff --git a/drivers/gpu/drm/drm_gem_dma_helper.c b/drivers/gpu/drm/drm_gem_dma_helper.c
index f6901ff97bbb5..df89fbd2d35cd 100644
--- a/drivers/gpu/drm/drm_gem_dma_helper.c
+++ b/drivers/gpu/drm/drm_gem_dma_helper.c
@@ -230,7 +230,7 @@ void drm_gem_dma_free(struct drm_gem_dma_object *dma_obj)
 
 	if (gem_obj->import_attach) {
 		if (dma_obj->vaddr)
-			dma_buf_vunmap(gem_obj->import_attach->dmabuf, &map);
+			dma_buf_vunmap_unlocked(gem_obj->import_attach->dmabuf, &map);
 		drm_prime_gem_destroy(gem_obj, dma_obj->sgt);
 	} else if (dma_obj->vaddr) {
 		if (dma_obj->map_noncoherent)
@@ -477,8 +477,8 @@ drm_gem_dma_prime_import_sg_table(struct drm_device *dev,
 	dma_obj->dma_addr = sg_dma_address(sgt->sgl);
 	dma_obj->sgt = sgt;
 
-	DRM_DEBUG_PRIME("dma_addr = %pad, size = %zu\n", &dma_obj->dma_addr,
-			attach->dmabuf->size);
+	drm_dbg_prime(dev, "dma_addr = %pad, size = %zu\n", &dma_obj->dma_addr,
+		      attach->dmabuf->size);
 
 	return &dma_obj->base;
 }
@@ -581,7 +581,7 @@ drm_gem_dma_prime_import_sg_table_vmap(struct drm_device *dev,
 	struct iosys_map map;
 	int ret;
 
-	ret = dma_buf_vmap(attach->dmabuf, &map);
+	ret = dma_buf_vmap_unlocked(attach->dmabuf, &map);
 	if (ret) {
 		DRM_ERROR("Failed to vmap PRIME buffer\n");
 		return ERR_PTR(ret);
@@ -589,7 +589,7 @@ drm_gem_dma_prime_import_sg_table_vmap(struct drm_device *dev,
 
 	obj = drm_gem_dma_prime_import_sg_table(dev, attach, sgt);
 	if (IS_ERR(obj)) {
-		dma_buf_vunmap(attach->dmabuf, &map);
+		dma_buf_vunmap_unlocked(attach->dmabuf, &map);
 		return obj;
 	}
 
diff --git a/drivers/gpu/drm/drm_gem_framebuffer_helper.c b/drivers/gpu/drm/drm_gem_framebuffer_helper.c
index 880a4975507fc..e35e224e6303a 100644
--- a/drivers/gpu/drm/drm_gem_framebuffer_helper.c
+++ b/drivers/gpu/drm/drm_gem_framebuffer_helper.c
@@ -354,7 +354,7 @@ int drm_gem_fb_vmap(struct drm_framebuffer *fb, struct iosys_map *map,
 			ret = -EINVAL;
 			goto err_drm_gem_vunmap;
 		}
-		ret = drm_gem_vmap(obj, &map[i]);
+		ret = drm_gem_vmap_unlocked(obj, &map[i]);
 		if (ret)
 			goto err_drm_gem_vunmap;
 	}
@@ -376,7 +376,7 @@ int drm_gem_fb_vmap(struct drm_framebuffer *fb, struct iosys_map *map,
 		obj = drm_gem_fb_get_obj(fb, i);
 		if (!obj)
 			continue;
-		drm_gem_vunmap(obj, &map[i]);
+		drm_gem_vunmap_unlocked(obj, &map[i]);
 	}
 	return ret;
 }
@@ -403,7 +403,7 @@ void drm_gem_fb_vunmap(struct drm_framebuffer *fb, struct iosys_map *map)
 			continue;
 		if (iosys_map_is_null(&map[i]))
 			continue;
-		drm_gem_vunmap(obj, &map[i]);
+		drm_gem_vunmap_unlocked(obj, &map[i]);
 	}
 }
 EXPORT_SYMBOL(drm_gem_fb_vunmap);
diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c
index 5fdc608043e76..926e893f65276 100644
--- a/drivers/gpu/drm/drm_gem_shmem_helper.c
+++ b/drivers/gpu/drm/drm_gem_shmem_helper.c
@@ -79,15 +79,15 @@ __drm_gem_shmem_create(struct drm_device *dev, size_t size, bool private)
 	} else {
 		ret = drm_gem_object_init(dev, obj, size);
 	}
-	if (ret)
+	if (ret) {
+		drm_gem_private_object_fini(obj);
 		goto err_free;
+	}
 
 	ret = drm_gem_create_mmap_offset(obj);
 	if (ret)
 		goto err_release;
 
-	mutex_init(&shmem->pages_lock);
-	mutex_init(&shmem->vmap_lock);
 	INIT_LIST_HEAD(&shmem->madv_list);
 
 	if (!private) {
@@ -139,11 +139,13 @@ void drm_gem_shmem_free(struct drm_gem_shmem_object *shmem)
 {
 	struct drm_gem_object *obj = &shmem->base;
 
-	WARN_ON(shmem->vmap_use_count);
-
 	if (obj->import_attach) {
 		drm_prime_gem_destroy(obj, shmem->sgt);
 	} else {
+		dma_resv_lock(shmem->base.resv, NULL);
+
+		drm_WARN_ON(obj->dev, shmem->vmap_use_count);
+
 		if (shmem->sgt) {
 			dma_unmap_sgtable(obj->dev->dev, shmem->sgt,
 					  DMA_BIDIRECTIONAL, 0);
@@ -152,28 +154,31 @@ void drm_gem_shmem_free(struct drm_gem_shmem_object *shmem)
 		}
 		if (shmem->pages)
 			drm_gem_shmem_put_pages(shmem);
-	}
 
-	WARN_ON(shmem->pages_use_count);
+		drm_WARN_ON(obj->dev, shmem->pages_use_count);
+
+		dma_resv_unlock(shmem->base.resv);
+	}
 
 	drm_gem_object_release(obj);
-	mutex_destroy(&shmem->pages_lock);
-	mutex_destroy(&shmem->vmap_lock);
 	kfree(shmem);
 }
 EXPORT_SYMBOL_GPL(drm_gem_shmem_free);
 
-static int drm_gem_shmem_get_pages_locked(struct drm_gem_shmem_object *shmem)
+static int drm_gem_shmem_get_pages(struct drm_gem_shmem_object *shmem)
 {
 	struct drm_gem_object *obj = &shmem->base;
 	struct page **pages;
 
+	dma_resv_assert_held(shmem->base.resv);
+
 	if (shmem->pages_use_count++ > 0)
 		return 0;
 
 	pages = drm_gem_get_pages(obj);
 	if (IS_ERR(pages)) {
-		DRM_DEBUG_KMS("Failed to get pages (%ld)\n", PTR_ERR(pages));
+		drm_dbg_kms(obj->dev, "Failed to get pages (%ld)\n",
+			    PTR_ERR(pages));
 		shmem->pages_use_count = 0;
 		return PTR_ERR(pages);
 	}
@@ -194,36 +199,18 @@ static int drm_gem_shmem_get_pages_locked(struct drm_gem_shmem_object *shmem)
 }
 
 /*
- * drm_gem_shmem_get_pages - Allocate backing pages for a shmem GEM object
+ * drm_gem_shmem_put_pages - Decrease use count on the backing pages for a shmem GEM object
  * @shmem: shmem GEM object
  *
- * This function makes sure that backing pages exists for the shmem GEM object
- * and increases the use count.
- *
- * Returns:
- * 0 on success or a negative error code on failure.
+ * This function decreases the use count and puts the backing pages when use drops to zero.
  */
-int drm_gem_shmem_get_pages(struct drm_gem_shmem_object *shmem)
-{
-	int ret;
-
-	WARN_ON(shmem->base.import_attach);
-
-	ret = mutex_lock_interruptible(&shmem->pages_lock);
-	if (ret)
-		return ret;
-	ret = drm_gem_shmem_get_pages_locked(shmem);
-	mutex_unlock(&shmem->pages_lock);
-
-	return ret;
-}
-EXPORT_SYMBOL(drm_gem_shmem_get_pages);
-
-static void drm_gem_shmem_put_pages_locked(struct drm_gem_shmem_object *shmem)
+void drm_gem_shmem_put_pages(struct drm_gem_shmem_object *shmem)
 {
 	struct drm_gem_object *obj = &shmem->base;
 
-	if (WARN_ON_ONCE(!shmem->pages_use_count))
+	dma_resv_assert_held(shmem->base.resv);
+
+	if (drm_WARN_ON_ONCE(obj->dev, !shmem->pages_use_count))
 		return;
 
 	if (--shmem->pages_use_count > 0)
@@ -239,20 +226,25 @@ static void drm_gem_shmem_put_pages_locked(struct drm_gem_shmem_object *shmem)
 			  shmem->pages_mark_accessed_on_put);
 	shmem->pages = NULL;
 }
+EXPORT_SYMBOL(drm_gem_shmem_put_pages);
 
-/*
- * drm_gem_shmem_put_pages - Decrease use count on the backing pages for a shmem GEM object
- * @shmem: shmem GEM object
- *
- * This function decreases the use count and puts the backing pages when use drops to zero.
- */
-void drm_gem_shmem_put_pages(struct drm_gem_shmem_object *shmem)
+static int drm_gem_shmem_pin_locked(struct drm_gem_shmem_object *shmem)
+{
+	int ret;
+
+	dma_resv_assert_held(shmem->base.resv);
+
+	ret = drm_gem_shmem_get_pages(shmem);
+
+	return ret;
+}
+
+static void drm_gem_shmem_unpin_locked(struct drm_gem_shmem_object *shmem)
 {
-	mutex_lock(&shmem->pages_lock);
-	drm_gem_shmem_put_pages_locked(shmem);
-	mutex_unlock(&shmem->pages_lock);
+	dma_resv_assert_held(shmem->base.resv);
+
+	drm_gem_shmem_put_pages(shmem);
 }
-EXPORT_SYMBOL(drm_gem_shmem_put_pages);
 
 /**
  * drm_gem_shmem_pin - Pin backing pages for a shmem GEM object
@@ -266,9 +258,18 @@ EXPORT_SYMBOL(drm_gem_shmem_put_pages);
  */
 int drm_gem_shmem_pin(struct drm_gem_shmem_object *shmem)
 {
-	WARN_ON(shmem->base.import_attach);
+	struct drm_gem_object *obj = &shmem->base;
+	int ret;
+
+	drm_WARN_ON(obj->dev, obj->import_attach);
 
-	return drm_gem_shmem_get_pages(shmem);
+	ret = dma_resv_lock_interruptible(shmem->base.resv, NULL);
+	if (ret)
+		return ret;
+	ret = drm_gem_shmem_pin_locked(shmem);
+	dma_resv_unlock(shmem->base.resv);
+
+	return ret;
 }
 EXPORT_SYMBOL(drm_gem_shmem_pin);
 
@@ -281,36 +282,55 @@ EXPORT_SYMBOL(drm_gem_shmem_pin);
  */
 void drm_gem_shmem_unpin(struct drm_gem_shmem_object *shmem)
 {
-	WARN_ON(shmem->base.import_attach);
+	struct drm_gem_object *obj = &shmem->base;
 
-	drm_gem_shmem_put_pages(shmem);
+	drm_WARN_ON(obj->dev, obj->import_attach);
+
+	dma_resv_lock(shmem->base.resv, NULL);
+	drm_gem_shmem_unpin_locked(shmem);
+	dma_resv_unlock(shmem->base.resv);
 }
 EXPORT_SYMBOL(drm_gem_shmem_unpin);
 
-static int drm_gem_shmem_vmap_locked(struct drm_gem_shmem_object *shmem,
-				     struct iosys_map *map)
+/*
+ * drm_gem_shmem_vmap - Create a virtual mapping for a shmem GEM object
+ * @shmem: shmem GEM object
+ * @map: Returns the kernel virtual address of the SHMEM GEM object's backing
+ *       store.
+ *
+ * This function makes sure that a contiguous kernel virtual address mapping
+ * exists for the buffer backing the shmem GEM object. It hides the differences
+ * between dma-buf imported and natively allocated objects.
+ *
+ * Acquired mappings should be cleaned up by calling drm_gem_shmem_vunmap().
+ *
+ * Returns:
+ * 0 on success or a negative error code on failure.
+ */
+int drm_gem_shmem_vmap(struct drm_gem_shmem_object *shmem,
+		       struct iosys_map *map)
 {
 	struct drm_gem_object *obj = &shmem->base;
 	int ret = 0;
 
-	if (shmem->vmap_use_count++ > 0) {
-		iosys_map_set_vaddr(map, shmem->vaddr);
-		return 0;
-	}
-
 	if (obj->import_attach) {
 		ret = dma_buf_vmap(obj->import_attach->dmabuf, map);
 		if (!ret) {
-			if (WARN_ON(map->is_iomem)) {
+			if (drm_WARN_ON(obj->dev, map->is_iomem)) {
 				dma_buf_vunmap(obj->import_attach->dmabuf, map);
-				ret = -EIO;
-				goto err_put_pages;
+				return -EIO;
 			}
-			shmem->vaddr = map->vaddr;
 		}
 	} else {
 		pgprot_t prot = PAGE_KERNEL;
 
+		dma_resv_assert_held(shmem->base.resv);
+
+		if (shmem->vmap_use_count++ > 0) {
+			iosys_map_set_vaddr(map, shmem->vaddr);
+			return 0;
+		}
+
 		ret = drm_gem_shmem_get_pages(shmem);
 		if (ret)
 			goto err_zero_use;
@@ -326,7 +346,7 @@ static int drm_gem_shmem_vmap_locked(struct drm_gem_shmem_object *shmem,
 	}
 
 	if (ret) {
-		DRM_DEBUG_KMS("Failed to vmap pages, error %d\n", ret);
+		drm_dbg_kms(obj->dev, "Failed to vmap pages, error %d\n", ret);
 		goto err_put_pages;
 	}
 
@@ -340,58 +360,8 @@ static int drm_gem_shmem_vmap_locked(struct drm_gem_shmem_object *shmem,
 
 	return ret;
 }
-
-/*
- * drm_gem_shmem_vmap - Create a virtual mapping for a shmem GEM object
- * @shmem: shmem GEM object
- * @map: Returns the kernel virtual address of the SHMEM GEM object's backing
- *       store.
- *
- * This function makes sure that a contiguous kernel virtual address mapping
- * exists for the buffer backing the shmem GEM object. It hides the differences
- * between dma-buf imported and natively allocated objects.
- *
- * Acquired mappings should be cleaned up by calling drm_gem_shmem_vunmap().
- *
- * Returns:
- * 0 on success or a negative error code on failure.
- */
-int drm_gem_shmem_vmap(struct drm_gem_shmem_object *shmem,
-		       struct iosys_map *map)
-{
-	int ret;
-
-	ret = mutex_lock_interruptible(&shmem->vmap_lock);
-	if (ret)
-		return ret;
-	ret = drm_gem_shmem_vmap_locked(shmem, map);
-	mutex_unlock(&shmem->vmap_lock);
-
-	return ret;
-}
 EXPORT_SYMBOL(drm_gem_shmem_vmap);
 
-static void drm_gem_shmem_vunmap_locked(struct drm_gem_shmem_object *shmem,
-					struct iosys_map *map)
-{
-	struct drm_gem_object *obj = &shmem->base;
-
-	if (WARN_ON_ONCE(!shmem->vmap_use_count))
-		return;
-
-	if (--shmem->vmap_use_count > 0)
-		return;
-
-	if (obj->import_attach) {
-		dma_buf_vunmap(obj->import_attach->dmabuf, map);
-	} else {
-		vunmap(shmem->vaddr);
-		drm_gem_shmem_put_pages(shmem);
-	}
-
-	shmem->vaddr = NULL;
-}
-
 /*
  * drm_gem_shmem_vunmap - Unmap a virtual mapping for a shmem GEM object
  * @shmem: shmem GEM object
@@ -407,13 +377,28 @@ static void drm_gem_shmem_vunmap_locked(struct drm_gem_shmem_object *shmem,
 void drm_gem_shmem_vunmap(struct drm_gem_shmem_object *shmem,
 			  struct iosys_map *map)
 {
-	mutex_lock(&shmem->vmap_lock);
-	drm_gem_shmem_vunmap_locked(shmem, map);
-	mutex_unlock(&shmem->vmap_lock);
+	struct drm_gem_object *obj = &shmem->base;
+
+	if (obj->import_attach) {
+		dma_buf_vunmap(obj->import_attach->dmabuf, map);
+	} else {
+		dma_resv_assert_held(shmem->base.resv);
+
+		if (drm_WARN_ON_ONCE(obj->dev, !shmem->vmap_use_count))
+			return;
+
+		if (--shmem->vmap_use_count > 0)
+			return;
+
+		vunmap(shmem->vaddr);
+		drm_gem_shmem_put_pages(shmem);
+	}
+
+	shmem->vaddr = NULL;
 }
 EXPORT_SYMBOL(drm_gem_shmem_vunmap);
 
-static struct drm_gem_shmem_object *
+static int
 drm_gem_shmem_create_with_handle(struct drm_file *file_priv,
 				 struct drm_device *dev, size_t size,
 				 uint32_t *handle)
@@ -423,7 +408,7 @@ drm_gem_shmem_create_with_handle(struct drm_file *file_priv,
 
 	shmem = drm_gem_shmem_create(dev, size);
 	if (IS_ERR(shmem))
-		return shmem;
+		return PTR_ERR(shmem);
 
 	/*
 	 * Allocate an id of idr table where the obj is registered
@@ -432,10 +417,8 @@ drm_gem_shmem_create_with_handle(struct drm_file *file_priv,
 	ret = drm_gem_handle_create(file_priv, &shmem->base, handle);
 	/* drop reference from allocate - handle holds it now. */
 	drm_gem_object_put(&shmem->base);
-	if (ret)
-		return ERR_PTR(ret);
 
-	return shmem;
+	return ret;
 }
 
 /* Update madvise status, returns true if not purged, else
@@ -443,32 +426,32 @@ drm_gem_shmem_create_with_handle(struct drm_file *file_priv,
  */
 int drm_gem_shmem_madvise(struct drm_gem_shmem_object *shmem, int madv)
 {
-	mutex_lock(&shmem->pages_lock);
+	dma_resv_assert_held(shmem->base.resv);
 
 	if (shmem->madv >= 0)
 		shmem->madv = madv;
 
 	madv = shmem->madv;
 
-	mutex_unlock(&shmem->pages_lock);
-
 	return (madv >= 0);
 }
 EXPORT_SYMBOL(drm_gem_shmem_madvise);
 
-void drm_gem_shmem_purge_locked(struct drm_gem_shmem_object *shmem)
+void drm_gem_shmem_purge(struct drm_gem_shmem_object *shmem)
 {
 	struct drm_gem_object *obj = &shmem->base;
 	struct drm_device *dev = obj->dev;
 
-	WARN_ON(!drm_gem_shmem_is_purgeable(shmem));
+	dma_resv_assert_held(shmem->base.resv);
+
+	drm_WARN_ON(obj->dev, !drm_gem_shmem_is_purgeable(shmem));
 
 	dma_unmap_sgtable(dev->dev, shmem->sgt, DMA_BIDIRECTIONAL, 0);
 	sg_free_table(shmem->sgt);
 	kfree(shmem->sgt);
 	shmem->sgt = NULL;
 
-	drm_gem_shmem_put_pages_locked(shmem);
+	drm_gem_shmem_put_pages(shmem);
 
 	shmem->madv = -1;
 
@@ -484,17 +467,6 @@ void drm_gem_shmem_purge_locked(struct drm_gem_shmem_object *shmem)
 
 	invalidate_mapping_pages(file_inode(obj->filp)->i_mapping, 0, (loff_t)-1);
 }
-EXPORT_SYMBOL(drm_gem_shmem_purge_locked);
-
-bool drm_gem_shmem_purge(struct drm_gem_shmem_object *shmem)
-{
-	if (!mutex_trylock(&shmem->pages_lock))
-		return false;
-	drm_gem_shmem_purge_locked(shmem);
-	mutex_unlock(&shmem->pages_lock);
-
-	return true;
-}
 EXPORT_SYMBOL(drm_gem_shmem_purge);
 
 /**
@@ -518,7 +490,6 @@ int drm_gem_shmem_dumb_create(struct drm_file *file, struct drm_device *dev,
 			      struct drm_mode_create_dumb *args)
 {
 	u32 min_pitch = DIV_ROUND_UP(args->width * args->bpp, 8);
-	struct drm_gem_shmem_object *shmem;
 
 	if (!args->pitch || !args->size) {
 		args->pitch = min_pitch;
@@ -531,9 +502,7 @@ int drm_gem_shmem_dumb_create(struct drm_file *file, struct drm_device *dev,
 			args->size = PAGE_ALIGN(args->pitch * args->height);
 	}
 
-	shmem = drm_gem_shmem_create_with_handle(file, dev, args->size, &args->handle);
-
-	return PTR_ERR_OR_ZERO(shmem);
+	return drm_gem_shmem_create_with_handle(file, dev, args->size, &args->handle);
 }
 EXPORT_SYMBOL_GPL(drm_gem_shmem_dumb_create);
 
@@ -550,10 +519,10 @@ static vm_fault_t drm_gem_shmem_fault(struct vm_fault *vmf)
 	/* We don't use vmf->pgoff since that has the fake offset */
 	page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
 
-	mutex_lock(&shmem->pages_lock);
+	dma_resv_lock(shmem->base.resv, NULL);
 
 	if (page_offset >= num_pages ||
-	    WARN_ON_ONCE(!shmem->pages) ||
+	    drm_WARN_ON_ONCE(obj->dev, !shmem->pages) ||
 	    shmem->madv < 0) {
 		ret = VM_FAULT_SIGBUS;
 	} else {
@@ -562,7 +531,7 @@ static vm_fault_t drm_gem_shmem_fault(struct vm_fault *vmf)
 		ret = vmf_insert_pfn(vma, vmf->address, page_to_pfn(page));
 	}
 
-	mutex_unlock(&shmem->pages_lock);
+	dma_resv_unlock(shmem->base.resv);
 
 	return ret;
 }
@@ -572,19 +541,19 @@ static void drm_gem_shmem_vm_open(struct vm_area_struct *vma)
 	struct drm_gem_object *obj = vma->vm_private_data;
 	struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj);
 
-	WARN_ON(shmem->base.import_attach);
+	drm_WARN_ON(obj->dev, obj->import_attach);
 
-	mutex_lock(&shmem->pages_lock);
+	dma_resv_lock(shmem->base.resv, NULL);
 
 	/*
 	 * We should have already pinned the pages when the buffer was first
 	 * mmap'd, vm_open() just grabs an additional reference for the new
 	 * mm the vma is getting copied into (ie. on fork()).
 	 */
-	if (!WARN_ON_ONCE(!shmem->pages_use_count))
+	if (!drm_WARN_ON_ONCE(obj->dev, !shmem->pages_use_count))
 		shmem->pages_use_count++;
 
-	mutex_unlock(&shmem->pages_lock);
+	dma_resv_unlock(shmem->base.resv);
 
 	drm_gem_vm_open(vma);
 }
@@ -594,7 +563,10 @@ static void drm_gem_shmem_vm_close(struct vm_area_struct *vma)
 	struct drm_gem_object *obj = vma->vm_private_data;
 	struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj);
 
+	dma_resv_lock(shmem->base.resv, NULL);
 	drm_gem_shmem_put_pages(shmem);
+	dma_resv_unlock(shmem->base.resv);
+
 	drm_gem_vm_close(vma);
 }
 
@@ -622,7 +594,13 @@ int drm_gem_shmem_mmap(struct drm_gem_shmem_object *shmem, struct vm_area_struct
 	int ret;
 
 	if (obj->import_attach) {
+		/* Reset both vm_ops and vm_private_data, so we don't end up with
+		 * vm_ops pointing to our implementation if the dma-buf backend
+		 * doesn't set those fields.
+		 */
 		vma->vm_private_data = NULL;
+		vma->vm_ops = NULL;
+
 		ret = dma_buf_mmap(obj->dma_buf, vma, 0);
 
 		/* Drop the reference drm_gem_mmap_obj() acquired.*/
@@ -632,7 +610,10 @@ int drm_gem_shmem_mmap(struct drm_gem_shmem_object *shmem, struct vm_area_struct
 		return ret;
 	}
 
+	dma_resv_lock(shmem->base.resv, NULL);
 	ret = drm_gem_shmem_get_pages(shmem);
+	dma_resv_unlock(shmem->base.resv);
+
 	if (ret)
 		return ret;
 
@@ -654,6 +635,9 @@ EXPORT_SYMBOL_GPL(drm_gem_shmem_mmap);
 void drm_gem_shmem_print_info(const struct drm_gem_shmem_object *shmem,
 			      struct drm_printer *p, unsigned int indent)
 {
+	if (shmem->base.import_attach)
+		return;
+
 	drm_printf_indent(p, indent, "pages_use_count=%u\n", shmem->pages_use_count);
 	drm_printf_indent(p, indent, "vmap_use_count=%u\n", shmem->vmap_use_count);
 	drm_printf_indent(p, indent, "vaddr=%p\n", shmem->vaddr);
@@ -678,7 +662,7 @@ struct sg_table *drm_gem_shmem_get_sg_table(struct drm_gem_shmem_object *shmem)
 {
 	struct drm_gem_object *obj = &shmem->base;
 
-	WARN_ON(shmem->base.import_attach);
+	drm_WARN_ON(obj->dev, obj->import_attach);
 
 	return drm_prime_pages_to_sg(obj->dev, shmem->pages, obj->size >> PAGE_SHIFT);
 }
@@ -693,9 +677,9 @@ static struct sg_table *drm_gem_shmem_get_pages_sgt_locked(struct drm_gem_shmem_
 	if (shmem->sgt)
 		return shmem->sgt;
 
-	WARN_ON(obj->import_attach);
+	drm_WARN_ON(obj->dev, obj->import_attach);
 
-	ret = drm_gem_shmem_get_pages_locked(shmem);
+	ret = drm_gem_shmem_get_pages(shmem);
 	if (ret)
 		return ERR_PTR(ret);
 
@@ -717,7 +701,7 @@ static struct sg_table *drm_gem_shmem_get_pages_sgt_locked(struct drm_gem_shmem_
 	sg_free_table(sgt);
 	kfree(sgt);
 err_put_pages:
-	drm_gem_shmem_put_pages_locked(shmem);
+	drm_gem_shmem_put_pages(shmem);
 	return ERR_PTR(ret);
 }
 
@@ -742,11 +726,11 @@ struct sg_table *drm_gem_shmem_get_pages_sgt(struct drm_gem_shmem_object *shmem)
 	int ret;
 	struct sg_table *sgt;
 
-	ret = mutex_lock_interruptible(&shmem->pages_lock);
+	ret = dma_resv_lock_interruptible(shmem->base.resv, NULL);
 	if (ret)
 		return ERR_PTR(ret);
 	sgt = drm_gem_shmem_get_pages_sgt_locked(shmem);
-	mutex_unlock(&shmem->pages_lock);
+	dma_resv_unlock(shmem->base.resv);
 
 	return sgt;
 }
@@ -781,7 +765,7 @@ drm_gem_shmem_prime_import_sg_table(struct drm_device *dev,
 
 	shmem->sgt = sgt;
 
-	DRM_DEBUG_PRIME("size = %zu\n", size);
+	drm_dbg_prime(dev, "size = %zu\n", size);
 
 	return &shmem->base;
 }
diff --git a/drivers/gpu/drm/drm_gem_ttm_helper.c b/drivers/gpu/drm/drm_gem_ttm_helper.c
index e5fc875990c4f..d5962a34c01d5 100644
--- a/drivers/gpu/drm/drm_gem_ttm_helper.c
+++ b/drivers/gpu/drm/drm_gem_ttm_helper.c
@@ -64,13 +64,8 @@ int drm_gem_ttm_vmap(struct drm_gem_object *gem,
 		     struct iosys_map *map)
 {
 	struct ttm_buffer_object *bo = drm_gem_ttm_of_gem(gem);
-	int ret;
-
-	dma_resv_lock(gem->resv, NULL);
-	ret = ttm_bo_vmap(bo, map);
-	dma_resv_unlock(gem->resv);
 
-	return ret;
+	return ttm_bo_vmap(bo, map);
 }
 EXPORT_SYMBOL(drm_gem_ttm_vmap);
 
@@ -87,9 +82,7 @@ void drm_gem_ttm_vunmap(struct drm_gem_object *gem,
 {
 	struct ttm_buffer_object *bo = drm_gem_ttm_of_gem(gem);
 
-	dma_resv_lock(gem->resv, NULL);
 	ttm_bo_vunmap(bo, map);
-	dma_resv_unlock(gem->resv);
 }
 EXPORT_SYMBOL(drm_gem_ttm_vunmap);
 
diff --git a/drivers/gpu/drm/drm_gpuvm.c b/drivers/gpu/drm/drm_gpuvm.c
new file mode 100644
index 0000000000000..9387823c2dde4
--- /dev/null
+++ b/drivers/gpu/drm/drm_gpuvm.c
@@ -0,0 +1,2752 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *     Danilo Krummrich <dakr@redhat.com>
+ *
+ */
+
+#include <drm/drm_gpuvm.h>
+
+#include <linux/interval_tree_generic.h>
+#include <linux/mm.h>
+
+/**
+ * DOC: Overview
+ *
+ * The DRM GPU VA Manager, represented by struct drm_gpuvm keeps track of a
+ * GPU's virtual address (VA) space and manages the corresponding virtual
+ * mappings represented by &drm_gpuva objects. It also keeps track of the
+ * mapping's backing &drm_gem_object buffers.
+ *
+ * &drm_gem_object buffers maintain a list of &drm_gpuva objects representing
+ * all existent GPU VA mappings using this &drm_gem_object as backing buffer.
+ *
+ * GPU VAs can be flagged as sparse, such that drivers may use GPU VAs to also
+ * keep track of sparse PTEs in order to support Vulkan 'Sparse Resources'.
+ *
+ * The GPU VA manager internally uses a rb-tree to manage the
+ * &drm_gpuva mappings within a GPU's virtual address space.
+ *
+ * The &drm_gpuvm structure contains a special &drm_gpuva representing the
+ * portion of VA space reserved by the kernel. This node is initialized together
+ * with the GPU VA manager instance and removed when the GPU VA manager is
+ * destroyed.
+ *
+ * In a typical application drivers would embed struct drm_gpuvm and
+ * struct drm_gpuva within their own driver specific structures, there won't be
+ * any memory allocations of its own nor memory allocations of &drm_gpuva
+ * entries.
+ *
+ * The data structures needed to store &drm_gpuvas within the &drm_gpuvm are
+ * contained within struct drm_gpuva already. Hence, for inserting &drm_gpuva
+ * entries from within dma-fence signalling critical sections it is enough to
+ * pre-allocate the &drm_gpuva structures.
+ *
+ * &drm_gem_objects which are private to a single VM can share a common
+ * &dma_resv in order to improve locking efficiency (e.g. with &drm_exec).
+ * For this purpose drivers must pass a &drm_gem_object to drm_gpuvm_init(), in
+ * the following called 'resv object', which serves as the container of the
+ * GPUVM's shared &dma_resv. This resv object can be a driver specific
+ * &drm_gem_object, such as the &drm_gem_object containing the root page table,
+ * but it can also be a 'dummy' object, which can be allocated with
+ * drm_gpuvm_resv_object_alloc().
+ *
+ * In order to connect a struct drm_gpuva its backing &drm_gem_object each
+ * &drm_gem_object maintains a list of &drm_gpuvm_bo structures, and each
+ * &drm_gpuvm_bo contains a list of &drm_gpuva structures.
+ *
+ * A &drm_gpuvm_bo is an abstraction that represents a combination of a
+ * &drm_gpuvm and a &drm_gem_object. Every such combination should be unique.
+ * This is ensured by the API through drm_gpuvm_bo_obtain() and
+ * drm_gpuvm_bo_obtain_prealloc() which first look into the corresponding
+ * &drm_gem_object list of &drm_gpuvm_bos for an existing instance of this
+ * particular combination. If not existent a new instance is created and linked
+ * to the &drm_gem_object.
+ *
+ * &drm_gpuvm_bo structures, since unique for a given &drm_gpuvm, are also used
+ * as entry for the &drm_gpuvm's lists of external and evicted objects. Those
+ * lists are maintained in order to accelerate locking of dma-resv locks and
+ * validation of evicted objects bound in a &drm_gpuvm. For instance, all
+ * &drm_gem_object's &dma_resv of a given &drm_gpuvm can be locked by calling
+ * drm_gpuvm_exec_lock(). Once locked drivers can call drm_gpuvm_validate() in
+ * order to validate all evicted &drm_gem_objects. It is also possible to lock
+ * additional &drm_gem_objects by providing the corresponding parameters to
+ * drm_gpuvm_exec_lock() as well as open code the &drm_exec loop while making
+ * use of helper functions such as drm_gpuvm_prepare_range() or
+ * drm_gpuvm_prepare_objects().
+ *
+ * Every bound &drm_gem_object is treated as external object when its &dma_resv
+ * structure is different than the &drm_gpuvm's common &dma_resv structure.
+ */
+
+/**
+ * DOC: Split and Merge
+ *
+ * Besides its capability to manage and represent a GPU VA space, the
+ * GPU VA manager also provides functions to let the &drm_gpuvm calculate a
+ * sequence of operations to satisfy a given map or unmap request.
+ *
+ * Therefore the DRM GPU VA manager provides an algorithm implementing splitting
+ * and merging of existent GPU VA mappings with the ones that are requested to
+ * be mapped or unmapped. This feature is required by the Vulkan API to
+ * implement Vulkan 'Sparse Memory Bindings' - drivers UAPIs often refer to this
+ * as VM BIND.
+ *
+ * Drivers can call drm_gpuvm_sm_map() to receive a sequence of callbacks
+ * containing map, unmap and remap operations for a given newly requested
+ * mapping. The sequence of callbacks represents the set of operations to
+ * execute in order to integrate the new mapping cleanly into the current state
+ * of the GPU VA space.
+ *
+ * Depending on how the new GPU VA mapping intersects with the existent mappings
+ * of the GPU VA space the &drm_gpuvm_ops callbacks contain an arbitrary amount
+ * of unmap operations, a maximum of two remap operations and a single map
+ * operation. The caller might receive no callback at all if no operation is
+ * required, e.g. if the requested mapping already exists in the exact same way.
+ *
+ * The single map operation represents the original map operation requested by
+ * the caller.
+ *
+ * &drm_gpuva_op_unmap contains a 'keep' field, which indicates whether the
+ * &drm_gpuva to unmap is physically contiguous with the original mapping
+ * request. Optionally, if 'keep' is set, drivers may keep the actual page table
+ * entries for this &drm_gpuva, adding the missing page table entries only and
+ * update the &drm_gpuvm's view of things accordingly.
+ *
+ * Drivers may do the same optimization, namely delta page table updates, also
+ * for remap operations. This is possible since &drm_gpuva_op_remap consists of
+ * one unmap operation and one or two map operations, such that drivers can
+ * derive the page table update delta accordingly.
+ *
+ * Note that there can't be more than two existent mappings to split up, one at
+ * the beginning and one at the end of the new mapping, hence there is a
+ * maximum of two remap operations.
+ *
+ * Analogous to drm_gpuvm_sm_map() drm_gpuvm_sm_unmap() uses &drm_gpuvm_ops to
+ * call back into the driver in order to unmap a range of GPU VA space. The
+ * logic behind this function is way simpler though: For all existent mappings
+ * enclosed by the given range unmap operations are created. For mappings which
+ * are only partically located within the given range, remap operations are
+ * created such that those mappings are split up and re-mapped partically.
+ *
+ * As an alternative to drm_gpuvm_sm_map() and drm_gpuvm_sm_unmap(),
+ * drm_gpuvm_sm_map_ops_create() and drm_gpuvm_sm_unmap_ops_create() can be used
+ * to directly obtain an instance of struct drm_gpuva_ops containing a list of
+ * &drm_gpuva_op, which can be iterated with drm_gpuva_for_each_op(). This list
+ * contains the &drm_gpuva_ops analogous to the callbacks one would receive when
+ * calling drm_gpuvm_sm_map() or drm_gpuvm_sm_unmap(). While this way requires
+ * more memory (to allocate the &drm_gpuva_ops), it provides drivers a way to
+ * iterate the &drm_gpuva_op multiple times, e.g. once in a context where memory
+ * allocations are possible (e.g. to allocate GPU page tables) and once in the
+ * dma-fence signalling critical path.
+ *
+ * To update the &drm_gpuvm's view of the GPU VA space drm_gpuva_insert() and
+ * drm_gpuva_remove() may be used. These functions can safely be used from
+ * &drm_gpuvm_ops callbacks originating from drm_gpuvm_sm_map() or
+ * drm_gpuvm_sm_unmap(). However, it might be more convenient to use the
+ * provided helper functions drm_gpuva_map(), drm_gpuva_remap() and
+ * drm_gpuva_unmap() instead.
+ *
+ * The following diagram depicts the basic relationships of existent GPU VA
+ * mappings, a newly requested mapping and the resulting mappings as implemented
+ * by drm_gpuvm_sm_map() - it doesn't cover any arbitrary combinations of these.
+ *
+ * 1) Requested mapping is identical. Replace it, but indicate the backing PTEs
+ *    could be kept.
+ *
+ *    ::
+ *
+ *	     0     a     1
+ *	old: |-----------| (bo_offset=n)
+ *
+ *	     0     a     1
+ *	req: |-----------| (bo_offset=n)
+ *
+ *	     0     a     1
+ *	new: |-----------| (bo_offset=n)
+ *
+ *
+ * 2) Requested mapping is identical, except for the BO offset, hence replace
+ *    the mapping.
+ *
+ *    ::
+ *
+ *	     0     a     1
+ *	old: |-----------| (bo_offset=n)
+ *
+ *	     0     a     1
+ *	req: |-----------| (bo_offset=m)
+ *
+ *	     0     a     1
+ *	new: |-----------| (bo_offset=m)
+ *
+ *
+ * 3) Requested mapping is identical, except for the backing BO, hence replace
+ *    the mapping.
+ *
+ *    ::
+ *
+ *	     0     a     1
+ *	old: |-----------| (bo_offset=n)
+ *
+ *	     0     b     1
+ *	req: |-----------| (bo_offset=n)
+ *
+ *	     0     b     1
+ *	new: |-----------| (bo_offset=n)
+ *
+ *
+ * 4) Existent mapping is a left aligned subset of the requested one, hence
+ *    replace the existent one.
+ *
+ *    ::
+ *
+ *	     0  a  1
+ *	old: |-----|       (bo_offset=n)
+ *
+ *	     0     a     2
+ *	req: |-----------| (bo_offset=n)
+ *
+ *	     0     a     2
+ *	new: |-----------| (bo_offset=n)
+ *
+ *    .. note::
+ *       We expect to see the same result for a request with a different BO
+ *       and/or non-contiguous BO offset.
+ *
+ *
+ * 5) Requested mapping's range is a left aligned subset of the existent one,
+ *    but backed by a different BO. Hence, map the requested mapping and split
+ *    the existent one adjusting its BO offset.
+ *
+ *    ::
+ *
+ *	     0     a     2
+ *	old: |-----------| (bo_offset=n)
+ *
+ *	     0  b  1
+ *	req: |-----|       (bo_offset=n)
+ *
+ *	     0  b  1  a' 2
+ *	new: |-----|-----| (b.bo_offset=n, a.bo_offset=n+1)
+ *
+ *    .. note::
+ *       We expect to see the same result for a request with a different BO
+ *       and/or non-contiguous BO offset.
+ *
+ *
+ * 6) Existent mapping is a superset of the requested mapping. Split it up, but
+ *    indicate that the backing PTEs could be kept.
+ *
+ *    ::
+ *
+ *	     0     a     2
+ *	old: |-----------| (bo_offset=n)
+ *
+ *	     0  a  1
+ *	req: |-----|       (bo_offset=n)
+ *
+ *	     0  a  1  a' 2
+ *	new: |-----|-----| (a.bo_offset=n, a'.bo_offset=n+1)
+ *
+ *
+ * 7) Requested mapping's range is a right aligned subset of the existent one,
+ *    but backed by a different BO. Hence, map the requested mapping and split
+ *    the existent one, without adjusting the BO offset.
+ *
+ *    ::
+ *
+ *	     0     a     2
+ *	old: |-----------| (bo_offset=n)
+ *
+ *	           1  b  2
+ *	req:       |-----| (bo_offset=m)
+ *
+ *	     0  a  1  b  2
+ *	new: |-----|-----| (a.bo_offset=n,b.bo_offset=m)
+ *
+ *
+ * 8) Existent mapping is a superset of the requested mapping. Split it up, but
+ *    indicate that the backing PTEs could be kept.
+ *
+ *    ::
+ *
+ *	      0     a     2
+ *	old: |-----------| (bo_offset=n)
+ *
+ *	           1  a  2
+ *	req:       |-----| (bo_offset=n+1)
+ *
+ *	     0  a' 1  a  2
+ *	new: |-----|-----| (a'.bo_offset=n, a.bo_offset=n+1)
+ *
+ *
+ * 9) Existent mapping is overlapped at the end by the requested mapping backed
+ *    by a different BO. Hence, map the requested mapping and split up the
+ *    existent one, without adjusting the BO offset.
+ *
+ *    ::
+ *
+ *	     0     a     2
+ *	old: |-----------|       (bo_offset=n)
+ *
+ *	           1     b     3
+ *	req:       |-----------| (bo_offset=m)
+ *
+ *	     0  a  1     b     3
+ *	new: |-----|-----------| (a.bo_offset=n,b.bo_offset=m)
+ *
+ *
+ * 10) Existent mapping is overlapped by the requested mapping, both having the
+ *     same backing BO with a contiguous offset. Indicate the backing PTEs of
+ *     the old mapping could be kept.
+ *
+ *     ::
+ *
+ *	      0     a     2
+ *	 old: |-----------|       (bo_offset=n)
+ *
+ *	            1     a     3
+ *	 req:       |-----------| (bo_offset=n+1)
+ *
+ *	      0  a' 1     a     3
+ *	 new: |-----|-----------| (a'.bo_offset=n, a.bo_offset=n+1)
+ *
+ *
+ * 11) Requested mapping's range is a centered subset of the existent one
+ *     having a different backing BO. Hence, map the requested mapping and split
+ *     up the existent one in two mappings, adjusting the BO offset of the right
+ *     one accordingly.
+ *
+ *     ::
+ *
+ *	      0        a        3
+ *	 old: |-----------------| (bo_offset=n)
+ *
+ *	            1  b  2
+ *	 req:       |-----|       (bo_offset=m)
+ *
+ *	      0  a  1  b  2  a' 3
+ *	 new: |-----|-----|-----| (a.bo_offset=n,b.bo_offset=m,a'.bo_offset=n+2)
+ *
+ *
+ * 12) Requested mapping is a contiguous subset of the existent one. Split it
+ *     up, but indicate that the backing PTEs could be kept.
+ *
+ *     ::
+ *
+ *	      0        a        3
+ *	 old: |-----------------| (bo_offset=n)
+ *
+ *	            1  a  2
+ *	 req:       |-----|       (bo_offset=n+1)
+ *
+ *	      0  a' 1  a  2 a'' 3
+ *	 old: |-----|-----|-----| (a'.bo_offset=n, a.bo_offset=n+1, a''.bo_offset=n+2)
+ *
+ *
+ * 13) Existent mapping is a right aligned subset of the requested one, hence
+ *     replace the existent one.
+ *
+ *     ::
+ *
+ *	            1  a  2
+ *	 old:       |-----| (bo_offset=n+1)
+ *
+ *	      0     a     2
+ *	 req: |-----------| (bo_offset=n)
+ *
+ *	      0     a     2
+ *	 new: |-----------| (bo_offset=n)
+ *
+ *     .. note::
+ *        We expect to see the same result for a request with a different bo
+ *        and/or non-contiguous bo_offset.
+ *
+ *
+ * 14) Existent mapping is a centered subset of the requested one, hence
+ *     replace the existent one.
+ *
+ *     ::
+ *
+ *	            1  a  2
+ *	 old:       |-----| (bo_offset=n+1)
+ *
+ *	      0        a       3
+ *	 req: |----------------| (bo_offset=n)
+ *
+ *	      0        a       3
+ *	 new: |----------------| (bo_offset=n)
+ *
+ *     .. note::
+ *        We expect to see the same result for a request with a different bo
+ *        and/or non-contiguous bo_offset.
+ *
+ *
+ * 15) Existent mappings is overlapped at the beginning by the requested mapping
+ *     backed by a different BO. Hence, map the requested mapping and split up
+ *     the existent one, adjusting its BO offset accordingly.
+ *
+ *     ::
+ *
+ *	            1     a     3
+ *	 old:       |-----------| (bo_offset=n)
+ *
+ *	      0     b     2
+ *	 req: |-----------|       (bo_offset=m)
+ *
+ *	      0     b     2  a' 3
+ *	 new: |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
+ */
+
+/**
+ * DOC: Locking
+ *
+ * In terms of managing &drm_gpuva entries DRM GPUVM does not take care of
+ * locking itself, it is the drivers responsibility to take care about locking.
+ * Drivers might want to protect the following operations: inserting, removing
+ * and iterating &drm_gpuva objects as well as generating all kinds of
+ * operations, such as split / merge or prefetch.
+ *
+ * DRM GPUVM also does not take care of the locking of the backing
+ * &drm_gem_object buffers GPU VA lists and &drm_gpuvm_bo abstractions by
+ * itself; drivers are responsible to enforce mutual exclusion using either the
+ * GEMs dma_resv lock or alternatively a driver specific external lock. For the
+ * latter see also drm_gem_gpuva_set_lock().
+ *
+ * However, DRM GPUVM contains lockdep checks to ensure callers of its API hold
+ * the corresponding lock whenever the &drm_gem_objects GPU VA list is accessed
+ * by functions such as drm_gpuva_link() or drm_gpuva_unlink(), but also
+ * drm_gpuvm_bo_obtain() and drm_gpuvm_bo_put().
+ *
+ * The latter is required since on creation and destruction of a &drm_gpuvm_bo
+ * the &drm_gpuvm_bo is attached / removed from the &drm_gem_objects gpuva list.
+ * Subsequent calls to drm_gpuvm_bo_obtain() for the same &drm_gpuvm and
+ * &drm_gem_object must be able to observe previous creations and destructions
+ * of &drm_gpuvm_bos in order to keep instances unique.
+ *
+ * The &drm_gpuvm's lists for keeping track of external and evicted objects are
+ * protected against concurrent insertion / removal and iteration internally.
+ *
+ * However, drivers still need ensure to protect concurrent calls to functions
+ * iterating those lists, namely drm_gpuvm_prepare_objects() and
+ * drm_gpuvm_validate().
+ *
+ * Alternatively, drivers can set the &DRM_GPUVM_RESV_PROTECTED flag to indicate
+ * that the corresponding &dma_resv locks are held in order to protect the
+ * lists. If &DRM_GPUVM_RESV_PROTECTED is set, internal locking is disabled and
+ * the corresponding lockdep checks are enabled. This is an optimization for
+ * drivers which are capable of taking the corresponding &dma_resv locks and
+ * hence do not require internal locking.
+ */
+
+/**
+ * DOC: Examples
+ *
+ * This section gives two examples on how to let the DRM GPUVA Manager generate
+ * &drm_gpuva_op in order to satisfy a given map or unmap request and how to
+ * make use of them.
+ *
+ * The below code is strictly limited to illustrate the generic usage pattern.
+ * To maintain simplicitly, it doesn't make use of any abstractions for common
+ * code, different (asyncronous) stages with fence signalling critical paths,
+ * any other helpers or error handling in terms of freeing memory and dropping
+ * previously taken locks.
+ *
+ * 1) Obtain a list of &drm_gpuva_op to create a new mapping::
+ *
+ *	// Allocates a new &drm_gpuva.
+ *	struct drm_gpuva * driver_gpuva_alloc(void);
+ *
+ *	// Typically drivers would embedd the &drm_gpuvm and &drm_gpuva
+ *	// structure in individual driver structures and lock the dma-resv with
+ *	// drm_exec or similar helpers.
+ *	int driver_mapping_create(struct drm_gpuvm *gpuvm,
+ *				  u64 addr, u64 range,
+ *				  struct drm_gem_object *obj, u64 offset)
+ *	{
+ *		struct drm_gpuva_ops *ops;
+ *		struct drm_gpuva_op *op
+ *		struct drm_gpuvm_bo *vm_bo;
+ *
+ *		driver_lock_va_space();
+ *		ops = drm_gpuvm_sm_map_ops_create(gpuvm, addr, range,
+ *						  obj, offset);
+ *		if (IS_ERR(ops))
+ *			return PTR_ERR(ops);
+ *
+ *		vm_bo = drm_gpuvm_bo_obtain(gpuvm, obj);
+ *		if (IS_ERR(vm_bo))
+ *			return PTR_ERR(vm_bo);
+ *
+ *		drm_gpuva_for_each_op(op, ops) {
+ *			struct drm_gpuva *va;
+ *
+ *			switch (op->op) {
+ *			case DRM_GPUVA_OP_MAP:
+ *				va = driver_gpuva_alloc();
+ *				if (!va)
+ *					; // unwind previous VA space updates,
+ *					  // free memory and unlock
+ *
+ *				driver_vm_map();
+ *				drm_gpuva_map(gpuvm, va, &op->map);
+ *				drm_gpuva_link(va, vm_bo);
+ *
+ *				break;
+ *			case DRM_GPUVA_OP_REMAP: {
+ *				struct drm_gpuva *prev = NULL, *next = NULL;
+ *
+ *				va = op->remap.unmap->va;
+ *
+ *				if (op->remap.prev) {
+ *					prev = driver_gpuva_alloc();
+ *					if (!prev)
+ *						; // unwind previous VA space
+ *						  // updates, free memory and
+ *						  // unlock
+ *				}
+ *
+ *				if (op->remap.next) {
+ *					next = driver_gpuva_alloc();
+ *					if (!next)
+ *						; // unwind previous VA space
+ *						  // updates, free memory and
+ *						  // unlock
+ *				}
+ *
+ *				driver_vm_remap();
+ *				drm_gpuva_remap(prev, next, &op->remap);
+ *
+ *				if (prev)
+ *					drm_gpuva_link(prev, va->vm_bo);
+ *				if (next)
+ *					drm_gpuva_link(next, va->vm_bo);
+ *				drm_gpuva_unlink(va);
+ *
+ *				break;
+ *			}
+ *			case DRM_GPUVA_OP_UNMAP:
+ *				va = op->unmap->va;
+ *
+ *				driver_vm_unmap();
+ *				drm_gpuva_unlink(va);
+ *				drm_gpuva_unmap(&op->unmap);
+ *
+ *				break;
+ *			default:
+ *				break;
+ *			}
+ *		}
+ *		drm_gpuvm_bo_put(vm_bo);
+ *		driver_unlock_va_space();
+ *
+ *		return 0;
+ *	}
+ *
+ * 2) Receive a callback for each &drm_gpuva_op to create a new mapping::
+ *
+ *	struct driver_context {
+ *		struct drm_gpuvm *gpuvm;
+ *		struct drm_gpuvm_bo *vm_bo;
+ *		struct drm_gpuva *new_va;
+ *		struct drm_gpuva *prev_va;
+ *		struct drm_gpuva *next_va;
+ *	};
+ *
+ *	// ops to pass to drm_gpuvm_init()
+ *	static const struct drm_gpuvm_ops driver_gpuvm_ops = {
+ *		.sm_step_map = driver_gpuva_map,
+ *		.sm_step_remap = driver_gpuva_remap,
+ *		.sm_step_unmap = driver_gpuva_unmap,
+ *	};
+ *
+ *	// Typically drivers would embedd the &drm_gpuvm and &drm_gpuva
+ *	// structure in individual driver structures and lock the dma-resv with
+ *	// drm_exec or similar helpers.
+ *	int driver_mapping_create(struct drm_gpuvm *gpuvm,
+ *				  u64 addr, u64 range,
+ *				  struct drm_gem_object *obj, u64 offset)
+ *	{
+ *		struct driver_context ctx;
+ *		struct drm_gpuvm_bo *vm_bo;
+ *		struct drm_gpuva_ops *ops;
+ *		struct drm_gpuva_op *op;
+ *		int ret = 0;
+ *
+ *		ctx.gpuvm = gpuvm;
+ *
+ *		ctx.new_va = kzalloc(sizeof(*ctx.new_va), GFP_KERNEL);
+ *		ctx.prev_va = kzalloc(sizeof(*ctx.prev_va), GFP_KERNEL);
+ *		ctx.next_va = kzalloc(sizeof(*ctx.next_va), GFP_KERNEL);
+ *		ctx.vm_bo = drm_gpuvm_bo_create(gpuvm, obj);
+ *		if (!ctx.new_va || !ctx.prev_va || !ctx.next_va || !vm_bo) {
+ *			ret = -ENOMEM;
+ *			goto out;
+ *		}
+ *
+ *		// Typically protected with a driver specific GEM gpuva lock
+ *		// used in the fence signaling path for drm_gpuva_link() and
+ *		// drm_gpuva_unlink(), hence pre-allocate.
+ *		ctx.vm_bo = drm_gpuvm_bo_obtain_prealloc(ctx.vm_bo);
+ *
+ *		driver_lock_va_space();
+ *		ret = drm_gpuvm_sm_map(gpuvm, &ctx, addr, range, obj, offset);
+ *		driver_unlock_va_space();
+ *
+ *	out:
+ *		drm_gpuvm_bo_put(ctx.vm_bo);
+ *		kfree(ctx.new_va);
+ *		kfree(ctx.prev_va);
+ *		kfree(ctx.next_va);
+ *		return ret;
+ *	}
+ *
+ *	int driver_gpuva_map(struct drm_gpuva_op *op, void *__ctx)
+ *	{
+ *		struct driver_context *ctx = __ctx;
+ *
+ *		drm_gpuva_map(ctx->vm, ctx->new_va, &op->map);
+ *
+ *		drm_gpuva_link(ctx->new_va, ctx->vm_bo);
+ *
+ *		// prevent the new GPUVA from being freed in
+ *		// driver_mapping_create()
+ *		ctx->new_va = NULL;
+ *
+ *		return 0;
+ *	}
+ *
+ *	int driver_gpuva_remap(struct drm_gpuva_op *op, void *__ctx)
+ *	{
+ *		struct driver_context *ctx = __ctx;
+ *		struct drm_gpuva *va = op->remap.unmap->va;
+ *
+ *		drm_gpuva_remap(ctx->prev_va, ctx->next_va, &op->remap);
+ *
+ *		if (op->remap.prev) {
+ *			drm_gpuva_link(ctx->prev_va, va->vm_bo);
+ *			ctx->prev_va = NULL;
+ *		}
+ *
+ *		if (op->remap.next) {
+ *			drm_gpuva_link(ctx->next_va, va->vm_bo);
+ *			ctx->next_va = NULL;
+ *		}
+ *
+ *		drm_gpuva_unlink(va);
+ *		kfree(va);
+ *
+ *		return 0;
+ *	}
+ *
+ *	int driver_gpuva_unmap(struct drm_gpuva_op *op, void *__ctx)
+ *	{
+ *		drm_gpuva_unlink(op->unmap.va);
+ *		drm_gpuva_unmap(&op->unmap);
+ *		kfree(op->unmap.va);
+ *
+ *		return 0;
+ *	}
+ */
+
+/**
+ * get_next_vm_bo_from_list() - get the next vm_bo element
+ * @__gpuvm: the &drm_gpuvm
+ * @__list_name: the name of the list we're iterating on
+ * @__local_list: a pointer to the local list used to store already iterated items
+ * @__prev_vm_bo: the previous element we got from get_next_vm_bo_from_list()
+ *
+ * This helper is here to provide lockless list iteration. Lockless as in, the
+ * iterator releases the lock immediately after picking the first element from
+ * the list, so list insertion deletion can happen concurrently.
+ *
+ * Elements popped from the original list are kept in a local list, so removal
+ * and is_empty checks can still happen while we're iterating the list.
+ */
+#define get_next_vm_bo_from_list(__gpuvm, __list_name, __local_list, __prev_vm_bo)	\
+	({										\
+		struct drm_gpuvm_bo *__vm_bo = NULL;					\
+											\
+		drm_gpuvm_bo_put(__prev_vm_bo);						\
+											\
+		spin_lock(&(__gpuvm)->__list_name.lock);				\
+		if (!(__gpuvm)->__list_name.local_list)					\
+			(__gpuvm)->__list_name.local_list = __local_list;		\
+		else									\
+			drm_WARN_ON((__gpuvm)->drm,					\
+				    (__gpuvm)->__list_name.local_list != __local_list);	\
+											\
+		while (!list_empty(&(__gpuvm)->__list_name.list)) {			\
+			__vm_bo = list_first_entry(&(__gpuvm)->__list_name.list,	\
+						   struct drm_gpuvm_bo,			\
+						   list.entry.__list_name);		\
+			if (kref_get_unless_zero(&__vm_bo->kref)) {			\
+				list_move_tail(&(__vm_bo)->list.entry.__list_name,	\
+					       __local_list);				\
+				break;							\
+			} else {							\
+				list_del_init(&(__vm_bo)->list.entry.__list_name);	\
+				__vm_bo = NULL;						\
+			}								\
+		}									\
+		spin_unlock(&(__gpuvm)->__list_name.lock);				\
+											\
+		__vm_bo;								\
+	})
+
+/**
+ * for_each_vm_bo_in_list() - internal vm_bo list iterator
+ * @__gpuvm: the &drm_gpuvm
+ * @__list_name: the name of the list we're iterating on
+ * @__local_list: a pointer to the local list used to store already iterated items
+ * @__vm_bo: the struct drm_gpuvm_bo to assign in each iteration step
+ *
+ * This helper is here to provide lockless list iteration. Lockless as in, the
+ * iterator releases the lock immediately after picking the first element from the
+ * list, hence list insertion and deletion can happen concurrently.
+ *
+ * It is not allowed to re-assign the vm_bo pointer from inside this loop.
+ *
+ * Typical use:
+ *
+ *	struct drm_gpuvm_bo *vm_bo;
+ *	LIST_HEAD(my_local_list);
+ *
+ *	ret = 0;
+ *	for_each_vm_bo_in_list(gpuvm, <list_name>, &my_local_list, vm_bo) {
+ *		ret = do_something_with_vm_bo(..., vm_bo);
+ *		if (ret)
+ *			break;
+ *	}
+ *	// Drop ref in case we break out of the loop.
+ *	drm_gpuvm_bo_put(vm_bo);
+ *	restore_vm_bo_list(gpuvm, <list_name>, &my_local_list);
+ *
+ *
+ * Only used for internal list iterations, not meant to be exposed to the outside
+ * world.
+ */
+#define for_each_vm_bo_in_list(__gpuvm, __list_name, __local_list, __vm_bo)	\
+	for (__vm_bo = get_next_vm_bo_from_list(__gpuvm, __list_name,		\
+						__local_list, NULL);		\
+	     __vm_bo;								\
+	     __vm_bo = get_next_vm_bo_from_list(__gpuvm, __list_name,		\
+						__local_list, __vm_bo))
+
+static void
+__restore_vm_bo_list(struct drm_gpuvm *gpuvm, spinlock_t *lock,
+		     struct list_head *list, struct list_head **local_list)
+{
+	/* Merge back the two lists, moving local list elements to the
+	 * head to preserve previous ordering, in case it matters.
+	 */
+	spin_lock(lock);
+	if (*local_list) {
+		list_splice(*local_list, list);
+		*local_list = NULL;
+	}
+	spin_unlock(lock);
+}
+
+/**
+ * restore_vm_bo_list() - move vm_bo elements back to their original list
+ * @__gpuvm: the &drm_gpuvm
+ * @__list_name: the name of the list we're iterating on
+ *
+ * When we're done iterating a vm_bo list, we should call restore_vm_bo_list()
+ * to restore the original state and let new iterations take place.
+ */
+#define restore_vm_bo_list(__gpuvm, __list_name)			\
+	__restore_vm_bo_list((__gpuvm), &(__gpuvm)->__list_name.lock,	\
+			     &(__gpuvm)->__list_name.list,		\
+			     &(__gpuvm)->__list_name.local_list)
+
+static void
+cond_spin_lock(spinlock_t *lock, bool cond)
+{
+	if (cond)
+		spin_lock(lock);
+}
+
+static void
+cond_spin_unlock(spinlock_t *lock, bool cond)
+{
+	if (cond)
+		spin_unlock(lock);
+}
+
+static void
+__drm_gpuvm_bo_list_add(struct drm_gpuvm *gpuvm, spinlock_t *lock,
+			struct list_head *entry, struct list_head *list)
+{
+	cond_spin_lock(lock, !!lock);
+	if (list_empty(entry))
+		list_add_tail(entry, list);
+	cond_spin_unlock(lock, !!lock);
+}
+
+/**
+ * drm_gpuvm_bo_list_add() - insert a vm_bo into the given list
+ * @__vm_bo: the &drm_gpuvm_bo
+ * @__list_name: the name of the list to insert into
+ * @__lock: whether to lock with the internal spinlock
+ *
+ * Inserts the given @__vm_bo into the list specified by @__list_name.
+ */
+#define drm_gpuvm_bo_list_add(__vm_bo, __list_name, __lock)			\
+	__drm_gpuvm_bo_list_add((__vm_bo)->vm,					\
+				__lock ? &(__vm_bo)->vm->__list_name.lock :	\
+					 NULL,					\
+				&(__vm_bo)->list.entry.__list_name,		\
+				&(__vm_bo)->vm->__list_name.list)
+
+static void
+__drm_gpuvm_bo_list_del(struct drm_gpuvm *gpuvm, spinlock_t *lock,
+			struct list_head *entry, bool init)
+{
+	cond_spin_lock(lock, !!lock);
+	if (init) {
+		if (!list_empty(entry))
+			list_del_init(entry);
+	} else {
+		list_del(entry);
+	}
+	cond_spin_unlock(lock, !!lock);
+}
+
+/**
+ * drm_gpuvm_bo_list_del_init() - remove a vm_bo from the given list
+ * @__vm_bo: the &drm_gpuvm_bo
+ * @__list_name: the name of the list to insert into
+ * @__lock: whether to lock with the internal spinlock
+ *
+ * Removes the given @__vm_bo from the list specified by @__list_name.
+ */
+#define drm_gpuvm_bo_list_del_init(__vm_bo, __list_name, __lock)		\
+	__drm_gpuvm_bo_list_del((__vm_bo)->vm,					\
+				__lock ? &(__vm_bo)->vm->__list_name.lock :	\
+					 NULL,					\
+				&(__vm_bo)->list.entry.__list_name,		\
+				true)
+
+/**
+ * drm_gpuvm_bo_list_del() - remove a vm_bo from the given list
+ * @__vm_bo: the &drm_gpuvm_bo
+ * @__list_name: the name of the list to insert into
+ * @__lock: whether to lock with the internal spinlock
+ *
+ * Removes the given @__vm_bo from the list specified by @__list_name.
+ */
+#define drm_gpuvm_bo_list_del(__vm_bo, __list_name, __lock)			\
+	__drm_gpuvm_bo_list_del((__vm_bo)->vm,					\
+				__lock ? &(__vm_bo)->vm->__list_name.lock :	\
+					 NULL,					\
+				&(__vm_bo)->list.entry.__list_name,		\
+				false)
+
+#define to_drm_gpuva(__node)	container_of((__node), struct drm_gpuva, rb.node)
+
+#define GPUVA_START(node) ((node)->va.addr)
+#define GPUVA_LAST(node) ((node)->va.addr + (node)->va.range - 1)
+
+/* We do not actually use drm_gpuva_it_next(), tell the compiler to not complain
+ * about this.
+ */
+INTERVAL_TREE_DEFINE(struct drm_gpuva, rb.node, u64, rb.__subtree_last,
+		     GPUVA_START, GPUVA_LAST, static __maybe_unused,
+		     drm_gpuva_it)
+
+static int __drm_gpuva_insert(struct drm_gpuvm *gpuvm,
+			      struct drm_gpuva *va);
+static void __drm_gpuva_remove(struct drm_gpuva *va);
+
+static bool
+drm_gpuvm_check_overflow(u64 addr, u64 range)
+{
+	u64 end;
+
+	return check_add_overflow(addr, range, &end);
+}
+
+static bool
+drm_gpuvm_warn_check_overflow(struct drm_gpuvm *gpuvm, u64 addr, u64 range)
+{
+	return drm_WARN(gpuvm->drm, drm_gpuvm_check_overflow(addr, range),
+			"GPUVA address limited to %zu bytes.\n", sizeof(addr));
+}
+
+static bool
+drm_gpuvm_in_mm_range(struct drm_gpuvm *gpuvm, u64 addr, u64 range)
+{
+	u64 end = addr + range;
+	u64 mm_start = gpuvm->mm_start;
+	u64 mm_end = mm_start + gpuvm->mm_range;
+
+	return addr >= mm_start && end <= mm_end;
+}
+
+static bool
+drm_gpuvm_in_kernel_node(struct drm_gpuvm *gpuvm, u64 addr, u64 range)
+{
+	u64 end = addr + range;
+	u64 kstart = gpuvm->kernel_alloc_node.va.addr;
+	u64 krange = gpuvm->kernel_alloc_node.va.range;
+	u64 kend = kstart + krange;
+
+	return krange && addr < kend && kstart < end;
+}
+
+/**
+ * drm_gpuvm_range_valid() - checks whether the given range is valid for the
+ * given &drm_gpuvm
+ * @gpuvm: the GPUVM to check the range for
+ * @addr: the base address
+ * @range: the range starting from the base address
+ *
+ * Checks whether the range is within the GPUVM's managed boundaries.
+ *
+ * Returns: true for a valid range, false otherwise
+ */
+bool
+drm_gpuvm_range_valid(struct drm_gpuvm *gpuvm,
+		      u64 addr, u64 range)
+{
+	return !drm_gpuvm_check_overflow(addr, range) &&
+	       drm_gpuvm_in_mm_range(gpuvm, addr, range) &&
+	       !drm_gpuvm_in_kernel_node(gpuvm, addr, range);
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_range_valid);
+
+static void
+drm_gpuvm_gem_object_free(struct drm_gem_object *obj)
+{
+	drm_gem_object_release(obj);
+	kfree(obj);
+}
+
+static const struct drm_gem_object_funcs drm_gpuvm_object_funcs = {
+	.free = drm_gpuvm_gem_object_free,
+};
+
+/**
+ * drm_gpuvm_resv_object_alloc() - allocate a dummy &drm_gem_object
+ * @drm: the drivers &drm_device
+ *
+ * Allocates a dummy &drm_gem_object which can be passed to drm_gpuvm_init() in
+ * order to serve as root GEM object providing the &drm_resv shared across
+ * &drm_gem_objects local to a single GPUVM.
+ *
+ * Returns: the &drm_gem_object on success, NULL on failure
+ */
+struct drm_gem_object *
+drm_gpuvm_resv_object_alloc(struct drm_device *drm)
+{
+	struct drm_gem_object *obj;
+
+	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+	if (!obj)
+		return NULL;
+
+	obj->funcs = &drm_gpuvm_object_funcs;
+	drm_gem_private_object_init(drm, obj, 0);
+
+	return obj;
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_resv_object_alloc);
+
+/**
+ * drm_gpuvm_init() - initialize a &drm_gpuvm
+ * @gpuvm: pointer to the &drm_gpuvm to initialize
+ * @name: the name of the GPU VA space
+ * @flags: the &drm_gpuvm_flags for this GPUVM
+ * @drm: the &drm_device this VM resides in
+ * @r_obj: the resv &drm_gem_object providing the GPUVM's common &dma_resv
+ * @start_offset: the start offset of the GPU VA space
+ * @range: the size of the GPU VA space
+ * @reserve_offset: the start of the kernel reserved GPU VA area
+ * @reserve_range: the size of the kernel reserved GPU VA area
+ * @ops: &drm_gpuvm_ops called on &drm_gpuvm_sm_map / &drm_gpuvm_sm_unmap
+ *
+ * The &drm_gpuvm must be initialized with this function before use.
+ *
+ * Note that @gpuvm must be cleared to 0 before calling this function. The given
+ * &name is expected to be managed by the surrounding driver structures.
+ */
+void
+drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
+	       enum drm_gpuvm_flags flags,
+	       struct drm_device *drm,
+	       struct drm_gem_object *r_obj,
+	       u64 start_offset, u64 range,
+	       u64 reserve_offset, u64 reserve_range,
+	       const struct drm_gpuvm_ops *ops)
+{
+	gpuvm->rb.tree = RB_ROOT_CACHED;
+	INIT_LIST_HEAD(&gpuvm->rb.list);
+
+	INIT_LIST_HEAD(&gpuvm->extobj.list);
+	spin_lock_init(&gpuvm->extobj.lock);
+
+	INIT_LIST_HEAD(&gpuvm->evict.list);
+	spin_lock_init(&gpuvm->evict.lock);
+
+	kref_init(&gpuvm->kref);
+
+	gpuvm->name = name ? name : "unknown";
+	gpuvm->flags = flags;
+	gpuvm->ops = ops;
+	gpuvm->drm = drm;
+	gpuvm->r_obj = r_obj;
+
+	drm_gem_object_get(r_obj);
+
+	drm_gpuvm_warn_check_overflow(gpuvm, start_offset, range);
+	gpuvm->mm_start = start_offset;
+	gpuvm->mm_range = range;
+
+	memset(&gpuvm->kernel_alloc_node, 0, sizeof(struct drm_gpuva));
+	if (reserve_range) {
+		gpuvm->kernel_alloc_node.va.addr = reserve_offset;
+		gpuvm->kernel_alloc_node.va.range = reserve_range;
+
+		if (likely(!drm_gpuvm_warn_check_overflow(gpuvm, reserve_offset,
+							  reserve_range)))
+			__drm_gpuva_insert(gpuvm, &gpuvm->kernel_alloc_node);
+	}
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_init);
+
+static void
+drm_gpuvm_fini(struct drm_gpuvm *gpuvm)
+{
+	gpuvm->name = NULL;
+
+	if (gpuvm->kernel_alloc_node.va.range)
+		__drm_gpuva_remove(&gpuvm->kernel_alloc_node);
+
+	drm_WARN(gpuvm->drm, !RB_EMPTY_ROOT(&gpuvm->rb.tree.rb_root),
+		 "GPUVA tree is not empty, potentially leaking memory.\n");
+
+	drm_WARN(gpuvm->drm, !list_empty(&gpuvm->extobj.list),
+		 "Extobj list should be empty.\n");
+	drm_WARN(gpuvm->drm, !list_empty(&gpuvm->evict.list),
+		 "Evict list should be empty.\n");
+
+	drm_gem_object_put(gpuvm->r_obj);
+}
+
+static void
+drm_gpuvm_free(struct kref *kref)
+{
+	struct drm_gpuvm *gpuvm = container_of(kref, struct drm_gpuvm, kref);
+
+	drm_gpuvm_fini(gpuvm);
+
+	if (drm_WARN_ON(gpuvm->drm, !gpuvm->ops->vm_free))
+		return;
+
+	gpuvm->ops->vm_free(gpuvm);
+}
+
+/**
+ * drm_gpuvm_put() - drop a struct drm_gpuvm reference
+ * @gpuvm: the &drm_gpuvm to release the reference of
+ *
+ * This releases a reference to @gpuvm.
+ *
+ * This function may be called from atomic context.
+ */
+void
+drm_gpuvm_put(struct drm_gpuvm *gpuvm)
+{
+	if (gpuvm)
+		kref_put(&gpuvm->kref, drm_gpuvm_free);
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_put);
+
+static int
+exec_prepare_obj(struct drm_exec *exec, struct drm_gem_object *obj,
+		 unsigned int num_fences)
+{
+	return num_fences ? drm_exec_prepare_obj(exec, obj, num_fences) :
+			    drm_exec_lock_obj(exec, obj);
+}
+
+/**
+ * drm_gpuvm_prepare_vm() - prepare the GPUVMs common dma-resv
+ * @gpuvm: the &drm_gpuvm
+ * @exec: the &drm_exec context
+ * @num_fences: the amount of &dma_fences to reserve
+ *
+ * Calls drm_exec_prepare_obj() for the GPUVMs dummy &drm_gem_object; if
+ * @num_fences is zero drm_exec_lock_obj() is called instead.
+ *
+ * Using this function directly, it is the drivers responsibility to call
+ * drm_exec_init() and drm_exec_fini() accordingly.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int
+drm_gpuvm_prepare_vm(struct drm_gpuvm *gpuvm,
+		     struct drm_exec *exec,
+		     unsigned int num_fences)
+{
+	return exec_prepare_obj(exec, gpuvm->r_obj, num_fences);
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_prepare_vm);
+
+static int
+__drm_gpuvm_prepare_objects(struct drm_gpuvm *gpuvm,
+			    struct drm_exec *exec,
+			    unsigned int num_fences)
+{
+	struct drm_gpuvm_bo *vm_bo;
+	LIST_HEAD(extobjs);
+	int ret = 0;
+
+	for_each_vm_bo_in_list(gpuvm, extobj, &extobjs, vm_bo) {
+		ret = exec_prepare_obj(exec, vm_bo->obj, num_fences);
+		if (ret)
+			break;
+	}
+	/* Drop ref in case we break out of the loop. */
+	drm_gpuvm_bo_put(vm_bo);
+	restore_vm_bo_list(gpuvm, extobj);
+
+	return ret;
+}
+
+static int
+drm_gpuvm_prepare_objects_locked(struct drm_gpuvm *gpuvm,
+				 struct drm_exec *exec,
+				 unsigned int num_fences)
+{
+	struct drm_gpuvm_bo *vm_bo;
+	int ret = 0;
+
+	drm_gpuvm_resv_assert_held(gpuvm);
+	list_for_each_entry(vm_bo, &gpuvm->extobj.list, list.entry.extobj) {
+		ret = exec_prepare_obj(exec, vm_bo->obj, num_fences);
+		if (ret)
+			break;
+
+		if (vm_bo->evicted)
+			drm_gpuvm_bo_list_add(vm_bo, evict, false);
+	}
+
+	return ret;
+}
+
+/**
+ * drm_gpuvm_prepare_objects() - prepare all assoiciated BOs
+ * @gpuvm: the &drm_gpuvm
+ * @exec: the &drm_exec locking context
+ * @num_fences: the amount of &dma_fences to reserve
+ *
+ * Calls drm_exec_prepare_obj() for all &drm_gem_objects the given
+ * &drm_gpuvm contains mappings of; if @num_fences is zero drm_exec_lock_obj()
+ * is called instead.
+ *
+ * Using this function directly, it is the drivers responsibility to call
+ * drm_exec_init() and drm_exec_fini() accordingly.
+ *
+ * Note: This function is safe against concurrent insertion and removal of
+ * external objects, however it is not safe against concurrent usage itself.
+ *
+ * Drivers need to make sure to protect this case with either an outer VM lock
+ * or by calling drm_gpuvm_prepare_vm() before this function within the
+ * drm_exec_until_all_locked() loop, such that the GPUVM's dma-resv lock ensures
+ * mutual exclusion.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int
+drm_gpuvm_prepare_objects(struct drm_gpuvm *gpuvm,
+			  struct drm_exec *exec,
+			  unsigned int num_fences)
+{
+	if (drm_gpuvm_resv_protected(gpuvm))
+		return drm_gpuvm_prepare_objects_locked(gpuvm, exec,
+							num_fences);
+	else
+		return __drm_gpuvm_prepare_objects(gpuvm, exec, num_fences);
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_prepare_objects);
+
+/**
+ * drm_gpuvm_prepare_range() - prepare all BOs mapped within a given range
+ * @gpuvm: the &drm_gpuvm
+ * @exec: the &drm_exec locking context
+ * @addr: the start address within the VA space
+ * @range: the range to iterate within the VA space
+ * @num_fences: the amount of &dma_fences to reserve
+ *
+ * Calls drm_exec_prepare_obj() for all &drm_gem_objects mapped between @addr
+ * and @addr + @range; if @num_fences is zero drm_exec_lock_obj() is called
+ * instead.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int
+drm_gpuvm_prepare_range(struct drm_gpuvm *gpuvm, struct drm_exec *exec,
+			u64 addr, u64 range, unsigned int num_fences)
+{
+	struct drm_gpuva *va;
+	u64 end = addr + range;
+	int ret;
+
+	drm_gpuvm_for_each_va_range(va, gpuvm, addr, end) {
+		struct drm_gem_object *obj = va->gem.obj;
+
+		ret = exec_prepare_obj(exec, obj, num_fences);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_prepare_range);
+
+/**
+ * drm_gpuvm_exec_lock() - lock all dma-resv of all assoiciated BOs
+ * @vm_exec: the &drm_gpuvm_exec wrapper
+ *
+ * Acquires all dma-resv locks of all &drm_gem_objects the given
+ * &drm_gpuvm contains mappings of.
+ *
+ * Addionally, when calling this function with struct drm_gpuvm_exec::extra
+ * being set the driver receives the given @fn callback to lock additional
+ * dma-resv in the context of the &drm_gpuvm_exec instance. Typically, drivers
+ * would call drm_exec_prepare_obj() from within this callback.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int
+drm_gpuvm_exec_lock(struct drm_gpuvm_exec *vm_exec)
+{
+	struct drm_gpuvm *gpuvm = vm_exec->vm;
+	struct drm_exec *exec = &vm_exec->exec;
+	unsigned int num_fences = vm_exec->num_fences;
+	int ret;
+
+	drm_exec_init(exec, vm_exec->flags, 0);
+
+	drm_exec_until_all_locked(exec) {
+		ret = drm_gpuvm_prepare_vm(gpuvm, exec, num_fences);
+		drm_exec_retry_on_contention(exec);
+		if (ret)
+			goto err;
+
+		ret = drm_gpuvm_prepare_objects(gpuvm, exec, num_fences);
+		drm_exec_retry_on_contention(exec);
+		if (ret)
+			goto err;
+
+		if (vm_exec->extra.fn) {
+			ret = vm_exec->extra.fn(vm_exec);
+			drm_exec_retry_on_contention(exec);
+			if (ret)
+				goto err;
+		}
+	}
+
+	return 0;
+
+err:
+	drm_exec_fini(exec);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_exec_lock);
+
+static int
+fn_lock_array(struct drm_gpuvm_exec *vm_exec)
+{
+	struct {
+		struct drm_gem_object **objs;
+		unsigned int num_objs;
+	} *args = vm_exec->extra.priv;
+
+	return drm_exec_prepare_array(&vm_exec->exec, args->objs,
+				      args->num_objs, vm_exec->num_fences);
+}
+
+/**
+ * drm_gpuvm_exec_lock_array() - lock all dma-resv of all assoiciated BOs
+ * @vm_exec: the &drm_gpuvm_exec wrapper
+ * @objs: additional &drm_gem_objects to lock
+ * @num_objs: the number of additional &drm_gem_objects to lock
+ *
+ * Acquires all dma-resv locks of all &drm_gem_objects the given &drm_gpuvm
+ * contains mappings of, plus the ones given through @objs.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int
+drm_gpuvm_exec_lock_array(struct drm_gpuvm_exec *vm_exec,
+			  struct drm_gem_object **objs,
+			  unsigned int num_objs)
+{
+	struct {
+		struct drm_gem_object **objs;
+		unsigned int num_objs;
+	} args;
+
+	args.objs = objs;
+	args.num_objs = num_objs;
+
+	vm_exec->extra.fn = fn_lock_array;
+	vm_exec->extra.priv = &args;
+
+	return drm_gpuvm_exec_lock(vm_exec);
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_exec_lock_array);
+
+/**
+ * drm_gpuvm_exec_lock_range() - prepare all BOs mapped within a given range
+ * @vm_exec: the &drm_gpuvm_exec wrapper
+ * @addr: the start address within the VA space
+ * @range: the range to iterate within the VA space
+ *
+ * Acquires all dma-resv locks of all &drm_gem_objects mapped between @addr and
+ * @addr + @range.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int
+drm_gpuvm_exec_lock_range(struct drm_gpuvm_exec *vm_exec,
+			  u64 addr, u64 range)
+{
+	struct drm_gpuvm *gpuvm = vm_exec->vm;
+	struct drm_exec *exec = &vm_exec->exec;
+	int ret;
+
+	drm_exec_init(exec, vm_exec->flags, 0);
+
+	drm_exec_until_all_locked(exec) {
+		ret = drm_gpuvm_prepare_range(gpuvm, exec, addr, range,
+					      vm_exec->num_fences);
+		drm_exec_retry_on_contention(exec);
+		if (ret)
+			goto err;
+	}
+
+	return ret;
+
+err:
+	drm_exec_fini(exec);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_exec_lock_range);
+
+static int
+__drm_gpuvm_validate(struct drm_gpuvm *gpuvm, struct drm_exec *exec)
+{
+	const struct drm_gpuvm_ops *ops = gpuvm->ops;
+	struct drm_gpuvm_bo *vm_bo;
+	LIST_HEAD(evict);
+	int ret = 0;
+
+	for_each_vm_bo_in_list(gpuvm, evict, &evict, vm_bo) {
+		ret = ops->vm_bo_validate(vm_bo, exec);
+		if (ret)
+			break;
+	}
+	/* Drop ref in case we break out of the loop. */
+	drm_gpuvm_bo_put(vm_bo);
+	restore_vm_bo_list(gpuvm, evict);
+
+	return ret;
+}
+
+static int
+drm_gpuvm_validate_locked(struct drm_gpuvm *gpuvm, struct drm_exec *exec)
+{
+	const struct drm_gpuvm_ops *ops = gpuvm->ops;
+	struct drm_gpuvm_bo *vm_bo, *next;
+	int ret = 0;
+
+	drm_gpuvm_resv_assert_held(gpuvm);
+
+	list_for_each_entry_safe(vm_bo, next, &gpuvm->evict.list,
+				 list.entry.evict) {
+		ret = ops->vm_bo_validate(vm_bo, exec);
+		if (ret)
+			break;
+
+		dma_resv_assert_held(vm_bo->obj->resv);
+		if (!vm_bo->evicted)
+			drm_gpuvm_bo_list_del_init(vm_bo, evict, false);
+	}
+
+	return ret;
+}
+
+/**
+ * drm_gpuvm_validate() - validate all BOs marked as evicted
+ * @gpuvm: the &drm_gpuvm to validate evicted BOs
+ * @exec: the &drm_exec instance used for locking the GPUVM
+ *
+ * Calls the &drm_gpuvm_ops::vm_bo_validate callback for all evicted buffer
+ * objects being mapped in the given &drm_gpuvm.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int
+drm_gpuvm_validate(struct drm_gpuvm *gpuvm, struct drm_exec *exec)
+{
+	const struct drm_gpuvm_ops *ops = gpuvm->ops;
+
+	if (unlikely(!ops || !ops->vm_bo_validate))
+		return -EOPNOTSUPP;
+
+	if (drm_gpuvm_resv_protected(gpuvm))
+		return drm_gpuvm_validate_locked(gpuvm, exec);
+	else
+		return __drm_gpuvm_validate(gpuvm, exec);
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_validate);
+
+/**
+ * drm_gpuvm_resv_add_fence - add fence to private and all extobj
+ * dma-resv
+ * @gpuvm: the &drm_gpuvm to add a fence to
+ * @exec: the &drm_exec locking context
+ * @fence: fence to add
+ * @private_usage: private dma-resv usage
+ * @extobj_usage: extobj dma-resv usage
+ */
+void
+drm_gpuvm_resv_add_fence(struct drm_gpuvm *gpuvm,
+			 struct drm_exec *exec,
+			 struct dma_fence *fence,
+			 enum dma_resv_usage private_usage,
+			 enum dma_resv_usage extobj_usage)
+{
+	struct drm_gem_object *obj;
+	unsigned long index;
+
+	drm_exec_for_each_locked_object(exec, index, obj) {
+		dma_resv_assert_held(obj->resv);
+		dma_resv_add_fence(obj->resv, fence,
+				   drm_gpuvm_is_extobj(gpuvm, obj) ?
+				   extobj_usage : private_usage);
+	}
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_resv_add_fence);
+
+/**
+ * drm_gpuvm_bo_create() - create a new instance of struct drm_gpuvm_bo
+ * @gpuvm: The &drm_gpuvm the @obj is mapped in.
+ * @obj: The &drm_gem_object being mapped in the @gpuvm.
+ *
+ * If provided by the driver, this function uses the &drm_gpuvm_ops
+ * vm_bo_alloc() callback to allocate.
+ *
+ * Returns: a pointer to the &drm_gpuvm_bo on success, NULL on failure
+ */
+struct drm_gpuvm_bo *
+drm_gpuvm_bo_create(struct drm_gpuvm *gpuvm,
+		    struct drm_gem_object *obj)
+{
+	const struct drm_gpuvm_ops *ops = gpuvm->ops;
+	struct drm_gpuvm_bo *vm_bo;
+
+	if (ops && ops->vm_bo_alloc)
+		vm_bo = ops->vm_bo_alloc();
+	else
+		vm_bo = kzalloc(sizeof(*vm_bo), GFP_KERNEL);
+
+	if (unlikely(!vm_bo))
+		return NULL;
+
+	vm_bo->vm = drm_gpuvm_get(gpuvm);
+	vm_bo->obj = obj;
+	drm_gem_object_get(obj);
+
+	kref_init(&vm_bo->kref);
+	INIT_LIST_HEAD(&vm_bo->list.gpuva);
+	INIT_LIST_HEAD(&vm_bo->list.entry.gem);
+
+	INIT_LIST_HEAD(&vm_bo->list.entry.extobj);
+	INIT_LIST_HEAD(&vm_bo->list.entry.evict);
+
+	return vm_bo;
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_bo_create);
+
+static void
+drm_gpuvm_bo_destroy(struct kref *kref)
+{
+	struct drm_gpuvm_bo *vm_bo = container_of(kref, struct drm_gpuvm_bo,
+						  kref);
+	struct drm_gpuvm *gpuvm = vm_bo->vm;
+	const struct drm_gpuvm_ops *ops = gpuvm->ops;
+	struct drm_gem_object *obj = vm_bo->obj;
+	bool lock = !drm_gpuvm_resv_protected(gpuvm);
+
+	if (!lock)
+		drm_gpuvm_resv_assert_held(gpuvm);
+
+	drm_gpuvm_bo_list_del(vm_bo, extobj, lock);
+	drm_gpuvm_bo_list_del(vm_bo, evict, lock);
+
+	drm_gem_gpuva_assert_lock_held(obj);
+	list_del(&vm_bo->list.entry.gem);
+
+	if (ops && ops->vm_bo_free)
+		ops->vm_bo_free(vm_bo);
+	else
+		kfree(vm_bo);
+
+	drm_gpuvm_put(gpuvm);
+	drm_gem_object_put(obj);
+}
+
+/**
+ * drm_gpuvm_bo_put() - drop a struct drm_gpuvm_bo reference
+ * @vm_bo: the &drm_gpuvm_bo to release the reference of
+ *
+ * This releases a reference to @vm_bo.
+ *
+ * If the reference count drops to zero, the &gpuvm_bo is destroyed, which
+ * includes removing it from the GEMs gpuva list. Hence, if a call to this
+ * function can potentially let the reference count drop to zero the caller must
+ * hold the dma-resv or driver specific GEM gpuva lock.
+ *
+ * This function may only be called from non-atomic context.
+ *
+ * Returns: true if vm_bo was destroyed, false otherwise.
+ */
+bool
+drm_gpuvm_bo_put(struct drm_gpuvm_bo *vm_bo)
+{
+	might_sleep();
+
+	if (vm_bo)
+		return !!kref_put(&vm_bo->kref, drm_gpuvm_bo_destroy);
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_bo_put);
+
+static struct drm_gpuvm_bo *
+__drm_gpuvm_bo_find(struct drm_gpuvm *gpuvm,
+		    struct drm_gem_object *obj)
+{
+	struct drm_gpuvm_bo *vm_bo;
+
+	drm_gem_gpuva_assert_lock_held(obj);
+	drm_gem_for_each_gpuvm_bo(vm_bo, obj)
+		if (vm_bo->vm == gpuvm)
+			return vm_bo;
+
+	return NULL;
+}
+
+/**
+ * drm_gpuvm_bo_find() - find the &drm_gpuvm_bo for the given
+ * &drm_gpuvm and &drm_gem_object
+ * @gpuvm: The &drm_gpuvm the @obj is mapped in.
+ * @obj: The &drm_gem_object being mapped in the @gpuvm.
+ *
+ * Find the &drm_gpuvm_bo representing the combination of the given
+ * &drm_gpuvm and &drm_gem_object. If found, increases the reference
+ * count of the &drm_gpuvm_bo accordingly.
+ *
+ * Returns: a pointer to the &drm_gpuvm_bo on success, NULL on failure
+ */
+struct drm_gpuvm_bo *
+drm_gpuvm_bo_find(struct drm_gpuvm *gpuvm,
+		  struct drm_gem_object *obj)
+{
+	struct drm_gpuvm_bo *vm_bo = __drm_gpuvm_bo_find(gpuvm, obj);
+
+	return vm_bo ? drm_gpuvm_bo_get(vm_bo) : NULL;
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_bo_find);
+
+/**
+ * drm_gpuvm_bo_obtain() - obtains and instance of the &drm_gpuvm_bo for the
+ * given &drm_gpuvm and &drm_gem_object
+ * @gpuvm: The &drm_gpuvm the @obj is mapped in.
+ * @obj: The &drm_gem_object being mapped in the @gpuvm.
+ *
+ * Find the &drm_gpuvm_bo representing the combination of the given
+ * &drm_gpuvm and &drm_gem_object. If found, increases the reference
+ * count of the &drm_gpuvm_bo accordingly. If not found, allocates a new
+ * &drm_gpuvm_bo.
+ *
+ * A new &drm_gpuvm_bo is added to the GEMs gpuva list.
+ *
+ * Returns: a pointer to the &drm_gpuvm_bo on success, an ERR_PTR on failure
+ */
+struct drm_gpuvm_bo *
+drm_gpuvm_bo_obtain(struct drm_gpuvm *gpuvm,
+		    struct drm_gem_object *obj)
+{
+	struct drm_gpuvm_bo *vm_bo;
+
+	vm_bo = drm_gpuvm_bo_find(gpuvm, obj);
+	if (vm_bo)
+		return vm_bo;
+
+	vm_bo = drm_gpuvm_bo_create(gpuvm, obj);
+	if (!vm_bo)
+		return ERR_PTR(-ENOMEM);
+
+	drm_gem_gpuva_assert_lock_held(obj);
+	list_add_tail(&vm_bo->list.entry.gem, &obj->gpuva.list);
+
+	return vm_bo;
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_bo_obtain);
+
+/**
+ * drm_gpuvm_bo_obtain_prealloc() - obtains and instance of the &drm_gpuvm_bo
+ * for the given &drm_gpuvm and &drm_gem_object
+ * @__vm_bo: A pre-allocated struct drm_gpuvm_bo.
+ *
+ * Find the &drm_gpuvm_bo representing the combination of the given
+ * &drm_gpuvm and &drm_gem_object. If found, increases the reference
+ * count of the found &drm_gpuvm_bo accordingly, while the @__vm_bo reference
+ * count is decreased. If not found @__vm_bo is returned without further
+ * increase of the reference count.
+ *
+ * A new &drm_gpuvm_bo is added to the GEMs gpuva list.
+ *
+ * Returns: a pointer to the found &drm_gpuvm_bo or @__vm_bo if no existing
+ * &drm_gpuvm_bo was found
+ */
+struct drm_gpuvm_bo *
+drm_gpuvm_bo_obtain_prealloc(struct drm_gpuvm_bo *__vm_bo)
+{
+	struct drm_gpuvm *gpuvm = __vm_bo->vm;
+	struct drm_gem_object *obj = __vm_bo->obj;
+	struct drm_gpuvm_bo *vm_bo;
+
+	vm_bo = drm_gpuvm_bo_find(gpuvm, obj);
+	if (vm_bo) {
+		drm_gpuvm_bo_put(__vm_bo);
+		return vm_bo;
+	}
+
+	drm_gem_gpuva_assert_lock_held(obj);
+	list_add_tail(&__vm_bo->list.entry.gem, &obj->gpuva.list);
+
+	return __vm_bo;
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_bo_obtain_prealloc);
+
+/**
+ * drm_gpuvm_bo_extobj_add() - adds the &drm_gpuvm_bo to its &drm_gpuvm's
+ * extobj list
+ * @vm_bo: The &drm_gpuvm_bo to add to its &drm_gpuvm's the extobj list.
+ *
+ * Adds the given @vm_bo to its &drm_gpuvm's extobj list if not on the list
+ * already and if the corresponding &drm_gem_object is an external object,
+ * actually.
+ */
+void
+drm_gpuvm_bo_extobj_add(struct drm_gpuvm_bo *vm_bo)
+{
+	struct drm_gpuvm *gpuvm = vm_bo->vm;
+	bool lock = !drm_gpuvm_resv_protected(gpuvm);
+
+	if (!lock)
+		drm_gpuvm_resv_assert_held(gpuvm);
+
+	if (drm_gpuvm_is_extobj(gpuvm, vm_bo->obj))
+		drm_gpuvm_bo_list_add(vm_bo, extobj, lock);
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_bo_extobj_add);
+
+/**
+ * drm_gpuvm_bo_evict() - add / remove a &drm_gpuvm_bo to / from the &drm_gpuvms
+ * evicted list
+ * @vm_bo: the &drm_gpuvm_bo to add or remove
+ * @evict: indicates whether the object is evicted
+ *
+ * Adds a &drm_gpuvm_bo to or removes it from the &drm_gpuvms evicted list.
+ */
+void
+drm_gpuvm_bo_evict(struct drm_gpuvm_bo *vm_bo, bool evict)
+{
+	struct drm_gpuvm *gpuvm = vm_bo->vm;
+	struct drm_gem_object *obj = vm_bo->obj;
+	bool lock = !drm_gpuvm_resv_protected(gpuvm);
+
+	dma_resv_assert_held(obj->resv);
+	vm_bo->evicted = evict;
+
+	/* Can't add external objects to the evicted list directly if not using
+	 * internal spinlocks, since in this case the evicted list is protected
+	 * with the VM's common dma-resv lock.
+	 */
+	if (drm_gpuvm_is_extobj(gpuvm, obj) && !lock)
+		return;
+
+	if (evict)
+		drm_gpuvm_bo_list_add(vm_bo, evict, lock);
+	else
+		drm_gpuvm_bo_list_del_init(vm_bo, evict, lock);
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_bo_evict);
+
+static int
+__drm_gpuva_insert(struct drm_gpuvm *gpuvm,
+		   struct drm_gpuva *va)
+{
+	struct rb_node *node;
+	struct list_head *head;
+
+	if (drm_gpuva_it_iter_first(&gpuvm->rb.tree,
+				    GPUVA_START(va),
+				    GPUVA_LAST(va)))
+		return -EEXIST;
+
+	va->vm = gpuvm;
+
+	drm_gpuva_it_insert(va, &gpuvm->rb.tree);
+
+	node = rb_prev(&va->rb.node);
+	if (node)
+		head = &(to_drm_gpuva(node))->rb.entry;
+	else
+		head = &gpuvm->rb.list;
+
+	list_add(&va->rb.entry, head);
+
+	return 0;
+}
+
+/**
+ * drm_gpuva_insert() - insert a &drm_gpuva
+ * @gpuvm: the &drm_gpuvm to insert the &drm_gpuva in
+ * @va: the &drm_gpuva to insert
+ *
+ * Insert a &drm_gpuva with a given address and range into a
+ * &drm_gpuvm.
+ *
+ * It is safe to use this function using the safe versions of iterating the GPU
+ * VA space, such as drm_gpuvm_for_each_va_safe() and
+ * drm_gpuvm_for_each_va_range_safe().
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int
+drm_gpuva_insert(struct drm_gpuvm *gpuvm,
+		 struct drm_gpuva *va)
+{
+	u64 addr = va->va.addr;
+	u64 range = va->va.range;
+	int ret;
+
+	if (unlikely(!drm_gpuvm_range_valid(gpuvm, addr, range)))
+		return -EINVAL;
+
+	ret = __drm_gpuva_insert(gpuvm, va);
+	if (likely(!ret))
+		/* Take a reference of the GPUVM for the successfully inserted
+		 * drm_gpuva. We can't take the reference in
+		 * __drm_gpuva_insert() itself, since we don't want to increse
+		 * the reference count for the GPUVM's kernel_alloc_node.
+		 */
+		drm_gpuvm_get(gpuvm);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(drm_gpuva_insert);
+
+static void
+__drm_gpuva_remove(struct drm_gpuva *va)
+{
+	drm_gpuva_it_remove(va, &va->vm->rb.tree);
+	list_del_init(&va->rb.entry);
+}
+
+/**
+ * drm_gpuva_remove() - remove a &drm_gpuva
+ * @va: the &drm_gpuva to remove
+ *
+ * This removes the given &va from the underlaying tree.
+ *
+ * It is safe to use this function using the safe versions of iterating the GPU
+ * VA space, such as drm_gpuvm_for_each_va_safe() and
+ * drm_gpuvm_for_each_va_range_safe().
+ */
+void
+drm_gpuva_remove(struct drm_gpuva *va)
+{
+	struct drm_gpuvm *gpuvm = va->vm;
+
+	if (unlikely(va == &gpuvm->kernel_alloc_node)) {
+		drm_WARN(gpuvm->drm, 1,
+			 "Can't destroy kernel reserved node.\n");
+		return;
+	}
+
+	__drm_gpuva_remove(va);
+	drm_gpuvm_put(va->vm);
+}
+EXPORT_SYMBOL_GPL(drm_gpuva_remove);
+
+/**
+ * drm_gpuva_link() - link a &drm_gpuva
+ * @va: the &drm_gpuva to link
+ * @vm_bo: the &drm_gpuvm_bo to add the &drm_gpuva to
+ *
+ * This adds the given &va to the GPU VA list of the &drm_gpuvm_bo and the
+ * &drm_gpuvm_bo to the &drm_gem_object it is associated with.
+ *
+ * For every &drm_gpuva entry added to the &drm_gpuvm_bo an additional
+ * reference of the latter is taken.
+ *
+ * This function expects the caller to protect the GEM's GPUVA list against
+ * concurrent access using either the GEMs dma_resv lock or a driver specific
+ * lock set through drm_gem_gpuva_set_lock().
+ */
+void
+drm_gpuva_link(struct drm_gpuva *va, struct drm_gpuvm_bo *vm_bo)
+{
+	struct drm_gem_object *obj = va->gem.obj;
+	struct drm_gpuvm *gpuvm = va->vm;
+
+	if (unlikely(!obj))
+		return;
+
+	drm_WARN_ON(gpuvm->drm, obj != vm_bo->obj);
+
+	va->vm_bo = drm_gpuvm_bo_get(vm_bo);
+
+	drm_gem_gpuva_assert_lock_held(obj);
+	list_add_tail(&va->gem.entry, &vm_bo->list.gpuva);
+}
+EXPORT_SYMBOL_GPL(drm_gpuva_link);
+
+/**
+ * drm_gpuva_unlink() - unlink a &drm_gpuva
+ * @va: the &drm_gpuva to unlink
+ *
+ * This removes the given &va from the GPU VA list of the &drm_gem_object it is
+ * associated with.
+ *
+ * This removes the given &va from the GPU VA list of the &drm_gpuvm_bo and
+ * the &drm_gpuvm_bo from the &drm_gem_object it is associated with in case
+ * this call unlinks the last &drm_gpuva from the &drm_gpuvm_bo.
+ *
+ * For every &drm_gpuva entry removed from the &drm_gpuvm_bo a reference of
+ * the latter is dropped.
+ *
+ * This function expects the caller to protect the GEM's GPUVA list against
+ * concurrent access using either the GEMs dma_resv lock or a driver specific
+ * lock set through drm_gem_gpuva_set_lock().
+ */
+void
+drm_gpuva_unlink(struct drm_gpuva *va)
+{
+	struct drm_gem_object *obj = va->gem.obj;
+	struct drm_gpuvm_bo *vm_bo = va->vm_bo;
+
+	if (unlikely(!obj))
+		return;
+
+	drm_gem_gpuva_assert_lock_held(obj);
+	list_del_init(&va->gem.entry);
+
+	va->vm_bo = NULL;
+	drm_gpuvm_bo_put(vm_bo);
+}
+EXPORT_SYMBOL_GPL(drm_gpuva_unlink);
+
+/**
+ * drm_gpuva_find_first() - find the first &drm_gpuva in the given range
+ * @gpuvm: the &drm_gpuvm to search in
+ * @addr: the &drm_gpuvas address
+ * @range: the &drm_gpuvas range
+ *
+ * Returns: the first &drm_gpuva within the given range
+ */
+struct drm_gpuva *
+drm_gpuva_find_first(struct drm_gpuvm *gpuvm,
+		     u64 addr, u64 range)
+{
+	u64 last = addr + range - 1;
+
+	return drm_gpuva_it_iter_first(&gpuvm->rb.tree, addr, last);
+}
+EXPORT_SYMBOL_GPL(drm_gpuva_find_first);
+
+/**
+ * drm_gpuva_find() - find a &drm_gpuva
+ * @gpuvm: the &drm_gpuvm to search in
+ * @addr: the &drm_gpuvas address
+ * @range: the &drm_gpuvas range
+ *
+ * Returns: the &drm_gpuva at a given &addr and with a given &range
+ */
+struct drm_gpuva *
+drm_gpuva_find(struct drm_gpuvm *gpuvm,
+	       u64 addr, u64 range)
+{
+	struct drm_gpuva *va;
+
+	va = drm_gpuva_find_first(gpuvm, addr, range);
+	if (!va)
+		goto out;
+
+	if (va->va.addr != addr ||
+	    va->va.range != range)
+		goto out;
+
+	return va;
+
+out:
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(drm_gpuva_find);
+
+/**
+ * drm_gpuva_find_prev() - find the &drm_gpuva before the given address
+ * @gpuvm: the &drm_gpuvm to search in
+ * @start: the given GPU VA's start address
+ *
+ * Find the adjacent &drm_gpuva before the GPU VA with given &start address.
+ *
+ * Note that if there is any free space between the GPU VA mappings no mapping
+ * is returned.
+ *
+ * Returns: a pointer to the found &drm_gpuva or NULL if none was found
+ */
+struct drm_gpuva *
+drm_gpuva_find_prev(struct drm_gpuvm *gpuvm, u64 start)
+{
+	if (!drm_gpuvm_range_valid(gpuvm, start - 1, 1))
+		return NULL;
+
+	return drm_gpuva_it_iter_first(&gpuvm->rb.tree, start - 1, start);
+}
+EXPORT_SYMBOL_GPL(drm_gpuva_find_prev);
+
+/**
+ * drm_gpuva_find_next() - find the &drm_gpuva after the given address
+ * @gpuvm: the &drm_gpuvm to search in
+ * @end: the given GPU VA's end address
+ *
+ * Find the adjacent &drm_gpuva after the GPU VA with given &end address.
+ *
+ * Note that if there is any free space between the GPU VA mappings no mapping
+ * is returned.
+ *
+ * Returns: a pointer to the found &drm_gpuva or NULL if none was found
+ */
+struct drm_gpuva *
+drm_gpuva_find_next(struct drm_gpuvm *gpuvm, u64 end)
+{
+	if (!drm_gpuvm_range_valid(gpuvm, end, 1))
+		return NULL;
+
+	return drm_gpuva_it_iter_first(&gpuvm->rb.tree, end, end + 1);
+}
+EXPORT_SYMBOL_GPL(drm_gpuva_find_next);
+
+/**
+ * drm_gpuvm_interval_empty() - indicate whether a given interval of the VA space
+ * is empty
+ * @gpuvm: the &drm_gpuvm to check the range for
+ * @addr: the start address of the range
+ * @range: the range of the interval
+ *
+ * Returns: true if the interval is empty, false otherwise
+ */
+bool
+drm_gpuvm_interval_empty(struct drm_gpuvm *gpuvm, u64 addr, u64 range)
+{
+	return !drm_gpuva_find_first(gpuvm, addr, range);
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_interval_empty);
+
+/**
+ * drm_gpuva_map() - helper to insert a &drm_gpuva according to a
+ * &drm_gpuva_op_map
+ * @gpuvm: the &drm_gpuvm
+ * @va: the &drm_gpuva to insert
+ * @op: the &drm_gpuva_op_map to initialize @va with
+ *
+ * Initializes the @va from the @op and inserts it into the given @gpuvm.
+ */
+void
+drm_gpuva_map(struct drm_gpuvm *gpuvm,
+	      struct drm_gpuva *va,
+	      struct drm_gpuva_op_map *op)
+{
+	drm_gpuva_init_from_op(va, op);
+	drm_gpuva_insert(gpuvm, va);
+}
+EXPORT_SYMBOL_GPL(drm_gpuva_map);
+
+/**
+ * drm_gpuva_remap() - helper to remap a &drm_gpuva according to a
+ * &drm_gpuva_op_remap
+ * @prev: the &drm_gpuva to remap when keeping the start of a mapping
+ * @next: the &drm_gpuva to remap when keeping the end of a mapping
+ * @op: the &drm_gpuva_op_remap to initialize @prev and @next with
+ *
+ * Removes the currently mapped &drm_gpuva and remaps it using @prev and/or
+ * @next.
+ */
+void
+drm_gpuva_remap(struct drm_gpuva *prev,
+		struct drm_gpuva *next,
+		struct drm_gpuva_op_remap *op)
+{
+	struct drm_gpuva *va = op->unmap->va;
+	struct drm_gpuvm *gpuvm = va->vm;
+
+	drm_gpuva_remove(va);
+
+	if (op->prev) {
+		drm_gpuva_init_from_op(prev, op->prev);
+		drm_gpuva_insert(gpuvm, prev);
+	}
+
+	if (op->next) {
+		drm_gpuva_init_from_op(next, op->next);
+		drm_gpuva_insert(gpuvm, next);
+	}
+}
+EXPORT_SYMBOL_GPL(drm_gpuva_remap);
+
+/**
+ * drm_gpuva_unmap() - helper to remove a &drm_gpuva according to a
+ * &drm_gpuva_op_unmap
+ * @op: the &drm_gpuva_op_unmap specifying the &drm_gpuva to remove
+ *
+ * Removes the &drm_gpuva associated with the &drm_gpuva_op_unmap.
+ */
+void
+drm_gpuva_unmap(struct drm_gpuva_op_unmap *op)
+{
+	drm_gpuva_remove(op->va);
+}
+EXPORT_SYMBOL_GPL(drm_gpuva_unmap);
+
+static int
+op_map_cb(const struct drm_gpuvm_ops *fn, void *priv,
+	  u64 addr, u64 range,
+	  struct drm_gem_object *obj, u64 offset)
+{
+	struct drm_gpuva_op op = {};
+
+	op.op = DRM_GPUVA_OP_MAP;
+	op.map.va.addr = addr;
+	op.map.va.range = range;
+	op.map.gem.obj = obj;
+	op.map.gem.offset = offset;
+
+	return fn->sm_step_map(&op, priv);
+}
+
+static int
+op_remap_cb(const struct drm_gpuvm_ops *fn, void *priv,
+	    struct drm_gpuva_op_map *prev,
+	    struct drm_gpuva_op_map *next,
+	    struct drm_gpuva_op_unmap *unmap)
+{
+	struct drm_gpuva_op op = {};
+	struct drm_gpuva_op_remap *r;
+
+	op.op = DRM_GPUVA_OP_REMAP;
+	r = &op.remap;
+	r->prev = prev;
+	r->next = next;
+	r->unmap = unmap;
+
+	return fn->sm_step_remap(&op, priv);
+}
+
+static int
+op_unmap_cb(const struct drm_gpuvm_ops *fn, void *priv,
+	    struct drm_gpuva *va, bool merge)
+{
+	struct drm_gpuva_op op = {};
+
+	op.op = DRM_GPUVA_OP_UNMAP;
+	op.unmap.va = va;
+	op.unmap.keep = merge;
+
+	return fn->sm_step_unmap(&op, priv);
+}
+
+static int
+__drm_gpuvm_sm_map(struct drm_gpuvm *gpuvm,
+		   const struct drm_gpuvm_ops *ops, void *priv,
+		   u64 req_addr, u64 req_range,
+		   struct drm_gem_object *req_obj, u64 req_offset)
+{
+	struct drm_gpuva *va, *next;
+	u64 req_end = req_addr + req_range;
+	int ret;
+
+	if (unlikely(!drm_gpuvm_range_valid(gpuvm, req_addr, req_range)))
+		return -EINVAL;
+
+	drm_gpuvm_for_each_va_range_safe(va, next, gpuvm, req_addr, req_end) {
+		struct drm_gem_object *obj = va->gem.obj;
+		u64 offset = va->gem.offset;
+		u64 addr = va->va.addr;
+		u64 range = va->va.range;
+		u64 end = addr + range;
+		bool merge = !!va->gem.obj;
+
+		if (addr == req_addr) {
+			merge &= obj == req_obj &&
+				 offset == req_offset;
+
+			if (end == req_end) {
+				ret = op_unmap_cb(ops, priv, va, merge);
+				if (ret)
+					return ret;
+				break;
+			}
+
+			if (end < req_end) {
+				ret = op_unmap_cb(ops, priv, va, merge);
+				if (ret)
+					return ret;
+				continue;
+			}
+
+			if (end > req_end) {
+				struct drm_gpuva_op_map n = {
+					.va.addr = req_end,
+					.va.range = range - req_range,
+					.gem.obj = obj,
+					.gem.offset = offset + req_range,
+				};
+				struct drm_gpuva_op_unmap u = {
+					.va = va,
+					.keep = merge,
+				};
+
+				ret = op_remap_cb(ops, priv, NULL, &n, &u);
+				if (ret)
+					return ret;
+				break;
+			}
+		} else if (addr < req_addr) {
+			u64 ls_range = req_addr - addr;
+			struct drm_gpuva_op_map p = {
+				.va.addr = addr,
+				.va.range = ls_range,
+				.gem.obj = obj,
+				.gem.offset = offset,
+			};
+			struct drm_gpuva_op_unmap u = { .va = va };
+
+			merge &= obj == req_obj &&
+				 offset + ls_range == req_offset;
+			u.keep = merge;
+
+			if (end == req_end) {
+				ret = op_remap_cb(ops, priv, &p, NULL, &u);
+				if (ret)
+					return ret;
+				break;
+			}
+
+			if (end < req_end) {
+				ret = op_remap_cb(ops, priv, &p, NULL, &u);
+				if (ret)
+					return ret;
+				continue;
+			}
+
+			if (end > req_end) {
+				struct drm_gpuva_op_map n = {
+					.va.addr = req_end,
+					.va.range = end - req_end,
+					.gem.obj = obj,
+					.gem.offset = offset + ls_range +
+						      req_range,
+				};
+
+				ret = op_remap_cb(ops, priv, &p, &n, &u);
+				if (ret)
+					return ret;
+				break;
+			}
+		} else if (addr > req_addr) {
+			merge &= obj == req_obj &&
+				 offset == req_offset +
+					   (addr - req_addr);
+
+			if (end == req_end) {
+				ret = op_unmap_cb(ops, priv, va, merge);
+				if (ret)
+					return ret;
+				break;
+			}
+
+			if (end < req_end) {
+				ret = op_unmap_cb(ops, priv, va, merge);
+				if (ret)
+					return ret;
+				continue;
+			}
+
+			if (end > req_end) {
+				struct drm_gpuva_op_map n = {
+					.va.addr = req_end,
+					.va.range = end - req_end,
+					.gem.obj = obj,
+					.gem.offset = offset + req_end - addr,
+				};
+				struct drm_gpuva_op_unmap u = {
+					.va = va,
+					.keep = merge,
+				};
+
+				ret = op_remap_cb(ops, priv, NULL, &n, &u);
+				if (ret)
+					return ret;
+				break;
+			}
+		}
+	}
+
+	return op_map_cb(ops, priv,
+			 req_addr, req_range,
+			 req_obj, req_offset);
+}
+
+static int
+__drm_gpuvm_sm_unmap(struct drm_gpuvm *gpuvm,
+		     const struct drm_gpuvm_ops *ops, void *priv,
+		     u64 req_addr, u64 req_range)
+{
+	struct drm_gpuva *va, *next;
+	u64 req_end = req_addr + req_range;
+	int ret;
+
+	if (unlikely(!drm_gpuvm_range_valid(gpuvm, req_addr, req_range)))
+		return -EINVAL;
+
+	drm_gpuvm_for_each_va_range_safe(va, next, gpuvm, req_addr, req_end) {
+		struct drm_gpuva_op_map prev = {}, next = {};
+		bool prev_split = false, next_split = false;
+		struct drm_gem_object *obj = va->gem.obj;
+		u64 offset = va->gem.offset;
+		u64 addr = va->va.addr;
+		u64 range = va->va.range;
+		u64 end = addr + range;
+
+		if (addr < req_addr) {
+			prev.va.addr = addr;
+			prev.va.range = req_addr - addr;
+			prev.gem.obj = obj;
+			prev.gem.offset = offset;
+
+			prev_split = true;
+		}
+
+		if (end > req_end) {
+			next.va.addr = req_end;
+			next.va.range = end - req_end;
+			next.gem.obj = obj;
+			next.gem.offset = offset + (req_end - addr);
+
+			next_split = true;
+		}
+
+		if (prev_split || next_split) {
+			struct drm_gpuva_op_unmap unmap = { .va = va };
+
+			ret = op_remap_cb(ops, priv,
+					  prev_split ? &prev : NULL,
+					  next_split ? &next : NULL,
+					  &unmap);
+			if (ret)
+				return ret;
+		} else {
+			ret = op_unmap_cb(ops, priv, va, false);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * drm_gpuvm_sm_map() - creates the &drm_gpuva_op split/merge steps
+ * @gpuvm: the &drm_gpuvm representing the GPU VA space
+ * @req_addr: the start address of the new mapping
+ * @req_range: the range of the new mapping
+ * @req_obj: the &drm_gem_object to map
+ * @req_offset: the offset within the &drm_gem_object
+ * @priv: pointer to a driver private data structure
+ *
+ * This function iterates the given range of the GPU VA space. It utilizes the
+ * &drm_gpuvm_ops to call back into the driver providing the split and merge
+ * steps.
+ *
+ * Drivers may use these callbacks to update the GPU VA space right away within
+ * the callback. In case the driver decides to copy and store the operations for
+ * later processing neither this function nor &drm_gpuvm_sm_unmap is allowed to
+ * be called before the &drm_gpuvm's view of the GPU VA space was
+ * updated with the previous set of operations. To update the
+ * &drm_gpuvm's view of the GPU VA space drm_gpuva_insert(),
+ * drm_gpuva_destroy_locked() and/or drm_gpuva_destroy_unlocked() should be
+ * used.
+ *
+ * A sequence of callbacks can contain map, unmap and remap operations, but
+ * the sequence of callbacks might also be empty if no operation is required,
+ * e.g. if the requested mapping already exists in the exact same way.
+ *
+ * There can be an arbitrary amount of unmap operations, a maximum of two remap
+ * operations and a single map operation. The latter one represents the original
+ * map operation requested by the caller.
+ *
+ * Returns: 0 on success or a negative error code
+ */
+int
+drm_gpuvm_sm_map(struct drm_gpuvm *gpuvm, void *priv,
+		 u64 req_addr, u64 req_range,
+		 struct drm_gem_object *req_obj, u64 req_offset)
+{
+	const struct drm_gpuvm_ops *ops = gpuvm->ops;
+
+	if (unlikely(!(ops && ops->sm_step_map &&
+		       ops->sm_step_remap &&
+		       ops->sm_step_unmap)))
+		return -EINVAL;
+
+	return __drm_gpuvm_sm_map(gpuvm, ops, priv,
+				  req_addr, req_range,
+				  req_obj, req_offset);
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_sm_map);
+
+/**
+ * drm_gpuvm_sm_unmap() - creates the &drm_gpuva_ops to split on unmap
+ * @gpuvm: the &drm_gpuvm representing the GPU VA space
+ * @priv: pointer to a driver private data structure
+ * @req_addr: the start address of the range to unmap
+ * @req_range: the range of the mappings to unmap
+ *
+ * This function iterates the given range of the GPU VA space. It utilizes the
+ * &drm_gpuvm_ops to call back into the driver providing the operations to
+ * unmap and, if required, split existent mappings.
+ *
+ * Drivers may use these callbacks to update the GPU VA space right away within
+ * the callback. In case the driver decides to copy and store the operations for
+ * later processing neither this function nor &drm_gpuvm_sm_map is allowed to be
+ * called before the &drm_gpuvm's view of the GPU VA space was updated
+ * with the previous set of operations. To update the &drm_gpuvm's view
+ * of the GPU VA space drm_gpuva_insert(), drm_gpuva_destroy_locked() and/or
+ * drm_gpuva_destroy_unlocked() should be used.
+ *
+ * A sequence of callbacks can contain unmap and remap operations, depending on
+ * whether there are actual overlapping mappings to split.
+ *
+ * There can be an arbitrary amount of unmap operations and a maximum of two
+ * remap operations.
+ *
+ * Returns: 0 on success or a negative error code
+ */
+int
+drm_gpuvm_sm_unmap(struct drm_gpuvm *gpuvm, void *priv,
+		   u64 req_addr, u64 req_range)
+{
+	const struct drm_gpuvm_ops *ops = gpuvm->ops;
+
+	if (unlikely(!(ops && ops->sm_step_remap &&
+		       ops->sm_step_unmap)))
+		return -EINVAL;
+
+	return __drm_gpuvm_sm_unmap(gpuvm, ops, priv,
+				    req_addr, req_range);
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_sm_unmap);
+
+static struct drm_gpuva_op *
+gpuva_op_alloc(struct drm_gpuvm *gpuvm)
+{
+	const struct drm_gpuvm_ops *fn = gpuvm->ops;
+	struct drm_gpuva_op *op;
+
+	if (fn && fn->op_alloc)
+		op = fn->op_alloc();
+	else
+		op = kzalloc(sizeof(*op), GFP_KERNEL);
+
+	if (unlikely(!op))
+		return NULL;
+
+	return op;
+}
+
+static void
+gpuva_op_free(struct drm_gpuvm *gpuvm,
+	      struct drm_gpuva_op *op)
+{
+	const struct drm_gpuvm_ops *fn = gpuvm->ops;
+
+	if (fn && fn->op_free)
+		fn->op_free(op);
+	else
+		kfree(op);
+}
+
+static int
+drm_gpuva_sm_step(struct drm_gpuva_op *__op,
+		  void *priv)
+{
+	struct {
+		struct drm_gpuvm *vm;
+		struct drm_gpuva_ops *ops;
+	} *args = priv;
+	struct drm_gpuvm *gpuvm = args->vm;
+	struct drm_gpuva_ops *ops = args->ops;
+	struct drm_gpuva_op *op;
+
+	op = gpuva_op_alloc(gpuvm);
+	if (unlikely(!op))
+		goto err;
+
+	memcpy(op, __op, sizeof(*op));
+
+	if (op->op == DRM_GPUVA_OP_REMAP) {
+		struct drm_gpuva_op_remap *__r = &__op->remap;
+		struct drm_gpuva_op_remap *r = &op->remap;
+
+		r->unmap = kmemdup(__r->unmap, sizeof(*r->unmap),
+				   GFP_KERNEL);
+		if (unlikely(!r->unmap))
+			goto err_free_op;
+
+		if (__r->prev) {
+			r->prev = kmemdup(__r->prev, sizeof(*r->prev),
+					  GFP_KERNEL);
+			if (unlikely(!r->prev))
+				goto err_free_unmap;
+		}
+
+		if (__r->next) {
+			r->next = kmemdup(__r->next, sizeof(*r->next),
+					  GFP_KERNEL);
+			if (unlikely(!r->next))
+				goto err_free_prev;
+		}
+	}
+
+	list_add_tail(&op->entry, &ops->list);
+
+	return 0;
+
+err_free_unmap:
+	kfree(op->remap.unmap);
+err_free_prev:
+	kfree(op->remap.prev);
+err_free_op:
+	gpuva_op_free(gpuvm, op);
+err:
+	return -ENOMEM;
+}
+
+static const struct drm_gpuvm_ops gpuvm_list_ops = {
+	.sm_step_map = drm_gpuva_sm_step,
+	.sm_step_remap = drm_gpuva_sm_step,
+	.sm_step_unmap = drm_gpuva_sm_step,
+};
+
+/**
+ * drm_gpuvm_sm_map_ops_create() - creates the &drm_gpuva_ops to split and merge
+ * @gpuvm: the &drm_gpuvm representing the GPU VA space
+ * @req_addr: the start address of the new mapping
+ * @req_range: the range of the new mapping
+ * @req_obj: the &drm_gem_object to map
+ * @req_offset: the offset within the &drm_gem_object
+ *
+ * This function creates a list of operations to perform splitting and merging
+ * of existent mapping(s) with the newly requested one.
+ *
+ * The list can be iterated with &drm_gpuva_for_each_op and must be processed
+ * in the given order. It can contain map, unmap and remap operations, but it
+ * also can be empty if no operation is required, e.g. if the requested mapping
+ * already exists is the exact same way.
+ *
+ * There can be an arbitrary amount of unmap operations, a maximum of two remap
+ * operations and a single map operation. The latter one represents the original
+ * map operation requested by the caller.
+ *
+ * Note that before calling this function again with another mapping request it
+ * is necessary to update the &drm_gpuvm's view of the GPU VA space. The
+ * previously obtained operations must be either processed or abandoned. To
+ * update the &drm_gpuvm's view of the GPU VA space drm_gpuva_insert(),
+ * drm_gpuva_destroy_locked() and/or drm_gpuva_destroy_unlocked() should be
+ * used.
+ *
+ * After the caller finished processing the returned &drm_gpuva_ops, they must
+ * be freed with &drm_gpuva_ops_free.
+ *
+ * Returns: a pointer to the &drm_gpuva_ops on success, an ERR_PTR on failure
+ */
+struct drm_gpuva_ops *
+drm_gpuvm_sm_map_ops_create(struct drm_gpuvm *gpuvm,
+			    u64 req_addr, u64 req_range,
+			    struct drm_gem_object *req_obj, u64 req_offset)
+{
+	struct drm_gpuva_ops *ops;
+	struct {
+		struct drm_gpuvm *vm;
+		struct drm_gpuva_ops *ops;
+	} args;
+	int ret;
+
+	ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+	if (unlikely(!ops))
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&ops->list);
+
+	args.vm = gpuvm;
+	args.ops = ops;
+
+	ret = __drm_gpuvm_sm_map(gpuvm, &gpuvm_list_ops, &args,
+				 req_addr, req_range,
+				 req_obj, req_offset);
+	if (ret)
+		goto err_free_ops;
+
+	return ops;
+
+err_free_ops:
+	drm_gpuva_ops_free(gpuvm, ops);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_sm_map_ops_create);
+
+/**
+ * drm_gpuvm_sm_unmap_ops_create() - creates the &drm_gpuva_ops to split on
+ * unmap
+ * @gpuvm: the &drm_gpuvm representing the GPU VA space
+ * @req_addr: the start address of the range to unmap
+ * @req_range: the range of the mappings to unmap
+ *
+ * This function creates a list of operations to perform unmapping and, if
+ * required, splitting of the mappings overlapping the unmap range.
+ *
+ * The list can be iterated with &drm_gpuva_for_each_op and must be processed
+ * in the given order. It can contain unmap and remap operations, depending on
+ * whether there are actual overlapping mappings to split.
+ *
+ * There can be an arbitrary amount of unmap operations and a maximum of two
+ * remap operations.
+ *
+ * Note that before calling this function again with another range to unmap it
+ * is necessary to update the &drm_gpuvm's view of the GPU VA space. The
+ * previously obtained operations must be processed or abandoned. To update the
+ * &drm_gpuvm's view of the GPU VA space drm_gpuva_insert(),
+ * drm_gpuva_destroy_locked() and/or drm_gpuva_destroy_unlocked() should be
+ * used.
+ *
+ * After the caller finished processing the returned &drm_gpuva_ops, they must
+ * be freed with &drm_gpuva_ops_free.
+ *
+ * Returns: a pointer to the &drm_gpuva_ops on success, an ERR_PTR on failure
+ */
+struct drm_gpuva_ops *
+drm_gpuvm_sm_unmap_ops_create(struct drm_gpuvm *gpuvm,
+			      u64 req_addr, u64 req_range)
+{
+	struct drm_gpuva_ops *ops;
+	struct {
+		struct drm_gpuvm *vm;
+		struct drm_gpuva_ops *ops;
+	} args;
+	int ret;
+
+	ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+	if (unlikely(!ops))
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&ops->list);
+
+	args.vm = gpuvm;
+	args.ops = ops;
+
+	ret = __drm_gpuvm_sm_unmap(gpuvm, &gpuvm_list_ops, &args,
+				   req_addr, req_range);
+	if (ret)
+		goto err_free_ops;
+
+	return ops;
+
+err_free_ops:
+	drm_gpuva_ops_free(gpuvm, ops);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_sm_unmap_ops_create);
+
+/**
+ * drm_gpuvm_prefetch_ops_create() - creates the &drm_gpuva_ops to prefetch
+ * @gpuvm: the &drm_gpuvm representing the GPU VA space
+ * @addr: the start address of the range to prefetch
+ * @range: the range of the mappings to prefetch
+ *
+ * This function creates a list of operations to perform prefetching.
+ *
+ * The list can be iterated with &drm_gpuva_for_each_op and must be processed
+ * in the given order. It can contain prefetch operations.
+ *
+ * There can be an arbitrary amount of prefetch operations.
+ *
+ * After the caller finished processing the returned &drm_gpuva_ops, they must
+ * be freed with &drm_gpuva_ops_free.
+ *
+ * Returns: a pointer to the &drm_gpuva_ops on success, an ERR_PTR on failure
+ */
+struct drm_gpuva_ops *
+drm_gpuvm_prefetch_ops_create(struct drm_gpuvm *gpuvm,
+			      u64 addr, u64 range)
+{
+	struct drm_gpuva_ops *ops;
+	struct drm_gpuva_op *op;
+	struct drm_gpuva *va;
+	u64 end = addr + range;
+	int ret;
+
+	ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+	if (!ops)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&ops->list);
+
+	drm_gpuvm_for_each_va_range(va, gpuvm, addr, end) {
+		op = gpuva_op_alloc(gpuvm);
+		if (!op) {
+			ret = -ENOMEM;
+			goto err_free_ops;
+		}
+
+		op->op = DRM_GPUVA_OP_PREFETCH;
+		op->prefetch.va = va;
+		list_add_tail(&op->entry, &ops->list);
+	}
+
+	return ops;
+
+err_free_ops:
+	drm_gpuva_ops_free(gpuvm, ops);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_prefetch_ops_create);
+
+/**
+ * drm_gpuvm_bo_unmap_ops_create() - creates the &drm_gpuva_ops to unmap a GEM
+ * @vm_bo: the &drm_gpuvm_bo abstraction
+ *
+ * This function creates a list of operations to perform unmapping for every
+ * GPUVA attached to a GEM.
+ *
+ * The list can be iterated with &drm_gpuva_for_each_op and consists out of an
+ * arbitrary amount of unmap operations.
+ *
+ * After the caller finished processing the returned &drm_gpuva_ops, they must
+ * be freed with &drm_gpuva_ops_free.
+ *
+ * It is the callers responsibility to protect the GEMs GPUVA list against
+ * concurrent access using the GEMs dma_resv lock.
+ *
+ * Returns: a pointer to the &drm_gpuva_ops on success, an ERR_PTR on failure
+ */
+struct drm_gpuva_ops *
+drm_gpuvm_bo_unmap_ops_create(struct drm_gpuvm_bo *vm_bo)
+{
+	struct drm_gpuva_ops *ops;
+	struct drm_gpuva_op *op;
+	struct drm_gpuva *va;
+	int ret;
+
+	drm_gem_gpuva_assert_lock_held(vm_bo->obj);
+
+	ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+	if (!ops)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&ops->list);
+
+	drm_gpuvm_bo_for_each_va(va, vm_bo) {
+		op = gpuva_op_alloc(vm_bo->vm);
+		if (!op) {
+			ret = -ENOMEM;
+			goto err_free_ops;
+		}
+
+		op->op = DRM_GPUVA_OP_UNMAP;
+		op->unmap.va = va;
+		list_add_tail(&op->entry, &ops->list);
+	}
+
+	return ops;
+
+err_free_ops:
+	drm_gpuva_ops_free(vm_bo->vm, ops);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_bo_unmap_ops_create);
+
+/**
+ * drm_gpuva_ops_free() - free the given &drm_gpuva_ops
+ * @gpuvm: the &drm_gpuvm the ops were created for
+ * @ops: the &drm_gpuva_ops to free
+ *
+ * Frees the given &drm_gpuva_ops structure including all the ops associated
+ * with it.
+ */
+void
+drm_gpuva_ops_free(struct drm_gpuvm *gpuvm,
+		   struct drm_gpuva_ops *ops)
+{
+	struct drm_gpuva_op *op, *next;
+
+	drm_gpuva_for_each_op_safe(op, next, ops) {
+		list_del(&op->entry);
+
+		if (op->op == DRM_GPUVA_OP_REMAP) {
+			kfree(op->remap.prev);
+			kfree(op->remap.next);
+			kfree(op->remap.unmap);
+		}
+
+		gpuva_op_free(gpuvm, op);
+	}
+
+	kfree(ops);
+}
+EXPORT_SYMBOL_GPL(drm_gpuva_ops_free);
+
+MODULE_DESCRIPTION("DRM GPUVM");
+MODULE_LICENSE("GPL");
diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c
index 4b839f17f315a..4f6895096c704 100644
--- a/drivers/gpu/drm/drm_ioctl.c
+++ b/drivers/gpu/drm/drm_ioctl.c
@@ -245,8 +245,7 @@ static int drm_getcap(struct drm_device *dev, void *data, struct drm_file *file_
 		req->value = 1;
 		return 0;
 	case DRM_CAP_PRIME:
-		req->value |= dev->driver->prime_fd_to_handle ? DRM_PRIME_CAP_IMPORT : 0;
-		req->value |= dev->driver->prime_handle_to_fd ? DRM_PRIME_CAP_EXPORT : 0;
+		req->value = DRM_PRIME_CAP_IMPORT | DRM_PRIME_CAP_EXPORT;
 		return 0;
 	case DRM_CAP_SYNCOBJ:
 		req->value = drm_core_check_feature(dev, DRIVER_SYNCOBJ);
diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c
index eb09e86044c6d..c631a20069858 100644
--- a/drivers/gpu/drm/drm_prime.c
+++ b/drivers/gpu/drm/drm_prime.c
@@ -51,15 +51,10 @@ MODULE_IMPORT_NS(DMA_BUF);
  * between applications, they can't be guessed like the globally unique GEM
  * names.
  *
- * Drivers that support the PRIME API implement the
- * &drm_driver.prime_handle_to_fd and &drm_driver.prime_fd_to_handle operations.
- * GEM based drivers must use drm_gem_prime_handle_to_fd() and
- * drm_gem_prime_fd_to_handle() to implement these. For GEM based drivers the
- * actual driver interfaces is provided through the &drm_gem_object_funcs.export
- * and &drm_driver.gem_prime_import hooks.
- *
- * &dma_buf_ops implementations for GEM drivers are all individually exported
- * for drivers which need to overwrite or reimplement some of them.
+ * Drivers that support the PRIME API implement the drm_gem_object_funcs.export
+ * and &drm_driver.gem_prime_import hooks. &dma_buf_ops implementations for
+ * drivers are all individually exported for drivers which need to overwrite
+ * or reimplement some of them.
  *
  * Reference Counting for GEM Drivers
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -372,11 +367,12 @@ int drm_prime_fd_to_handle_ioctl(struct drm_device *dev, void *data,
 {
 	struct drm_prime_handle *args = data;
 
-	if (!dev->driver->prime_fd_to_handle)
-		return -ENOSYS;
+	if (dev->driver->prime_fd_to_handle) {
+		return dev->driver->prime_fd_to_handle(dev, file_priv, args->fd,
+						       &args->handle);
+	}
 
-	return dev->driver->prime_fd_to_handle(dev, file_priv,
-			args->fd, &args->handle);
+	return drm_gem_prime_fd_to_handle(dev, file_priv, args->fd, &args->handle);
 }
 
 static struct dma_buf *export_and_register_object(struct drm_device *dev,
@@ -518,15 +514,17 @@ int drm_prime_handle_to_fd_ioctl(struct drm_device *dev, void *data,
 {
 	struct drm_prime_handle *args = data;
 
-	if (!dev->driver->prime_handle_to_fd)
-		return -ENOSYS;
-
 	/* check flags are valid */
 	if (args->flags & ~(DRM_CLOEXEC | DRM_RDWR))
 		return -EINVAL;
 
-	return dev->driver->prime_handle_to_fd(dev, file_priv,
-			args->handle, args->flags, &args->fd);
+	if (dev->driver->prime_handle_to_fd) {
+		return dev->driver->prime_handle_to_fd(dev, file_priv,
+						       args->handle, args->flags,
+						       &args->fd);
+	}
+	return drm_gem_prime_handle_to_fd(dev, file_priv, args->handle,
+					  args->flags, &args->fd);
 }
 
 /**
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_drv.c b/drivers/gpu/drm/etnaviv/etnaviv_drv.c
index 1d2b4fb4bcf8b..7ebce549fb914 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_drv.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_drv.c
@@ -474,8 +474,6 @@ static const struct drm_driver etnaviv_drm_driver = {
 	.driver_features    = DRIVER_GEM | DRIVER_RENDER,
 	.open               = etnaviv_open,
 	.postclose           = etnaviv_postclose,
-	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
 	.gem_prime_import_sg_table = etnaviv_gem_prime_import_sg_table,
 	.gem_prime_mmap     = drm_gem_prime_mmap,
 #ifdef CONFIG_DEBUG_FS
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
index 1ac916b248917..ceaebf793909b 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
@@ -531,7 +531,7 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data,
 
 	ret = drm_sched_job_init(&submit->sched_job,
 				 &ctx->sched_entity[args->pipe],
-				 submit->ctx);
+				 1, submit->ctx);
 	if (ret)
 		goto err_submit_put;
 
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
index f667e7906d1f4..b08709715abd1 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
@@ -1870,7 +1870,7 @@ static int etnaviv_gpu_rpm_suspend(struct device *dev)
 	u32 idle, mask;
 
 	/* If there are any jobs in the HW queue, we're not idle */
-	if (atomic_read(&gpu->sched.hw_rq_count))
+	if (atomic_read(&gpu->sched.credit_count))
 		return -EBUSY;
 
 	/* Check whether the hardware (except FE and MC) is idle */
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index 72e2553fbc984..8d8ef85cbad57 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -133,7 +133,8 @@ int etnaviv_sched_init(struct etnaviv_gpu *gpu)
 {
 	int ret;
 
-	ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops,
+	ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops, NULL,
+			     DRM_SCHED_PRIORITY_COUNT,
 			     etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
 			     msecs_to_jiffies(500), NULL, NULL,
 			     dev_name(gpu->dev), gpu->dev);
diff --git a/drivers/gpu/drm/exynos/exynos_drm_drv.c b/drivers/gpu/drm/exynos/exynos_drm_drv.c
index 16c539657f730..1b2951edc2607 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_drv.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_drv.c
@@ -111,8 +111,6 @@ static const struct drm_driver exynos_drm_driver = {
 	.lastclose		= drm_fb_helper_lastclose,
 	.postclose		= exynos_drm_postclose,
 	.dumb_create		= exynos_drm_gem_dumb_create,
-	.prime_handle_to_fd	= drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle	= drm_gem_prime_fd_to_handle,
 	.gem_prime_import	= exynos_drm_gem_prime_import,
 	.gem_prime_import_sg_table	= exynos_drm_gem_prime_import_sg_table,
 	.gem_prime_mmap		= drm_gem_prime_mmap,
diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c
index 35bc2a3fa811c..2c9e0da31cf6c 100644
--- a/drivers/gpu/drm/i915/i915_driver.c
+++ b/drivers/gpu/drm/i915/i915_driver.c
@@ -1920,8 +1920,6 @@ static const struct drm_driver i915_drm_driver = {
 	.lastclose = i915_driver_lastclose,
 	.postclose = i915_driver_postclose,
 
-	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
 	.gem_prime_import = i915_gem_prime_import,
 
 	.dumb_create = i915_gem_dumb_create,
diff --git a/drivers/gpu/drm/lima/lima_device.c b/drivers/gpu/drm/lima/lima_device.c
index 02cef0cea6572..0bf7105c8748b 100644
--- a/drivers/gpu/drm/lima/lima_device.c
+++ b/drivers/gpu/drm/lima/lima_device.c
@@ -514,7 +514,7 @@ int lima_device_suspend(struct device *dev)
 
 	/* check any task running */
 	for (i = 0; i < lima_pipe_num; i++) {
-		if (atomic_read(&ldev->pipe[i].base.hw_rq_count))
+		if (atomic_read(&ldev->pipe[i].base.credit_count))
 			return -EBUSY;
 	}
 
diff --git a/drivers/gpu/drm/lima/lima_drv.c b/drivers/gpu/drm/lima/lima_drv.c
index 39cab4a55f572..51efdc107314f 100644
--- a/drivers/gpu/drm/lima/lima_drv.c
+++ b/drivers/gpu/drm/lima/lima_drv.c
@@ -276,9 +276,7 @@ static const struct drm_driver lima_drm_driver = {
 	.patchlevel         = 0,
 
 	.gem_create_object  = lima_gem_create_object,
-	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
 	.gem_prime_import_sg_table = drm_gem_shmem_prime_import_sg_table,
-	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
 	.gem_prime_mmap = drm_gem_prime_mmap,
 };
 
diff --git a/drivers/gpu/drm/lima/lima_gem.c b/drivers/gpu/drm/lima/lima_gem.c
index 0f1ca0b0db495..5008f0c2428f9 100644
--- a/drivers/gpu/drm/lima/lima_gem.c
+++ b/drivers/gpu/drm/lima/lima_gem.c
@@ -34,7 +34,7 @@ int lima_heap_alloc(struct lima_bo *bo, struct lima_vm *vm)
 
 	new_size = min(new_size, bo->base.base.size);
 
-	mutex_lock(&bo->base.pages_lock);
+	dma_resv_lock(bo->base.base.resv, NULL);
 
 	if (bo->base.pages) {
 		pages = bo->base.pages;
@@ -42,7 +42,7 @@ int lima_heap_alloc(struct lima_bo *bo, struct lima_vm *vm)
 		pages = kvmalloc_array(bo->base.base.size >> PAGE_SHIFT,
 				       sizeof(*pages), GFP_KERNEL | __GFP_ZERO);
 		if (!pages) {
-			mutex_unlock(&bo->base.pages_lock);
+			dma_resv_unlock(bo->base.base.resv);
 			return -ENOMEM;
 		}
 
@@ -56,13 +56,13 @@ int lima_heap_alloc(struct lima_bo *bo, struct lima_vm *vm)
 		struct page *page = shmem_read_mapping_page(mapping, i);
 
 		if (IS_ERR(page)) {
-			mutex_unlock(&bo->base.pages_lock);
+			dma_resv_unlock(bo->base.base.resv);
 			return PTR_ERR(page);
 		}
 		pages[i] = page;
 	}
 
-	mutex_unlock(&bo->base.pages_lock);
+	dma_resv_unlock(bo->base.base.resv);
 
 	ret = sg_alloc_table_from_pages(&sgt, pages, i, 0,
 					new_size, GFP_KERNEL);
diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
index e82931712d8a2..b59269db0ffa6 100644
--- a/drivers/gpu/drm/lima/lima_sched.c
+++ b/drivers/gpu/drm/lima/lima_sched.c
@@ -123,7 +123,7 @@ int lima_sched_task_init(struct lima_sched_task *task,
 	for (i = 0; i < num_bos; i++)
 		drm_gem_object_get(&bos[i]->base.base);
 
-	err = drm_sched_job_init(&task->base, &context->base, vm);
+	err = drm_sched_job_init(&task->base, &context->base, 1, vm);
 	if (err) {
 		kfree(task->bos);
 		return err;
@@ -371,7 +371,7 @@ static void lima_sched_build_error_task_list(struct lima_sched_task *task)
 		} else {
 			buffer_chunk->size = lima_bo_size(bo);
 
-			ret = drm_gem_shmem_vmap(&bo->base, &map);
+			ret = drm_gem_vmap_unlocked(&bo->base.base, &map);
 			if (ret) {
 				kvfree(et);
 				goto out;
@@ -379,7 +379,7 @@ static void lima_sched_build_error_task_list(struct lima_sched_task *task)
 
 			memcpy(buffer_chunk + 1, map.vaddr, buffer_chunk->size);
 
-			drm_gem_shmem_vunmap(&bo->base, &map);
+			drm_gem_vunmap_unlocked(&bo->base.base, &map);
 		}
 
 		buffer_chunk = (void *)(buffer_chunk + 1) + buffer_chunk->size;
@@ -488,7 +488,9 @@ int lima_sched_pipe_init(struct lima_sched_pipe *pipe, const char *name)
 
 	INIT_WORK(&pipe->recover_work, lima_sched_recover_work);
 
-	return drm_sched_init(&pipe->base, &lima_sched_ops, 1,
+	return drm_sched_init(&pipe->base, &lima_sched_ops, NULL,
+			      DRM_SCHED_PRIORITY_COUNT,
+			      1,
 			      lima_job_hang_limit,
 			      msecs_to_jiffies(timeout), NULL,
 			      NULL, name, pipe->ldev->dev);
diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c
index 25639fbfd374a..c7fa26f45780d 100644
--- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c
+++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c
@@ -472,8 +472,6 @@ static const struct drm_driver mtk_drm_driver = {
 
 	.dumb_create = mtk_drm_gem_dumb_create,
 
-	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
 	.gem_prime_import = mtk_drm_gem_prime_import,
 	.gem_prime_import_sg_table = mtk_gem_prime_import_sg_table,
 	.gem_prime_mmap = drm_gem_prime_mmap,
diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c
index ed1e0c650bb1a..41e901cdf0762 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_device.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_device.c
@@ -667,7 +667,8 @@ static void suspend_scheduler(struct msm_gpu *gpu)
 	 */
 	for (i = 0; i < gpu->nr_rings; i++) {
 		struct drm_gpu_scheduler *sched = &gpu->rb[i]->sched;
-		kthread_park(sched->thread);
+
+		drm_sched_wqueue_stop(sched);
 	}
 }
 
@@ -677,7 +678,8 @@ static void resume_scheduler(struct msm_gpu *gpu)
 
 	for (i = 0; i < gpu->nr_rings; i++) {
 		struct drm_gpu_scheduler *sched = &gpu->rb[i]->sched;
-		kthread_unpark(sched->thread);
+
+		drm_sched_wqueue_start(sched);
 	}
 }
 
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index f982a827be7ca..73539e6bea94a 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -1083,8 +1083,6 @@ static const struct drm_driver msm_driver = {
 	.lastclose          = drm_fb_helper_lastclose,
 	.dumb_create        = msm_gem_dumb_create,
 	.dumb_map_offset    = msm_gem_dumb_map_offset,
-	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
 	.gem_prime_import_sg_table = msm_gem_prime_import_sg_table,
 	.gem_prime_mmap     = msm_gem_prime_mmap,
 #ifdef CONFIG_DEBUG_FS
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
index c12a6ac2d3840..3a8b9098b39ae 100644
--- a/drivers/gpu/drm/msm/msm_gem_submit.c
+++ b/drivers/gpu/drm/msm/msm_gem_submit.c
@@ -41,7 +41,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev,
 	if (!submit)
 		return ERR_PTR(-ENOMEM);
 
-	ret = drm_sched_job_init(&submit->base, queue->entity, queue);
+	ret = drm_sched_job_init(&submit->base, queue->entity, 1, queue);
 	if (ret) {
 		kfree(submit);
 		return ERR_PTR(ret);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index b39cd332751dc..175d7fa7954aa 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -337,7 +337,7 @@ struct msm_gpu_perfcntr {
  * DRM_SCHED_PRIORITY_KERNEL priority level is treated specially in some
  * cases, so we don't use it (no need for kernel generated jobs).
  */
-#define NR_SCHED_PRIORITIES (1 + DRM_SCHED_PRIORITY_HIGH - DRM_SCHED_PRIORITY_MIN)
+#define NR_SCHED_PRIORITIES (1 + DRM_SCHED_PRIORITY_LOW - DRM_SCHED_PRIORITY_HIGH)
 
 /**
  * struct msm_file_private - per-drm_file context
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c
index 57a8e9564540e..546bd6388b2eb 100644
--- a/drivers/gpu/drm/msm/msm_ringbuffer.c
+++ b/drivers/gpu/drm/msm/msm_ringbuffer.c
@@ -95,9 +95,10 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
 	 /* currently managing hangcheck ourselves: */
 	sched_timeout = MAX_SCHEDULE_TIMEOUT;
 
-	ret = drm_sched_init(&ring->sched, &msm_sched_ops,
-			num_hw_submissions, 0, sched_timeout,
-			NULL, NULL, to_msm_bo(ring->bo)->name, gpu->dev->dev);
+	ret = drm_sched_init(&ring->sched, &msm_sched_ops, NULL,
+			     DRM_SCHED_PRIORITY_COUNT,
+			     num_hw_submissions, 0, sched_timeout,
+			     NULL, NULL, to_msm_bo(ring->bo)->name, gpu->dev->dev);
 	if (ret) {
 		goto fail;
 	}
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index 28062d682f436..3b01e86e71a12 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -1260,8 +1260,6 @@ driver_stub = {
 	.num_ioctls = ARRAY_SIZE(nouveau_ioctls),
 	.fops = &nouveau_driver_fops,
 
-	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
 	.gem_prime_import_sg_table = nouveau_gem_prime_import_sg_table,
 	.gem_prime_mmap = drm_gem_prime_mmap,
 
diff --git a/drivers/gpu/drm/omapdrm/omap_drv.c b/drivers/gpu/drm/omapdrm/omap_drv.c
index eaf67b9e5f12b..f75d69d901126 100644
--- a/drivers/gpu/drm/omapdrm/omap_drv.c
+++ b/drivers/gpu/drm/omapdrm/omap_drv.c
@@ -694,8 +694,6 @@ static const struct drm_driver omap_drm_driver = {
 #ifdef CONFIG_DEBUG_FS
 	.debugfs_init = omap_debugfs_init,
 #endif
-	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
 	.gem_prime_import = omap_gem_prime_import,
 	.dumb_create = omap_gem_dumb_create,
 	.dumb_map_offset = omap_gem_dumb_map_offset,
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
index 919e6cc049828..bdb2c424640bb 100644
--- a/drivers/gpu/drm/panfrost/panfrost_drv.c
+++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
@@ -278,7 +278,7 @@ static int panfrost_ioctl_submit(struct drm_device *dev, void *data,
 
 	ret = drm_sched_job_init(&job->base,
 				 &file_priv->sched_entity[slot],
-				 NULL);
+				 1, NULL);
 	if (ret)
 		goto out_put_job;
 
@@ -414,6 +414,10 @@ static int panfrost_ioctl_madvise(struct drm_device *dev, void *data,
 
 	bo = to_panfrost_bo(gem_obj);
 
+	ret = dma_resv_lock_interruptible(bo->base.base.resv, NULL);
+	if (ret)
+		goto out_put_object;
+
 	mutex_lock(&pfdev->shrinker_lock);
 	mutex_lock(&bo->mappings.lock);
 	if (args->madv == PANFROST_MADV_DONTNEED) {
@@ -451,7 +455,8 @@ static int panfrost_ioctl_madvise(struct drm_device *dev, void *data,
 out_unlock_mappings:
 	mutex_unlock(&bo->mappings.lock);
 	mutex_unlock(&pfdev->shrinker_lock);
-
+	dma_resv_unlock(bo->base.base.resv);
+out_put_object:
 	drm_gem_object_put(gem_obj);
 	return ret;
 }
@@ -546,8 +551,6 @@ static const struct drm_driver panfrost_drm_driver = {
 	.minor			= 2,
 
 	.gem_create_object	= panfrost_gem_create_object,
-	.prime_handle_to_fd	= drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle	= drm_gem_prime_fd_to_handle,
 	.gem_prime_import_sg_table = panfrost_gem_prime_import_sg_table,
 	.gem_prime_mmap		= drm_gem_prime_mmap,
 };
diff --git a/drivers/gpu/drm/panfrost/panfrost_dump.c b/drivers/gpu/drm/panfrost/panfrost_dump.c
index 6bd0634e2d580..e7942ac449c68 100644
--- a/drivers/gpu/drm/panfrost/panfrost_dump.c
+++ b/drivers/gpu/drm/panfrost/panfrost_dump.c
@@ -209,7 +209,7 @@ void panfrost_core_dump(struct panfrost_job *job)
 			goto dump_header;
 		}
 
-		ret = drm_gem_shmem_vmap(&bo->base, &map);
+		ret = drm_gem_vmap_unlocked(&bo->base.base, &map);
 		if (ret) {
 			dev_err(pfdev->dev, "Panfrost Dump: couldn't map Buffer Object\n");
 			iter.hdr->bomap.valid = 0;
@@ -236,7 +236,7 @@ void panfrost_core_dump(struct panfrost_job *job)
 		vaddr = map.vaddr;
 		memcpy(iter.data, vaddr, bo->base.base.size);
 
-		drm_gem_shmem_vunmap(&bo->base, &map);
+		drm_gem_vunmap_unlocked(&bo->base.base, &map);
 
 		iter.hdr->bomap.valid = 1;
 
diff --git a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c
index bf0170782f258..6a71a2555f85c 100644
--- a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c
+++ b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c
@@ -48,14 +48,14 @@ static bool panfrost_gem_purge(struct drm_gem_object *obj)
 	if (!mutex_trylock(&bo->mappings.lock))
 		return false;
 
-	if (!mutex_trylock(&shmem->pages_lock))
+	if (!dma_resv_trylock(shmem->base.resv))
 		goto unlock_mappings;
 
 	panfrost_gem_teardown_mappings_locked(bo);
-	drm_gem_shmem_purge_locked(&bo->base);
+	drm_gem_shmem_purge(&bo->base);
 	ret = true;
 
-	mutex_unlock(&shmem->pages_lock);
+	dma_resv_unlock(shmem->base.resv);
 
 unlock_mappings:
 	mutex_unlock(&bo->mappings.lock);
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index dbc597ab46fb9..045524a7695ba 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -815,7 +815,8 @@ int panfrost_job_init(struct panfrost_device *pfdev)
 		js->queue[j].fence_context = dma_fence_context_alloc(1);
 
 		ret = drm_sched_init(&js->queue[j].sched,
-				     &panfrost_sched_ops,
+				     &panfrost_sched_ops, NULL,
+				     DRM_SCHED_PRIORITY_COUNT,
 				     nentries, 0,
 				     msecs_to_jiffies(JOB_TIMEOUT_MS),
 				     pfdev->reset.wq,
@@ -922,7 +923,7 @@ int panfrost_job_is_idle(struct panfrost_device *pfdev)
 
 	for (i = 0; i < NUM_JOB_SLOTS; i++) {
 		/* If there are any jobs in the HW queue, we're not idle */
-		if (atomic_read(&js->queue[i].sched.hw_rq_count))
+		if (atomic_read(&js->queue[i].sched.credit_count))
 			return false;
 	}
 
diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c
index e961fa27702ce..c0123d09f699c 100644
--- a/drivers/gpu/drm/panfrost/panfrost_mmu.c
+++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c
@@ -443,6 +443,7 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as,
 	struct panfrost_gem_mapping *bomapping;
 	struct panfrost_gem_object *bo;
 	struct address_space *mapping;
+	struct drm_gem_object *obj;
 	pgoff_t page_offset;
 	struct sg_table *sgt;
 	struct page **pages;
@@ -465,15 +466,16 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as,
 	page_offset = addr >> PAGE_SHIFT;
 	page_offset -= bomapping->mmnode.start;
 
-	mutex_lock(&bo->base.pages_lock);
+	obj = &bo->base.base;
+
+	dma_resv_lock(obj->resv, NULL);
 
 	if (!bo->base.pages) {
 		bo->sgts = kvmalloc_array(bo->base.base.size / SZ_2M,
 				     sizeof(struct sg_table), GFP_KERNEL | __GFP_ZERO);
 		if (!bo->sgts) {
-			mutex_unlock(&bo->base.pages_lock);
 			ret = -ENOMEM;
-			goto err_bo;
+			goto err_unlock;
 		}
 
 		pages = kvmalloc_array(bo->base.base.size >> PAGE_SHIFT,
@@ -481,9 +483,8 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as,
 		if (!pages) {
 			kvfree(bo->sgts);
 			bo->sgts = NULL;
-			mutex_unlock(&bo->base.pages_lock);
 			ret = -ENOMEM;
-			goto err_bo;
+			goto err_unlock;
 		}
 		bo->base.pages = pages;
 		bo->base.pages_use_count = 1;
@@ -491,7 +492,6 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as,
 		pages = bo->base.pages;
 		if (pages[page_offset]) {
 			/* Pages are already mapped, bail out. */
-			mutex_unlock(&bo->base.pages_lock);
 			goto out;
 		}
 	}
@@ -502,15 +502,12 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as,
 	for (i = page_offset; i < page_offset + NUM_FAULT_PAGES; i++) {
 		pages[i] = shmem_read_mapping_page(mapping, i);
 		if (IS_ERR(pages[i])) {
-			mutex_unlock(&bo->base.pages_lock);
 			ret = PTR_ERR(pages[i]);
 			pages[i] = NULL;
 			goto err_pages;
 		}
 	}
 
-	mutex_unlock(&bo->base.pages_lock);
-
 	sgt = &bo->sgts[page_offset / (SZ_2M / PAGE_SIZE)];
 	ret = sg_alloc_table_from_pages(sgt, pages + page_offset,
 					NUM_FAULT_PAGES, 0, SZ_2M, GFP_KERNEL);
@@ -529,6 +526,8 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as,
 	dev_dbg(pfdev->dev, "mapped page fault @ AS%d %llx", as, addr);
 
 out:
+	dma_resv_unlock(obj->resv);
+
 	panfrost_gem_mapping_put(bomapping);
 
 	return 0;
@@ -537,6 +536,8 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as,
 	sg_free_table(sgt);
 err_pages:
 	drm_gem_shmem_put_pages(&bo->base);
+err_unlock:
+	dma_resv_unlock(obj->resv);
 err_bo:
 	panfrost_gem_mapping_put(bomapping);
 	return ret;
diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.c b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
index bc0df93f7f215..ba9b6e2b26363 100644
--- a/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
+++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
@@ -106,7 +106,7 @@ static int panfrost_perfcnt_enable_locked(struct panfrost_device *pfdev,
 		goto err_close_bo;
 	}
 
-	ret = drm_gem_shmem_vmap(bo, &map);
+	ret = drm_gem_vmap_unlocked(&bo->base, &map);
 	if (ret)
 		goto err_put_mapping;
 	perfcnt->buf = map.vaddr;
@@ -165,7 +165,7 @@ static int panfrost_perfcnt_enable_locked(struct panfrost_device *pfdev,
 	return 0;
 
 err_vunmap:
-	drm_gem_shmem_vunmap(bo, &map);
+	drm_gem_vunmap_unlocked(&bo->base, &map);
 err_put_mapping:
 	panfrost_gem_mapping_put(perfcnt->mapping);
 err_close_bo:
@@ -195,7 +195,7 @@ static int panfrost_perfcnt_disable_locked(struct panfrost_device *pfdev,
 		  GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF));
 
 	perfcnt->user = NULL;
-	drm_gem_shmem_vunmap(&perfcnt->mapping->obj->base, &map);
+	drm_gem_vunmap_unlocked(&perfcnt->mapping->obj->base.base, &map);
 	perfcnt->buf = NULL;
 	panfrost_gem_close(&perfcnt->mapping->obj->base.base, file_priv);
 	panfrost_mmu_as_put(pfdev, perfcnt->mapping->mmu);
diff --git a/drivers/gpu/drm/panthor/Kconfig b/drivers/gpu/drm/panthor/Kconfig
new file mode 100644
index 0000000000000..55b40ad07f3b0
--- /dev/null
+++ b/drivers/gpu/drm/panthor/Kconfig
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: GPL-2.0 or MIT
+
+config DRM_PANTHOR
+	tristate "Panthor (DRM support for ARM Mali CSF-based GPUs)"
+	depends on DRM
+	depends on ARM || ARM64 || COMPILE_TEST
+	depends on !GENERIC_ATOMIC64  # for IOMMU_IO_PGTABLE_LPAE
+	depends on MMU
+	select DEVFREQ_GOV_SIMPLE_ONDEMAND
+	select DRM_EXEC
+	select DRM_GEM_SHMEM_HELPER
+	select DRM_GPUVM
+	select DRM_SCHED
+	select IOMMU_IO_PGTABLE_LPAE
+	select IOMMU_SUPPORT
+	select PM_DEVFREQ
+	help
+	  DRM driver for ARM Mali CSF-based GPUs.
+
+	  This driver is for Mali (or Immortalis) Valhall Gxxx GPUs.
+
+	  Note that the Mali-G68 and Mali-G78, while Valhall architecture, will
+	  be supported with the panfrost driver as they are not CSF GPUs.
diff --git a/drivers/gpu/drm/panthor/Makefile b/drivers/gpu/drm/panthor/Makefile
new file mode 100644
index 0000000000000..15294719b09cb
--- /dev/null
+++ b/drivers/gpu/drm/panthor/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0 or MIT
+
+panthor-y := \
+	panthor_devfreq.o \
+	panthor_device.o \
+	panthor_drv.o \
+	panthor_fw.o \
+	panthor_gem.o \
+	panthor_gpu.o \
+	panthor_heap.o \
+	panthor_mmu.o \
+	panthor_sched.o
+
+obj-$(CONFIG_DRM_PANTHOR) += panthor.o
diff --git a/drivers/gpu/drm/panthor/panthor_devfreq.c b/drivers/gpu/drm/panthor/panthor_devfreq.c
new file mode 100644
index 0000000000000..7ac4fa290f272
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_devfreq.c
@@ -0,0 +1,283 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+/* Copyright 2019 Collabora ltd. */
+
+#include <linux/clk.h>
+#include <linux/devfreq.h>
+#include <linux/devfreq_cooling.h>
+#include <linux/platform_device.h>
+#include <linux/pm_opp.h>
+
+#include <drm/drm_managed.h>
+
+#include "panthor_devfreq.h"
+#include "panthor_device.h"
+
+/**
+ * struct panthor_devfreq - Device frequency management
+ */
+struct panthor_devfreq {
+	/** @devfreq: devfreq device. */
+	struct devfreq *devfreq;
+
+	/** @gov_data: Governor data. */
+	struct devfreq_simple_ondemand_data gov_data;
+
+	/** @busy_time: Busy time. */
+	ktime_t busy_time;
+
+	/** @idle_time: Idle time. */
+	ktime_t idle_time;
+
+	/** @time_last_update: Last update time. */
+	ktime_t time_last_update;
+
+	/** @last_busy_state: True if the GPU was busy last time we updated the state. */
+	bool last_busy_state;
+
+	/*
+	 * @lock: Lock used to protect busy_time, idle_time, time_last_update and
+	 * last_busy_state.
+	 *
+	 * These fields can be accessed concurrently by panthor_devfreq_get_dev_status()
+	 * and panthor_devfreq_record_{busy,idle}().
+	 */
+	spinlock_t lock;
+};
+
+static void panthor_devfreq_update_utilization(struct panthor_devfreq *pdevfreq)
+{
+	ktime_t now, last;
+
+	now = ktime_get();
+	last = pdevfreq->time_last_update;
+
+	if (pdevfreq->last_busy_state)
+		pdevfreq->busy_time += ktime_sub(now, last);
+	else
+		pdevfreq->idle_time += ktime_sub(now, last);
+
+	pdevfreq->time_last_update = now;
+}
+
+static int panthor_devfreq_target(struct device *dev, unsigned long *freq,
+				  u32 flags)
+{
+	struct dev_pm_opp *opp;
+
+	opp = devfreq_recommended_opp(dev, freq, flags);
+	if (IS_ERR(opp))
+		return PTR_ERR(opp);
+	dev_pm_opp_put(opp);
+
+	return dev_pm_opp_set_rate(dev, *freq);
+}
+
+static void panthor_devfreq_reset(struct panthor_devfreq *pdevfreq)
+{
+	pdevfreq->busy_time = 0;
+	pdevfreq->idle_time = 0;
+	pdevfreq->time_last_update = ktime_get();
+}
+
+static int panthor_devfreq_get_dev_status(struct device *dev,
+					  struct devfreq_dev_status *status)
+{
+	struct panthor_device *ptdev = dev_get_drvdata(dev);
+	struct panthor_devfreq *pdevfreq = ptdev->devfreq;
+	unsigned long irqflags;
+
+	status->current_frequency = clk_get_rate(ptdev->clks.core);
+
+	spin_lock_irqsave(&pdevfreq->lock, irqflags);
+
+	panthor_devfreq_update_utilization(pdevfreq);
+
+	status->total_time = ktime_to_ns(ktime_add(pdevfreq->busy_time,
+						   pdevfreq->idle_time));
+
+	status->busy_time = ktime_to_ns(pdevfreq->busy_time);
+
+	panthor_devfreq_reset(pdevfreq);
+
+	spin_unlock_irqrestore(&pdevfreq->lock, irqflags);
+
+	drm_dbg(&ptdev->base, "busy %lu total %lu %lu %% freq %lu MHz\n",
+		status->busy_time, status->total_time,
+		status->busy_time / (status->total_time / 100),
+		status->current_frequency / 1000 / 1000);
+
+	return 0;
+}
+
+static struct devfreq_dev_profile panthor_devfreq_profile = {
+	.timer = DEVFREQ_TIMER_DELAYED,
+	.polling_ms = 50, /* ~3 frames */
+	.target = panthor_devfreq_target,
+	.get_dev_status = panthor_devfreq_get_dev_status,
+};
+
+int panthor_devfreq_init(struct panthor_device *ptdev)
+{
+	/* There's actually 2 regulators (mali and sram), but the OPP core only
+	 * supports one.
+	 *
+	 * We assume the sram regulator is coupled with the mali one and let
+	 * the coupling logic deal with voltage updates.
+	 */
+	static const char * const reg_names[] = { "mali", NULL };
+	struct thermal_cooling_device *cooling;
+	struct device *dev = ptdev->base.dev;
+	struct panthor_devfreq *pdevfreq;
+	struct dev_pm_opp *opp;
+	unsigned long cur_freq;
+	int ret;
+
+	pdevfreq = drmm_kzalloc(&ptdev->base, sizeof(*ptdev->devfreq), GFP_KERNEL);
+	if (!pdevfreq)
+		return -ENOMEM;
+
+	ptdev->devfreq = pdevfreq;
+
+	ret = devm_pm_opp_set_regulators(dev, reg_names);
+	if (ret) {
+		if (ret != -EPROBE_DEFER)
+			DRM_DEV_ERROR(dev, "Couldn't set OPP regulators\n");
+
+		return ret;
+	}
+
+	ret = devm_pm_opp_of_add_table(dev);
+	if (ret)
+		return ret;
+
+	spin_lock_init(&pdevfreq->lock);
+
+	panthor_devfreq_reset(pdevfreq);
+
+	cur_freq = clk_get_rate(ptdev->clks.core);
+
+	opp = devfreq_recommended_opp(dev, &cur_freq, 0);
+	if (IS_ERR(opp))
+		return PTR_ERR(opp);
+
+	panthor_devfreq_profile.initial_freq = cur_freq;
+
+	/* Regulator coupling only takes care of synchronizing/balancing voltage
+	 * updates, but the coupled regulator needs to be enabled manually.
+	 *
+	 * We use devm_regulator_get_enable_optional() and keep the sram supply
+	 * enabled until the device is removed, just like we do for the mali
+	 * supply, which is enabled when dev_pm_opp_set_opp(dev, opp) is called,
+	 * and disabled when the opp_table is torn down, using the devm action.
+	 *
+	 * If we really care about disabling regulators on suspend, we should:
+	 * - use devm_regulator_get_optional() here
+	 * - call dev_pm_opp_set_opp(dev, NULL) before leaving this function
+	 *   (this disables the regulator passed to the OPP layer)
+	 * - call dev_pm_opp_set_opp(dev, NULL) and
+	 *   regulator_disable(ptdev->regulators.sram) in
+	 *   panthor_devfreq_suspend()
+	 * - call dev_pm_opp_set_opp(dev, default_opp) and
+	 *   regulator_enable(ptdev->regulators.sram) in
+	 *   panthor_devfreq_resume()
+	 *
+	 * But without knowing if it's beneficial or not (in term of power
+	 * consumption), or how much it slows down the suspend/resume steps,
+	 * let's just keep regulators enabled for the device lifetime.
+	 */
+	ret = devm_regulator_get_enable_optional(dev, "sram");
+	if (ret && ret != -ENODEV) {
+		if (ret != -EPROBE_DEFER)
+			DRM_DEV_ERROR(dev, "Couldn't retrieve/enable sram supply\n");
+		return ret;
+	}
+
+	/*
+	 * Set the recommend OPP this will enable and configure the regulator
+	 * if any and will avoid a switch off by regulator_late_cleanup()
+	 */
+	ret = dev_pm_opp_set_opp(dev, opp);
+	if (ret) {
+		DRM_DEV_ERROR(dev, "Couldn't set recommended OPP\n");
+		return ret;
+	}
+
+	dev_pm_opp_put(opp);
+
+	/*
+	 * Setup default thresholds for the simple_ondemand governor.
+	 * The values are chosen based on experiments.
+	 */
+	pdevfreq->gov_data.upthreshold = 45;
+	pdevfreq->gov_data.downdifferential = 5;
+
+	pdevfreq->devfreq = devm_devfreq_add_device(dev, &panthor_devfreq_profile,
+						    DEVFREQ_GOV_SIMPLE_ONDEMAND,
+						    &pdevfreq->gov_data);
+	if (IS_ERR(pdevfreq->devfreq)) {
+		DRM_DEV_ERROR(dev, "Couldn't initialize GPU devfreq\n");
+		ret = PTR_ERR(pdevfreq->devfreq);
+		pdevfreq->devfreq = NULL;
+		return ret;
+	}
+
+	cooling = devfreq_cooling_em_register(pdevfreq->devfreq, NULL);
+	if (IS_ERR(cooling))
+		DRM_DEV_INFO(dev, "Failed to register cooling device\n");
+
+	return 0;
+}
+
+int panthor_devfreq_resume(struct panthor_device *ptdev)
+{
+	struct panthor_devfreq *pdevfreq = ptdev->devfreq;
+
+	if (!pdevfreq->devfreq)
+		return 0;
+
+	panthor_devfreq_reset(pdevfreq);
+
+	return devfreq_resume_device(pdevfreq->devfreq);
+}
+
+int panthor_devfreq_suspend(struct panthor_device *ptdev)
+{
+	struct panthor_devfreq *pdevfreq = ptdev->devfreq;
+
+	if (!pdevfreq->devfreq)
+		return 0;
+
+	return devfreq_suspend_device(pdevfreq->devfreq);
+}
+
+void panthor_devfreq_record_busy(struct panthor_device *ptdev)
+{
+	struct panthor_devfreq *pdevfreq = ptdev->devfreq;
+	unsigned long irqflags;
+
+	if (!pdevfreq->devfreq)
+		return;
+
+	spin_lock_irqsave(&pdevfreq->lock, irqflags);
+
+	panthor_devfreq_update_utilization(pdevfreq);
+	pdevfreq->last_busy_state = true;
+
+	spin_unlock_irqrestore(&pdevfreq->lock, irqflags);
+}
+
+void panthor_devfreq_record_idle(struct panthor_device *ptdev)
+{
+	struct panthor_devfreq *pdevfreq = ptdev->devfreq;
+	unsigned long irqflags;
+
+	if (!pdevfreq->devfreq)
+		return;
+
+	spin_lock_irqsave(&pdevfreq->lock, irqflags);
+
+	panthor_devfreq_update_utilization(pdevfreq);
+	pdevfreq->last_busy_state = false;
+
+	spin_unlock_irqrestore(&pdevfreq->lock, irqflags);
+}
diff --git a/drivers/gpu/drm/panthor/panthor_devfreq.h b/drivers/gpu/drm/panthor/panthor_devfreq.h
new file mode 100644
index 0000000000000..83a5c95224934
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_devfreq.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 or MIT */
+/* Copyright 2019 Collabora ltd. */
+
+#ifndef __PANTHOR_DEVFREQ_H__
+#define __PANTHOR_DEVFREQ_H__
+
+struct devfreq;
+struct thermal_cooling_device;
+
+struct panthor_device;
+struct panthor_devfreq;
+
+int panthor_devfreq_init(struct panthor_device *ptdev);
+
+int panthor_devfreq_resume(struct panthor_device *ptdev);
+int panthor_devfreq_suspend(struct panthor_device *ptdev);
+
+void panthor_devfreq_record_busy(struct panthor_device *ptdev);
+void panthor_devfreq_record_idle(struct panthor_device *ptdev);
+
+#endif /* __PANTHOR_DEVFREQ_H__ */
diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
new file mode 100644
index 0000000000000..6fb77f2eb429c
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_device.c
@@ -0,0 +1,548 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+/* Copyright 2018 Marty E. Plummer <hanetzer@startmail.com> */
+/* Copyright 2019 Linaro, Ltd, Rob Herring <robh@kernel.org> */
+/* Copyright 2023 Collabora ltd. */
+
+#include <linux/clk.h>
+#include <linux/platform_device.h>
+#include <linux/pm_domain.h>
+#include <linux/pm_runtime.h>
+#include <linux/regulator/consumer.h>
+#include <linux/reset.h>
+
+#include <drm/drm_drv.h>
+#include <drm/drm_managed.h>
+
+#include "panthor_devfreq.h"
+#include "panthor_device.h"
+#include "panthor_fw.h"
+#include "panthor_gpu.h"
+#include "panthor_mmu.h"
+#include "panthor_regs.h"
+#include "panthor_sched.h"
+
+static int panthor_clk_init(struct panthor_device *ptdev)
+{
+	ptdev->clks.core = devm_clk_get(ptdev->base.dev, NULL);
+	if (IS_ERR(ptdev->clks.core))
+		return dev_err_probe(ptdev->base.dev,
+				     PTR_ERR(ptdev->clks.core),
+				     "get 'core' clock failed");
+
+	ptdev->clks.stacks = devm_clk_get_optional(ptdev->base.dev, "stacks");
+	if (IS_ERR(ptdev->clks.stacks))
+		return dev_err_probe(ptdev->base.dev,
+				     PTR_ERR(ptdev->clks.stacks),
+				     "get 'stacks' clock failed");
+
+	ptdev->clks.coregroup = devm_clk_get_optional(ptdev->base.dev, "coregroup");
+	if (IS_ERR(ptdev->clks.coregroup))
+		return dev_err_probe(ptdev->base.dev,
+				     PTR_ERR(ptdev->clks.coregroup),
+				     "get 'coregroup' clock failed");
+
+	drm_info(&ptdev->base, "clock rate = %lu\n", clk_get_rate(ptdev->clks.core));
+	return 0;
+}
+
+void panthor_device_unplug(struct panthor_device *ptdev)
+{
+	/* This function can be called from two different path: the reset work
+	 * and the platform device remove callback. drm_dev_unplug() doesn't
+	 * deal with concurrent callers, so we have to protect drm_dev_unplug()
+	 * calls with our own lock, and bail out if the device is already
+	 * unplugged.
+	 */
+	mutex_lock(&ptdev->unplug.lock);
+	if (drm_dev_is_unplugged(&ptdev->base)) {
+		/* Someone beat us, release the lock and wait for the unplug
+		 * operation to be reported as done.
+		 **/
+		mutex_unlock(&ptdev->unplug.lock);
+		wait_for_completion(&ptdev->unplug.done);
+		return;
+	}
+
+	/* Call drm_dev_unplug() so any access to HW blocks happening after
+	 * that point get rejected.
+	 */
+	drm_dev_unplug(&ptdev->base);
+
+	/* We do the rest of the unplug with the unplug lock released,
+	 * future callers will wait on ptdev->unplug.done anyway.
+	 */
+	mutex_unlock(&ptdev->unplug.lock);
+
+	drm_WARN_ON(&ptdev->base, pm_runtime_get_sync(ptdev->base.dev) < 0);
+
+	/* Now, try to cleanly shutdown the GPU before the device resources
+	 * get reclaimed.
+	 */
+	panthor_sched_unplug(ptdev);
+	panthor_fw_unplug(ptdev);
+	panthor_mmu_unplug(ptdev);
+	panthor_gpu_unplug(ptdev);
+
+	pm_runtime_dont_use_autosuspend(ptdev->base.dev);
+	pm_runtime_put_sync_suspend(ptdev->base.dev);
+
+	/* Report the unplug operation as done to unblock concurrent
+	 * panthor_device_unplug() callers.
+	 */
+	complete_all(&ptdev->unplug.done);
+}
+
+static void panthor_device_reset_cleanup(struct drm_device *ddev, void *data)
+{
+	struct panthor_device *ptdev = container_of(ddev, struct panthor_device, base);
+
+	cancel_work_sync(&ptdev->reset.work);
+	destroy_workqueue(ptdev->reset.wq);
+}
+
+static void panthor_device_reset_work(struct work_struct *work)
+{
+	struct panthor_device *ptdev = container_of(work, struct panthor_device, reset.work);
+	int ret = 0, cookie;
+
+	if (atomic_read(&ptdev->pm.state) != PANTHOR_DEVICE_PM_STATE_ACTIVE) {
+		/*
+		 * No need for a reset as the device has been (or will be)
+		 * powered down
+		 */
+		atomic_set(&ptdev->reset.pending, 0);
+		return;
+	}
+
+	if (!drm_dev_enter(&ptdev->base, &cookie))
+		return;
+
+	panthor_sched_pre_reset(ptdev);
+	panthor_fw_pre_reset(ptdev, true);
+	panthor_mmu_pre_reset(ptdev);
+	panthor_gpu_soft_reset(ptdev);
+	panthor_gpu_l2_power_on(ptdev);
+	panthor_mmu_post_reset(ptdev);
+	ret = panthor_fw_post_reset(ptdev);
+	if (ret)
+		goto out_dev_exit;
+
+	atomic_set(&ptdev->reset.pending, 0);
+	panthor_sched_post_reset(ptdev);
+
+out_dev_exit:
+	drm_dev_exit(cookie);
+
+	if (ret) {
+		panthor_device_unplug(ptdev);
+		drm_err(&ptdev->base, "Failed to boot MCU after reset, making device unusable.");
+	}
+}
+
+static bool panthor_device_is_initialized(struct panthor_device *ptdev)
+{
+	return !!ptdev->scheduler;
+}
+
+static void panthor_device_free_page(struct drm_device *ddev, void *data)
+{
+	free_page((unsigned long)data);
+}
+
+int panthor_device_init(struct panthor_device *ptdev)
+{
+	struct resource *res;
+	struct page *p;
+	int ret;
+
+	ptdev->coherent = device_get_dma_attr(ptdev->base.dev) == DEV_DMA_COHERENT;
+
+	init_completion(&ptdev->unplug.done);
+	ret = drmm_mutex_init(&ptdev->base, &ptdev->unplug.lock);
+	if (ret)
+		return ret;
+
+	ret = drmm_mutex_init(&ptdev->base, &ptdev->pm.mmio_lock);
+	if (ret)
+		return ret;
+
+	atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_SUSPENDED);
+	p = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!p)
+		return -ENOMEM;
+
+	ptdev->pm.dummy_latest_flush = page_address(p);
+	ret = drmm_add_action_or_reset(&ptdev->base, panthor_device_free_page,
+				       ptdev->pm.dummy_latest_flush);
+	if (ret)
+		return ret;
+
+	/*
+	 * Set the dummy page holding the latest flush to 1. This will cause the
+	 * flush to avoided as we know it isn't necessary if the submission
+	 * happens while the dummy page is mapped. Zero cannot be used because
+	 * that means 'always flush'.
+	 */
+	*ptdev->pm.dummy_latest_flush = 1;
+
+	INIT_WORK(&ptdev->reset.work, panthor_device_reset_work);
+	ptdev->reset.wq = alloc_ordered_workqueue("panthor-reset-wq", 0);
+	if (!ptdev->reset.wq)
+		return -ENOMEM;
+
+	ret = drmm_add_action_or_reset(&ptdev->base, panthor_device_reset_cleanup, NULL);
+	if (ret)
+		return ret;
+
+	ret = panthor_clk_init(ptdev);
+	if (ret)
+		return ret;
+
+	ret = panthor_devfreq_init(ptdev);
+	if (ret)
+		return ret;
+
+	ptdev->iomem = devm_platform_get_and_ioremap_resource(to_platform_device(ptdev->base.dev),
+							      0, &res);
+	if (IS_ERR(ptdev->iomem))
+		return PTR_ERR(ptdev->iomem);
+
+	ptdev->phys_addr = res->start;
+
+	ret = devm_pm_runtime_enable(ptdev->base.dev);
+	if (ret)
+		return ret;
+
+	ret = pm_runtime_resume_and_get(ptdev->base.dev);
+	if (ret)
+		return ret;
+
+	ret = panthor_gpu_init(ptdev);
+	if (ret)
+		goto err_rpm_put;
+
+	ret = panthor_mmu_init(ptdev);
+	if (ret)
+		goto err_unplug_gpu;
+
+	ret = panthor_fw_init(ptdev);
+	if (ret)
+		goto err_unplug_mmu;
+
+	ret = panthor_sched_init(ptdev);
+	if (ret)
+		goto err_unplug_fw;
+
+	/* ~3 frames */
+	pm_runtime_set_autosuspend_delay(ptdev->base.dev, 50);
+	pm_runtime_use_autosuspend(ptdev->base.dev);
+
+	ret = drm_dev_register(&ptdev->base, 0);
+	if (ret)
+		goto err_disable_autosuspend;
+
+	pm_runtime_put_autosuspend(ptdev->base.dev);
+	return 0;
+
+err_disable_autosuspend:
+	pm_runtime_dont_use_autosuspend(ptdev->base.dev);
+	panthor_sched_unplug(ptdev);
+
+err_unplug_fw:
+	panthor_fw_unplug(ptdev);
+
+err_unplug_mmu:
+	panthor_mmu_unplug(ptdev);
+
+err_unplug_gpu:
+	panthor_gpu_unplug(ptdev);
+
+err_rpm_put:
+	pm_runtime_put_sync_suspend(ptdev->base.dev);
+	return ret;
+}
+
+#define PANTHOR_EXCEPTION(id) \
+	[DRM_PANTHOR_EXCEPTION_ ## id] = { \
+		.name = #id, \
+	}
+
+struct panthor_exception_info {
+	const char *name;
+};
+
+static const struct panthor_exception_info panthor_exception_infos[] = {
+	PANTHOR_EXCEPTION(OK),
+	PANTHOR_EXCEPTION(TERMINATED),
+	PANTHOR_EXCEPTION(KABOOM),
+	PANTHOR_EXCEPTION(EUREKA),
+	PANTHOR_EXCEPTION(ACTIVE),
+	PANTHOR_EXCEPTION(CS_RES_TERM),
+	PANTHOR_EXCEPTION(CS_CONFIG_FAULT),
+	PANTHOR_EXCEPTION(CS_ENDPOINT_FAULT),
+	PANTHOR_EXCEPTION(CS_BUS_FAULT),
+	PANTHOR_EXCEPTION(CS_INSTR_INVALID),
+	PANTHOR_EXCEPTION(CS_CALL_STACK_OVERFLOW),
+	PANTHOR_EXCEPTION(CS_INHERIT_FAULT),
+	PANTHOR_EXCEPTION(INSTR_INVALID_PC),
+	PANTHOR_EXCEPTION(INSTR_INVALID_ENC),
+	PANTHOR_EXCEPTION(INSTR_BARRIER_FAULT),
+	PANTHOR_EXCEPTION(DATA_INVALID_FAULT),
+	PANTHOR_EXCEPTION(TILE_RANGE_FAULT),
+	PANTHOR_EXCEPTION(ADDR_RANGE_FAULT),
+	PANTHOR_EXCEPTION(IMPRECISE_FAULT),
+	PANTHOR_EXCEPTION(OOM),
+	PANTHOR_EXCEPTION(CSF_FW_INTERNAL_ERROR),
+	PANTHOR_EXCEPTION(CSF_RES_EVICTION_TIMEOUT),
+	PANTHOR_EXCEPTION(GPU_BUS_FAULT),
+	PANTHOR_EXCEPTION(GPU_SHAREABILITY_FAULT),
+	PANTHOR_EXCEPTION(SYS_SHAREABILITY_FAULT),
+	PANTHOR_EXCEPTION(GPU_CACHEABILITY_FAULT),
+	PANTHOR_EXCEPTION(TRANSLATION_FAULT_0),
+	PANTHOR_EXCEPTION(TRANSLATION_FAULT_1),
+	PANTHOR_EXCEPTION(TRANSLATION_FAULT_2),
+	PANTHOR_EXCEPTION(TRANSLATION_FAULT_3),
+	PANTHOR_EXCEPTION(TRANSLATION_FAULT_4),
+	PANTHOR_EXCEPTION(PERM_FAULT_0),
+	PANTHOR_EXCEPTION(PERM_FAULT_1),
+	PANTHOR_EXCEPTION(PERM_FAULT_2),
+	PANTHOR_EXCEPTION(PERM_FAULT_3),
+	PANTHOR_EXCEPTION(ACCESS_FLAG_1),
+	PANTHOR_EXCEPTION(ACCESS_FLAG_2),
+	PANTHOR_EXCEPTION(ACCESS_FLAG_3),
+	PANTHOR_EXCEPTION(ADDR_SIZE_FAULT_IN),
+	PANTHOR_EXCEPTION(ADDR_SIZE_FAULT_OUT0),
+	PANTHOR_EXCEPTION(ADDR_SIZE_FAULT_OUT1),
+	PANTHOR_EXCEPTION(ADDR_SIZE_FAULT_OUT2),
+	PANTHOR_EXCEPTION(ADDR_SIZE_FAULT_OUT3),
+	PANTHOR_EXCEPTION(MEM_ATTR_FAULT_0),
+	PANTHOR_EXCEPTION(MEM_ATTR_FAULT_1),
+	PANTHOR_EXCEPTION(MEM_ATTR_FAULT_2),
+	PANTHOR_EXCEPTION(MEM_ATTR_FAULT_3),
+};
+
+const char *panthor_exception_name(struct panthor_device *ptdev, u32 exception_code)
+{
+	if (exception_code >= ARRAY_SIZE(panthor_exception_infos) ||
+	    !panthor_exception_infos[exception_code].name)
+		return "Unknown exception type";
+
+	return panthor_exception_infos[exception_code].name;
+}
+
+static vm_fault_t panthor_mmio_vm_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct panthor_device *ptdev = vma->vm_private_data;
+	u64 offset = (u64)vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long pfn;
+	pgprot_t pgprot;
+	vm_fault_t ret;
+	bool active;
+	int cookie;
+
+	if (!drm_dev_enter(&ptdev->base, &cookie))
+		return VM_FAULT_SIGBUS;
+
+	mutex_lock(&ptdev->pm.mmio_lock);
+	active = atomic_read(&ptdev->pm.state) == PANTHOR_DEVICE_PM_STATE_ACTIVE;
+
+	switch (offset) {
+	case DRM_PANTHOR_USER_FLUSH_ID_MMIO_OFFSET:
+		if (active)
+			pfn = __phys_to_pfn(ptdev->phys_addr + CSF_GPU_LATEST_FLUSH_ID);
+		else
+			pfn = virt_to_pfn(ptdev->pm.dummy_latest_flush);
+		break;
+
+	default:
+		ret = VM_FAULT_SIGBUS;
+		goto out_unlock;
+	}
+
+	pgprot = vma->vm_page_prot;
+	if (active)
+		pgprot = pgprot_noncached(pgprot);
+
+	ret = vmf_insert_pfn_prot(vma, vmf->address, pfn, pgprot);
+
+out_unlock:
+	mutex_unlock(&ptdev->pm.mmio_lock);
+	drm_dev_exit(cookie);
+	return ret;
+}
+
+static const struct vm_operations_struct panthor_mmio_vm_ops = {
+	.fault = panthor_mmio_vm_fault,
+};
+
+int panthor_device_mmap_io(struct panthor_device *ptdev, struct vm_area_struct *vma)
+{
+	u64 offset = (u64)vma->vm_pgoff << PAGE_SHIFT;
+
+	switch (offset) {
+	case DRM_PANTHOR_USER_FLUSH_ID_MMIO_OFFSET:
+		if (vma->vm_end - vma->vm_start != PAGE_SIZE ||
+		    (vma->vm_flags & (VM_WRITE | VM_EXEC)))
+			return -EINVAL;
+
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	/* Defer actual mapping to the fault handler. */
+	vma->vm_private_data = ptdev;
+	vma->vm_ops = &panthor_mmio_vm_ops;
+	vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND |
+            VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP;
+	return 0;
+}
+
+#ifdef CONFIG_PM
+int panthor_device_resume(struct device *dev)
+{
+	struct panthor_device *ptdev = dev_get_drvdata(dev);
+	int ret, cookie;
+
+	if (atomic_read(&ptdev->pm.state) != PANTHOR_DEVICE_PM_STATE_SUSPENDED)
+		return -EINVAL;
+
+	atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_RESUMING);
+
+	ret = clk_prepare_enable(ptdev->clks.core);
+	if (ret)
+		goto err_set_suspended;
+
+	ret = clk_prepare_enable(ptdev->clks.stacks);
+	if (ret)
+		goto err_disable_core_clk;
+
+	ret = clk_prepare_enable(ptdev->clks.coregroup);
+	if (ret)
+		goto err_disable_stacks_clk;
+
+	ret = panthor_devfreq_resume(ptdev);
+	if (ret)
+		goto err_disable_coregroup_clk;
+
+	if (panthor_device_is_initialized(ptdev) &&
+	    drm_dev_enter(&ptdev->base, &cookie)) {
+		panthor_gpu_resume(ptdev);
+		panthor_mmu_resume(ptdev);
+		ret = drm_WARN_ON(&ptdev->base, panthor_fw_resume(ptdev));
+		if (!ret) {
+			panthor_sched_resume(ptdev);
+		} else {
+			panthor_mmu_suspend(ptdev);
+			panthor_gpu_suspend(ptdev);
+		}
+
+		drm_dev_exit(cookie);
+
+		if (ret)
+			goto err_suspend_devfreq;
+	}
+
+	if (atomic_read(&ptdev->reset.pending))
+		queue_work(ptdev->reset.wq, &ptdev->reset.work);
+
+	/* Clear all IOMEM mappings pointing to this device after we've
+	 * resumed. This way the fake mappings pointing to the dummy pages
+	 * are removed and the real iomem mapping will be restored on next
+	 * access.
+	 */
+	mutex_lock(&ptdev->pm.mmio_lock);
+	unmap_mapping_range(ptdev->base.anon_inode->i_mapping,
+			    DRM_PANTHOR_USER_MMIO_OFFSET, 0, 1);
+	atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_ACTIVE);
+	mutex_unlock(&ptdev->pm.mmio_lock);
+	return 0;
+
+err_suspend_devfreq:
+	panthor_devfreq_suspend(ptdev);
+
+err_disable_coregroup_clk:
+	clk_disable_unprepare(ptdev->clks.coregroup);
+
+err_disable_stacks_clk:
+	clk_disable_unprepare(ptdev->clks.stacks);
+
+err_disable_core_clk:
+	clk_disable_unprepare(ptdev->clks.core);
+
+err_set_suspended:
+	atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_SUSPENDED);
+	return ret;
+}
+
+int panthor_device_suspend(struct device *dev)
+{
+	struct panthor_device *ptdev = dev_get_drvdata(dev);
+	int ret, cookie;
+
+	if (atomic_read(&ptdev->pm.state) != PANTHOR_DEVICE_PM_STATE_ACTIVE)
+		return -EINVAL;
+
+	/* Clear all IOMEM mappings pointing to this device before we
+	 * shutdown the power-domain and clocks. Failing to do that results
+	 * in external aborts when the process accesses the iomem region.
+	 * We change the state and call unmap_mapping_range() with the
+	 * mmio_lock held to make sure the vm_fault handler won't set up
+	 * invalid mappings.
+	 */
+	mutex_lock(&ptdev->pm.mmio_lock);
+	atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_SUSPENDING);
+	unmap_mapping_range(ptdev->base.anon_inode->i_mapping,
+			    DRM_PANTHOR_USER_MMIO_OFFSET, 0, 1);
+	mutex_unlock(&ptdev->pm.mmio_lock);
+
+	if (panthor_device_is_initialized(ptdev) &&
+	    drm_dev_enter(&ptdev->base, &cookie)) {
+		cancel_work_sync(&ptdev->reset.work);
+
+		/* We prepare everything as if we were resetting the GPU.
+		 * The end of the reset will happen in the resume path though.
+		 */
+		panthor_sched_suspend(ptdev);
+		panthor_fw_suspend(ptdev);
+		panthor_mmu_suspend(ptdev);
+		panthor_gpu_suspend(ptdev);
+		drm_dev_exit(cookie);
+	}
+
+	ret = panthor_devfreq_suspend(ptdev);
+	if (ret) {
+		if (panthor_device_is_initialized(ptdev) &&
+		    drm_dev_enter(&ptdev->base, &cookie)) {
+			panthor_gpu_resume(ptdev);
+			panthor_mmu_resume(ptdev);
+			drm_WARN_ON(&ptdev->base, panthor_fw_resume(ptdev));
+			panthor_sched_resume(ptdev);
+			drm_dev_exit(cookie);
+		}
+
+		goto err_set_active;
+	}
+
+	clk_disable_unprepare(ptdev->clks.coregroup);
+	clk_disable_unprepare(ptdev->clks.stacks);
+	clk_disable_unprepare(ptdev->clks.core);
+	atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_SUSPENDED);
+	return 0;
+
+err_set_active:
+	/* If something failed and we have to revert back to an
+	 * active state, we also need to clear the MMIO userspace
+	 * mappings, so any dumb pages that were mapped while we
+	 * were trying to suspend gets invalidated.
+	 */
+	mutex_lock(&ptdev->pm.mmio_lock);
+	atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_ACTIVE);
+	unmap_mapping_range(ptdev->base.anon_inode->i_mapping,
+			    DRM_PANTHOR_USER_MMIO_OFFSET, 0, 1);
+	mutex_unlock(&ptdev->pm.mmio_lock);
+	return ret;
+}
+#endif
diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
new file mode 100644
index 0000000000000..99ddc41f26266
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_device.h
@@ -0,0 +1,361 @@
+/* SPDX-License-Identifier: GPL-2.0 or MIT */
+/* Copyright 2018 Marty E. Plummer <hanetzer@startmail.com> */
+/* Copyright 2019 Linaro, Ltd, Rob Herring <robh@kernel.org> */
+/* Copyright 2023 Collabora ltd. */
+
+#ifndef __PANTHOR_DEVICE_H__
+#define __PANTHOR_DEVICE_H__
+
+#include <linux/atomic.h>
+#include <linux/io-pgtable.h>
+#include <linux/regulator/consumer.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+
+#include <drm/drm_device.h>
+#include <drm/drm_mm.h>
+#include <drm/gpu_scheduler.h>
+#include <drm/panthor_drm.h>
+
+struct panthor_csf;
+struct panthor_csf_ctx;
+struct panthor_device;
+struct panthor_gpu;
+struct panthor_group_pool;
+struct panthor_heap_pool;
+struct panthor_job;
+struct panthor_mmu;
+struct panthor_fw;
+struct panthor_perfcnt;
+struct panthor_vm;
+struct panthor_vm_pool;
+
+/**
+ * enum panthor_device_pm_state - PM state
+ */
+enum panthor_device_pm_state {
+	/** @PANTHOR_DEVICE_PM_STATE_SUSPENDED: Device is suspended. */
+	PANTHOR_DEVICE_PM_STATE_SUSPENDED = 0,
+
+	/** @PANTHOR_DEVICE_PM_STATE_RESUMING: Device is being resumed. */
+	PANTHOR_DEVICE_PM_STATE_RESUMING,
+
+	/** @PANTHOR_DEVICE_PM_STATE_ACTIVE: Device is active. */
+	PANTHOR_DEVICE_PM_STATE_ACTIVE,
+
+	/** @PANTHOR_DEVICE_PM_STATE_SUSPENDING: Device is being suspended. */
+	PANTHOR_DEVICE_PM_STATE_SUSPENDING,
+};
+
+/**
+ * struct panthor_irq - IRQ data
+ *
+ * Used to automate IRQ handling for the 3 different IRQs we have in this driver.
+ */
+struct panthor_irq {
+	/** @ptdev: Panthor device */
+	struct panthor_device *ptdev;
+
+	/** @irq: IRQ number. */
+	int irq;
+
+	/** @mask: Current mask being applied to xxx_INT_MASK. */
+	u32 mask;
+
+	/** @suspended: Set to true when the IRQ is suspended. */
+	atomic_t suspended;
+};
+
+/**
+ * struct panthor_device - Panthor device
+ */
+struct panthor_device {
+	/** @base: Base drm_device. */
+	struct drm_device base;
+
+	/** @phys_addr: Physical address of the iomem region. */
+	phys_addr_t phys_addr;
+
+	/** @iomem: CPU mapping of the IOMEM region. */
+	void __iomem *iomem;
+
+	/** @clks: GPU clocks. */
+	struct {
+		/** @core: Core clock. */
+		struct clk *core;
+
+		/** @stacks: Stacks clock. This clock is optional. */
+		struct clk *stacks;
+
+		/** @coregroup: Core group clock. This clock is optional. */
+		struct clk *coregroup;
+	} clks;
+
+	/** @coherent: True if the CPU/GPU are memory coherent. */
+	bool coherent;
+
+	/** @gpu_info: GPU information. */
+	struct drm_panthor_gpu_info gpu_info;
+
+	/** @csif_info: Command stream interface information. */
+	struct drm_panthor_csif_info csif_info;
+
+	/** @gpu: GPU management data. */
+	struct panthor_gpu *gpu;
+
+	/** @fw: FW management data. */
+	struct panthor_fw *fw;
+
+	/** @mmu: MMU management data. */
+	struct panthor_mmu *mmu;
+
+	/** @scheduler: Scheduler management data. */
+	struct panthor_scheduler *scheduler;
+
+	/** @devfreq: Device frequency scaling management data. */
+	struct panthor_devfreq *devfreq;
+
+	/** @unplug: Device unplug related fields. */
+	struct {
+		/** @lock: Lock used to serialize unplug operations. */
+		struct mutex lock;
+
+		/**
+		 * @done: Completion object signaled when the unplug
+		 * operation is done.
+		 */
+		struct completion done;
+	} unplug;
+
+	/** @reset: Reset related fields. */
+	struct {
+		/** @wq: Ordered worqueud used to schedule reset operations. */
+		struct workqueue_struct *wq;
+
+		/** @work: Reset work. */
+		struct work_struct work;
+
+		/** @pending: Set to true if a reset is pending. */
+		atomic_t pending;
+	} reset;
+
+	/** @pm: Power management related data. */
+	struct {
+		/** @state: Power state. */
+		atomic_t state;
+
+		/**
+		 * @mmio_lock: Lock protecting MMIO userspace CPU mappings.
+		 *
+		 * This is needed to ensure we map the dummy IO pages when
+		 * the device is being suspended, and the real IO pages when
+		 * the device is being resumed. We can't just do with the
+		 * state atomicity to deal with this race.
+		 */
+		struct mutex mmio_lock;
+
+		/**
+		 * @dummy_latest_flush: Dummy LATEST_FLUSH page.
+		 *
+		 * Used to replace the real LATEST_FLUSH page when the GPU
+		 * is suspended.
+		 */
+		u32 *dummy_latest_flush;
+	} pm;
+};
+
+/**
+ * struct panthor_file - Panthor file
+ */
+struct panthor_file {
+	/** @ptdev: Device attached to this file. */
+	struct panthor_device *ptdev;
+
+	/** @vms: VM pool attached to this file. */
+	struct panthor_vm_pool *vms;
+
+	/** @groups: Scheduling group pool attached to this file. */
+	struct panthor_group_pool *groups;
+};
+
+int panthor_device_init(struct panthor_device *ptdev);
+void panthor_device_unplug(struct panthor_device *ptdev);
+
+/**
+ * panthor_device_schedule_reset() - Schedules a reset operation
+ */
+static inline void panthor_device_schedule_reset(struct panthor_device *ptdev)
+{
+	if (!atomic_cmpxchg(&ptdev->reset.pending, 0, 1) &&
+	    atomic_read(&ptdev->pm.state) == PANTHOR_DEVICE_PM_STATE_ACTIVE)
+		queue_work(ptdev->reset.wq, &ptdev->reset.work);
+}
+
+/**
+ * panthor_device_reset_is_pending() - Checks if a reset is pending.
+ *
+ * Return: true if a reset is pending, false otherwise.
+ */
+static inline bool panthor_device_reset_is_pending(struct panthor_device *ptdev)
+{
+	return atomic_read(&ptdev->reset.pending) != 0;
+}
+
+int panthor_device_mmap_io(struct panthor_device *ptdev,
+			   struct vm_area_struct *vma);
+
+int panthor_device_resume(struct device *dev);
+int panthor_device_suspend(struct device *dev);
+
+enum drm_panthor_exception_type {
+	DRM_PANTHOR_EXCEPTION_OK = 0x00,
+	DRM_PANTHOR_EXCEPTION_TERMINATED = 0x04,
+	DRM_PANTHOR_EXCEPTION_KABOOM = 0x05,
+	DRM_PANTHOR_EXCEPTION_EUREKA = 0x06,
+	DRM_PANTHOR_EXCEPTION_ACTIVE = 0x08,
+	DRM_PANTHOR_EXCEPTION_CS_RES_TERM = 0x0f,
+	DRM_PANTHOR_EXCEPTION_MAX_NON_FAULT = 0x3f,
+	DRM_PANTHOR_EXCEPTION_CS_CONFIG_FAULT = 0x40,
+	DRM_PANTHOR_EXCEPTION_CS_ENDPOINT_FAULT = 0x44,
+	DRM_PANTHOR_EXCEPTION_CS_BUS_FAULT = 0x48,
+	DRM_PANTHOR_EXCEPTION_CS_INSTR_INVALID = 0x49,
+	DRM_PANTHOR_EXCEPTION_CS_CALL_STACK_OVERFLOW = 0x4a,
+	DRM_PANTHOR_EXCEPTION_CS_INHERIT_FAULT = 0x4b,
+	DRM_PANTHOR_EXCEPTION_INSTR_INVALID_PC = 0x50,
+	DRM_PANTHOR_EXCEPTION_INSTR_INVALID_ENC = 0x51,
+	DRM_PANTHOR_EXCEPTION_INSTR_BARRIER_FAULT = 0x55,
+	DRM_PANTHOR_EXCEPTION_DATA_INVALID_FAULT = 0x58,
+	DRM_PANTHOR_EXCEPTION_TILE_RANGE_FAULT = 0x59,
+	DRM_PANTHOR_EXCEPTION_ADDR_RANGE_FAULT = 0x5a,
+	DRM_PANTHOR_EXCEPTION_IMPRECISE_FAULT = 0x5b,
+	DRM_PANTHOR_EXCEPTION_OOM = 0x60,
+	DRM_PANTHOR_EXCEPTION_CSF_FW_INTERNAL_ERROR = 0x68,
+	DRM_PANTHOR_EXCEPTION_CSF_RES_EVICTION_TIMEOUT = 0x69,
+	DRM_PANTHOR_EXCEPTION_GPU_BUS_FAULT = 0x80,
+	DRM_PANTHOR_EXCEPTION_GPU_SHAREABILITY_FAULT = 0x88,
+	DRM_PANTHOR_EXCEPTION_SYS_SHAREABILITY_FAULT = 0x89,
+	DRM_PANTHOR_EXCEPTION_GPU_CACHEABILITY_FAULT = 0x8a,
+	DRM_PANTHOR_EXCEPTION_TRANSLATION_FAULT_0 = 0xc0,
+	DRM_PANTHOR_EXCEPTION_TRANSLATION_FAULT_1 = 0xc1,
+	DRM_PANTHOR_EXCEPTION_TRANSLATION_FAULT_2 = 0xc2,
+	DRM_PANTHOR_EXCEPTION_TRANSLATION_FAULT_3 = 0xc3,
+	DRM_PANTHOR_EXCEPTION_TRANSLATION_FAULT_4 = 0xc4,
+	DRM_PANTHOR_EXCEPTION_PERM_FAULT_0 = 0xc8,
+	DRM_PANTHOR_EXCEPTION_PERM_FAULT_1 = 0xc9,
+	DRM_PANTHOR_EXCEPTION_PERM_FAULT_2 = 0xca,
+	DRM_PANTHOR_EXCEPTION_PERM_FAULT_3 = 0xcb,
+	DRM_PANTHOR_EXCEPTION_ACCESS_FLAG_1 = 0xd9,
+	DRM_PANTHOR_EXCEPTION_ACCESS_FLAG_2 = 0xda,
+	DRM_PANTHOR_EXCEPTION_ACCESS_FLAG_3 = 0xdb,
+	DRM_PANTHOR_EXCEPTION_ADDR_SIZE_FAULT_IN = 0xe0,
+	DRM_PANTHOR_EXCEPTION_ADDR_SIZE_FAULT_OUT0 = 0xe4,
+	DRM_PANTHOR_EXCEPTION_ADDR_SIZE_FAULT_OUT1 = 0xe5,
+	DRM_PANTHOR_EXCEPTION_ADDR_SIZE_FAULT_OUT2 = 0xe6,
+	DRM_PANTHOR_EXCEPTION_ADDR_SIZE_FAULT_OUT3 = 0xe7,
+	DRM_PANTHOR_EXCEPTION_MEM_ATTR_FAULT_0 = 0xe8,
+	DRM_PANTHOR_EXCEPTION_MEM_ATTR_FAULT_1 = 0xe9,
+	DRM_PANTHOR_EXCEPTION_MEM_ATTR_FAULT_2 = 0xea,
+	DRM_PANTHOR_EXCEPTION_MEM_ATTR_FAULT_3 = 0xeb,
+};
+
+/**
+ * panthor_exception_is_fault() - Checks if an exception is a fault.
+ *
+ * Return: true if the exception is a fault, false otherwise.
+ */
+static inline bool
+panthor_exception_is_fault(u32 exception_code)
+{
+	return exception_code > DRM_PANTHOR_EXCEPTION_MAX_NON_FAULT;
+}
+
+const char *panthor_exception_name(struct panthor_device *ptdev,
+				   u32 exception_code);
+
+/**
+ * PANTHOR_IRQ_HANDLER() - Define interrupt handlers and the interrupt
+ * registration function.
+ *
+ * The boiler-plate to gracefully deal with shared interrupts is
+ * auto-generated. All you have to do is call PANTHOR_IRQ_HANDLER()
+ * just after the actual handler. The handler prototype is:
+ *
+ * void (*handler)(struct panthor_device *, u32 status);
+ */
+#define PANTHOR_IRQ_HANDLER(__name, __reg_prefix, __handler)					\
+static irqreturn_t panthor_ ## __name ## _irq_raw_handler(int irq, void *data)			\
+{												\
+	struct panthor_irq *pirq = data;							\
+	struct panthor_device *ptdev = pirq->ptdev;						\
+												\
+	if (atomic_read(&pirq->suspended))							\
+		return IRQ_NONE;								\
+	if (!gpu_read(ptdev, __reg_prefix ## _INT_STAT))					\
+		return IRQ_NONE;								\
+												\
+	gpu_write(ptdev, __reg_prefix ## _INT_MASK, 0);						\
+	return IRQ_WAKE_THREAD;									\
+}												\
+												\
+static irqreturn_t panthor_ ## __name ## _irq_threaded_handler(int irq, void *data)		\
+{												\
+	struct panthor_irq *pirq = data;							\
+	struct panthor_device *ptdev = pirq->ptdev;						\
+	irqreturn_t ret = IRQ_NONE;								\
+												\
+	while (true) {										\
+		u32 status = gpu_read(ptdev, __reg_prefix ## _INT_RAWSTAT) & pirq->mask;	\
+												\
+		if (!status)									\
+			break;									\
+												\
+		gpu_write(ptdev, __reg_prefix ## _INT_CLEAR, status);				\
+												\
+		__handler(ptdev, status);							\
+		ret = IRQ_HANDLED;								\
+	}											\
+												\
+	if (!atomic_read(&pirq->suspended))							\
+		gpu_write(ptdev, __reg_prefix ## _INT_MASK, pirq->mask);			\
+												\
+	return ret;										\
+}												\
+												\
+static inline void panthor_ ## __name ## _irq_suspend(struct panthor_irq *pirq)			\
+{												\
+	int cookie;										\
+												\
+	pirq->mask = 0;										\
+	gpu_write(pirq->ptdev, __reg_prefix ## _INT_MASK, 0);					\
+	synchronize_irq(pirq->irq);								\
+	atomic_set(&pirq->suspended, true);							\
+}												\
+												\
+static inline void panthor_ ## __name ## _irq_resume(struct panthor_irq *pirq, u32 mask)	\
+{												\
+	int cookie;										\
+												\
+	atomic_set(&pirq->suspended, false);							\
+	pirq->mask = mask;									\
+	gpu_write(pirq->ptdev, __reg_prefix ## _INT_CLEAR, mask);				\
+	gpu_write(pirq->ptdev, __reg_prefix ## _INT_MASK, mask);				\
+}												\
+												\
+static int panthor_request_ ## __name ## _irq(struct panthor_device *ptdev,			\
+					      struct panthor_irq *pirq,				\
+					      int irq, u32 mask)				\
+{												\
+	pirq->ptdev = ptdev;									\
+	pirq->irq = irq;									\
+	panthor_ ## __name ## _irq_resume(pirq, mask);						\
+												\
+	return devm_request_threaded_irq(ptdev->base.dev, irq,					\
+					 panthor_ ## __name ## _irq_raw_handler,		\
+					 panthor_ ## __name ## _irq_threaded_handler,		\
+					 IRQF_SHARED, KBUILD_MODNAME "-" # __name,		\
+					 pirq);							\
+}
+
+extern struct workqueue_struct *panthor_cleanup_wq;
+
+#endif
diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
new file mode 100644
index 0000000000000..00ef22d8a0ba3
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_drv.c
@@ -0,0 +1,1477 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+/* Copyright 2018 Marty E. Plummer <hanetzer@startmail.com> */
+/* Copyright 2019 Linaro, Ltd., Rob Herring <robh@kernel.org> */
+/* Copyright 2019 Collabora ltd. */
+
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/of_platform.h>
+#include <linux/pagemap.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+
+#include <drm/drm_drv.h>
+#include <drm/drm_exec.h>
+#include <drm/drm_ioctl.h>
+#include <drm/drm_syncobj.h>
+#include <drm/drm_utils.h>
+#include <drm/gpu_scheduler.h>
+#include <drm/panthor_drm.h>
+
+#include "panthor_device.h"
+#include "panthor_fw.h"
+#include "panthor_gem.h"
+#include "panthor_gpu.h"
+#include "panthor_heap.h"
+#include "panthor_mmu.h"
+#include "panthor_regs.h"
+#include "panthor_sched.h"
+
+/**
+ * DOC: user <-> kernel object copy helpers.
+ */
+
+/**
+ * panthor_set_uobj() - Copy kernel object to user object.
+ * @usr_ptr: Users pointer.
+ * @usr_size: Size of the user object.
+ * @min_size: Minimum size for this object.
+ * @kern_size: Size of the kernel object.
+ * @in: Address of the kernel object to copy.
+ *
+ * Helper automating kernel -> user object copies.
+ *
+ * Don't use this function directly, use PANTHOR_UOBJ_SET() instead.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int
+panthor_set_uobj(u64 usr_ptr, u32 usr_size, u32 min_size, u32 kern_size, const void *in)
+{
+	/* User size shouldn't be smaller than the minimal object size. */
+	if (usr_size < min_size)
+		return -EINVAL;
+
+	if (copy_to_user(u64_to_user_ptr(usr_ptr), in, min_t(u32, usr_size, kern_size)))
+		return -EFAULT;
+
+	/* When the kernel object is smaller than the user object, we fill the gap with
+	 * zeros.
+	 */
+	if (usr_size > kern_size &&
+	    clear_user(u64_to_user_ptr(usr_ptr + kern_size), usr_size - kern_size)) {
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+/**
+ * panthor_get_uobj_array() - Copy a user object array into a kernel accessible object array.
+ * @in: The object array to copy.
+ * @min_stride: Minimum array stride.
+ * @obj_size: Kernel object size.
+ *
+ * Helper automating user -> kernel object copies.
+ *
+ * Don't use this function directly, use PANTHOR_UOBJ_GET_ARRAY() instead.
+ *
+ * Return: newly allocated object array or an ERR_PTR on error.
+ */
+static void *
+panthor_get_uobj_array(const struct drm_panthor_obj_array *in, u32 min_stride,
+		       u32 obj_size)
+{
+	int ret = 0;
+	void *out_alloc;
+
+	/* User stride must be at least the minimum object size, otherwise it might
+	 * lack useful information.
+	 */
+	if (in->stride < min_stride)
+		return ERR_PTR(-EINVAL);
+
+	if (!in->count)
+		return NULL;
+
+	out_alloc = kvmalloc_array(in->count, obj_size, GFP_KERNEL);
+	if (!out_alloc)
+		return ERR_PTR(-ENOMEM);
+
+	if (obj_size == in->stride) {
+		/* Fast path when user/kernel have the same uAPI header version. */
+		if (copy_from_user(out_alloc, u64_to_user_ptr(in->array),
+				   (unsigned long)obj_size * in->count))
+			ret = -EFAULT;
+	} else {
+		void __user *in_ptr = u64_to_user_ptr(in->array);
+		void *out_ptr = out_alloc;
+
+		/* If the sizes differ, we need to copy elements one by one. */
+		for (u32 i = 0; i < in->count; i++) {
+			ret = copy_struct_from_user(out_ptr, obj_size, in_ptr, in->stride);
+			if (ret)
+				break;
+
+			out_ptr += obj_size;
+			in_ptr += in->stride;
+		}
+	}
+
+	if (ret) {
+		kvfree(out_alloc);
+		return ERR_PTR(ret);
+	}
+
+	return out_alloc;
+}
+
+/**
+ * PANTHOR_UOBJ_MIN_SIZE_INTERNAL() - Get the minimum user object size
+ * @_typename: Object type.
+ * @_last_mandatory_field: Last mandatory field.
+ *
+ * Get the minimum user object size based on the last mandatory field name,
+ * A.K.A, the name of the last field of the structure at the time this
+ * structure was added to the uAPI.
+ *
+ * Don't use directly, use PANTHOR_UOBJ_DECL() instead.
+ */
+#define PANTHOR_UOBJ_MIN_SIZE_INTERNAL(_typename, _last_mandatory_field) \
+	(offsetof(_typename, _last_mandatory_field) + \
+	 sizeof(((_typename *)NULL)->_last_mandatory_field))
+
+/**
+ * PANTHOR_UOBJ_DECL() - Declare a new uAPI object whose subject to
+ * evolutions.
+ * @_typename: Object type.
+ * @_last_mandatory_field: Last mandatory field.
+ *
+ * Should be used to extend the PANTHOR_UOBJ_MIN_SIZE() list.
+ */
+#define PANTHOR_UOBJ_DECL(_typename, _last_mandatory_field) \
+	_typename : PANTHOR_UOBJ_MIN_SIZE_INTERNAL(_typename, _last_mandatory_field)
+
+/**
+ * PANTHOR_UOBJ_MIN_SIZE() - Get the minimum size of a given uAPI object
+ * @_obj_name: Object to get the minimum size of.
+ *
+ * Don't use this macro directly, it's automatically called by
+ * PANTHOR_UOBJ_{SET,GET_ARRAY}().
+ */
+#define PANTHOR_UOBJ_MIN_SIZE(_obj_name) \
+	_Generic(_obj_name, \
+		 PANTHOR_UOBJ_DECL(struct drm_panthor_gpu_info, tiler_present), \
+		 PANTHOR_UOBJ_DECL(struct drm_panthor_csif_info, pad), \
+		 PANTHOR_UOBJ_DECL(struct drm_panthor_sync_op, timeline_value), \
+		 PANTHOR_UOBJ_DECL(struct drm_panthor_queue_submit, syncs), \
+		 PANTHOR_UOBJ_DECL(struct drm_panthor_queue_create, ringbuf_size), \
+		 PANTHOR_UOBJ_DECL(struct drm_panthor_vm_bind_op, syncs))
+
+/**
+ * PANTHOR_UOBJ_SET() - Copy a kernel object to a user object.
+ * @_dest_usr_ptr: User pointer to copy to.
+ * @_usr_size: Size of the user object.
+ * @_src_obj: Kernel object to copy (not a pointer).
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+#define PANTHOR_UOBJ_SET(_dest_usr_ptr, _usr_size, _src_obj) \
+	panthor_set_uobj(_dest_usr_ptr, _usr_size, \
+			 PANTHOR_UOBJ_MIN_SIZE(_src_obj), \
+			 sizeof(_src_obj), &(_src_obj))
+
+/**
+ * PANTHOR_UOBJ_GET_ARRAY() - Copy a user object array to a kernel accessible
+ * object array.
+ * @_dest_array: Local variable that will hold the newly allocated kernel
+ * object array.
+ * @_uobj_array: The drm_panthor_obj_array object describing the user object
+ * array.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+#define PANTHOR_UOBJ_GET_ARRAY(_dest_array, _uobj_array) \
+	({ \
+		typeof(_dest_array) _tmp; \
+		_tmp = panthor_get_uobj_array(_uobj_array, \
+					      PANTHOR_UOBJ_MIN_SIZE((_dest_array)[0]), \
+					      sizeof((_dest_array)[0])); \
+		if (!IS_ERR(_tmp)) \
+			_dest_array = _tmp; \
+		PTR_ERR_OR_ZERO(_tmp); \
+	})
+
+/**
+ * struct panthor_sync_signal - Represent a synchronization object point to attach
+ * our job fence to.
+ *
+ * This structure is here to keep track of fences that are currently bound to
+ * a specific syncobj point.
+ *
+ * At the beginning of a job submission, the fence
+ * is retrieved from the syncobj itself, and can be NULL if no fence was attached
+ * to this point.
+ *
+ * At the end, it points to the fence of the last job that had a
+ * %DRM_PANTHOR_SYNC_OP_SIGNAL on this syncobj.
+ *
+ * With jobs being submitted in batches, the fence might change several times during
+ * the process, allowing one job to wait on a job that's part of the same submission
+ * but appears earlier in the drm_panthor_group_submit::queue_submits array.
+ */
+struct panthor_sync_signal {
+	/** @node: list_head to track signal ops within a submit operation */
+	struct list_head node;
+
+	/** @handle: The syncobj handle. */
+	u32 handle;
+
+	/**
+	 * @point: The syncobj point.
+	 *
+	 * Zero for regular syncobjs, and non-zero for timeline syncobjs.
+	 */
+	u64 point;
+
+	/**
+	 * @syncobj: The sync object pointed by @handle.
+	 */
+	struct drm_syncobj *syncobj;
+
+	/**
+	 * @chain: Chain object used to link the new fence to an existing
+	 * timeline syncobj.
+	 *
+	 * NULL for regular syncobj, non-NULL for timeline syncobjs.
+	 */
+	struct dma_fence_chain *chain;
+
+	/**
+	 * @fence: The fence to assign to the syncobj or syncobj-point.
+	 */
+	struct dma_fence *fence;
+};
+
+/**
+ * struct panthor_job_ctx - Job context
+ */
+struct panthor_job_ctx {
+	/** @job: The job that is about to be submitted to drm_sched. */
+	struct drm_sched_job *job;
+
+	/** @syncops: Array of sync operations. */
+	struct drm_panthor_sync_op *syncops;
+
+	/** @syncop_count: Number of sync operations. */
+	u32 syncop_count;
+};
+
+/**
+ * struct panthor_submit_ctx - Submission context
+ *
+ * Anything that's related to a submission (%DRM_IOCTL_PANTHOR_VM_BIND or
+ * %DRM_IOCTL_PANTHOR_GROUP_SUBMIT) is kept here, so we can automate the
+ * initialization and cleanup steps.
+ */
+struct panthor_submit_ctx {
+	/** @file: DRM file this submission happens on. */
+	struct drm_file *file;
+
+	/**
+	 * @signals: List of struct panthor_sync_signal.
+	 *
+	 * %DRM_PANTHOR_SYNC_OP_SIGNAL operations will be recorded here,
+	 * and %DRM_PANTHOR_SYNC_OP_WAIT will first check if an entry
+	 * matching the syncobj+point exists before calling
+	 * drm_syncobj_find_fence(). This allows us to describe dependencies
+	 * existing between jobs that are part of the same batch.
+	 */
+	struct list_head signals;
+
+	/** @jobs: Array of jobs. */
+	struct panthor_job_ctx *jobs;
+
+	/** @job_count: Number of entries in the @jobs array. */
+	u32 job_count;
+
+	/** @exec: drm_exec context used to acquire and prepare resv objects. */
+	struct drm_exec exec;
+};
+
+#define PANTHOR_SYNC_OP_FLAGS_MASK \
+	(DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_MASK | DRM_PANTHOR_SYNC_OP_SIGNAL)
+
+static bool sync_op_is_signal(const struct drm_panthor_sync_op *sync_op)
+{
+	return !!(sync_op->flags & DRM_PANTHOR_SYNC_OP_SIGNAL);
+}
+
+static bool sync_op_is_wait(const struct drm_panthor_sync_op *sync_op)
+{
+	/* Note that DRM_PANTHOR_SYNC_OP_WAIT == 0 */
+	return !(sync_op->flags & DRM_PANTHOR_SYNC_OP_SIGNAL);
+}
+
+/**
+ * panthor_check_sync_op() - Check drm_panthor_sync_op fields
+ * @sync_op: The sync operation to check.
+ *
+ * Return: 0 on success, -EINVAL otherwise.
+ */
+static int
+panthor_check_sync_op(const struct drm_panthor_sync_op *sync_op)
+{
+	u8 handle_type;
+
+	if (sync_op->flags & ~PANTHOR_SYNC_OP_FLAGS_MASK)
+		return -EINVAL;
+
+	handle_type = sync_op->flags & DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_MASK;
+	if (handle_type != DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ &&
+	    handle_type != DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ)
+		return -EINVAL;
+
+	if (handle_type == DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ &&
+	    sync_op->timeline_value != 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * panthor_sync_signal_free() - Release resources and free a panthor_sync_signal object
+ * @sig_sync: Signal object to free.
+ */
+static void
+panthor_sync_signal_free(struct panthor_sync_signal *sig_sync)
+{
+	if (!sig_sync)
+		return;
+
+	drm_syncobj_put(sig_sync->syncobj);
+	dma_fence_chain_free(sig_sync->chain);
+	dma_fence_put(sig_sync->fence);
+	kfree(sig_sync);
+}
+
+/**
+ * panthor_submit_ctx_add_sync_signal() - Add a signal operation to a submit context
+ * @ctx: Context to add the signal operation to.
+ * @handle: Syncobj handle.
+ * @point: Syncobj point.
+ *
+ * Return: 0 on success, otherwise negative error value.
+ */
+static int
+panthor_submit_ctx_add_sync_signal(struct panthor_submit_ctx *ctx, u32 handle, u64 point)
+{
+	struct panthor_sync_signal *sig_sync;
+	struct dma_fence *cur_fence;
+	int ret;
+
+	sig_sync = kzalloc(sizeof(*sig_sync), GFP_KERNEL);
+	if (!sig_sync)
+		return -ENOMEM;
+
+	sig_sync->handle = handle;
+	sig_sync->point = point;
+
+	if (point > 0) {
+		sig_sync->chain = dma_fence_chain_alloc();
+		if (!sig_sync->chain) {
+			ret = -ENOMEM;
+			goto err_free_sig_sync;
+		}
+	}
+
+	sig_sync->syncobj = drm_syncobj_find(ctx->file, handle);
+	if (!sig_sync->syncobj) {
+		ret = -EINVAL;
+		goto err_free_sig_sync;
+	}
+
+	/* Retrieve the current fence attached to that point. It's
+	 * perfectly fine to get a NULL fence here, it just means there's
+	 * no fence attached to that point yet.
+	 */
+	if (!drm_syncobj_find_fence(ctx->file, handle, point, 0, &cur_fence))
+		sig_sync->fence = cur_fence;
+
+	list_add_tail(&sig_sync->node, &ctx->signals);
+
+	return 0;
+
+err_free_sig_sync:
+	panthor_sync_signal_free(sig_sync);
+	return ret;
+}
+
+/**
+ * panthor_submit_ctx_search_sync_signal() - Search an existing signal operation in a
+ * submit context.
+ * @ctx: Context to search the signal operation in.
+ * @handle: Syncobj handle.
+ * @point: Syncobj point.
+ *
+ * Return: A valid panthor_sync_signal object if found, NULL otherwise.
+ */
+static struct panthor_sync_signal *
+panthor_submit_ctx_search_sync_signal(struct panthor_submit_ctx *ctx, u32 handle, u64 point)
+{
+	struct panthor_sync_signal *sig_sync;
+
+	list_for_each_entry(sig_sync, &ctx->signals, node) {
+		if (handle == sig_sync->handle && point == sig_sync->point)
+			return sig_sync;
+	}
+
+	return NULL;
+}
+
+/**
+ * panthor_submit_ctx_add_job() - Add a job to a submit context
+ * @ctx: Context to search the signal operation in.
+ * @idx: Index of the job in the context.
+ * @job: Job to add.
+ * @syncs: Sync operations provided by userspace.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int
+panthor_submit_ctx_add_job(struct panthor_submit_ctx *ctx, u32 idx,
+			   struct drm_sched_job *job,
+			   const struct drm_panthor_obj_array *syncs)
+{
+	int ret;
+
+	ctx->jobs[idx].job = job;
+
+	ret = PANTHOR_UOBJ_GET_ARRAY(ctx->jobs[idx].syncops, syncs);
+	if (ret)
+		return ret;
+
+	ctx->jobs[idx].syncop_count = syncs->count;
+	return 0;
+}
+
+/**
+ * panthor_submit_ctx_get_sync_signal() - Search signal operation and add one if none was found.
+ * @ctx: Context to search the signal operation in.
+ * @handle: Syncobj handle.
+ * @point: Syncobj point.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int
+panthor_submit_ctx_get_sync_signal(struct panthor_submit_ctx *ctx, u32 handle, u64 point)
+{
+	struct panthor_sync_signal *sig_sync;
+
+	sig_sync = panthor_submit_ctx_search_sync_signal(ctx, handle, point);
+	if (sig_sync)
+		return 0;
+
+	return panthor_submit_ctx_add_sync_signal(ctx, handle, point);
+}
+
+/**
+ * panthor_submit_ctx_update_job_sync_signal_fences() - Update fences
+ * on the signal operations specified by a job.
+ * @ctx: Context to search the signal operation in.
+ * @job_idx: Index of the job to operate on.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int
+panthor_submit_ctx_update_job_sync_signal_fences(struct panthor_submit_ctx *ctx,
+						 u32 job_idx)
+{
+	struct panthor_device *ptdev = container_of(ctx->file->minor->dev,
+						    struct panthor_device,
+						    base);
+	struct dma_fence *done_fence = &ctx->jobs[job_idx].job->s_fence->finished;
+	const struct drm_panthor_sync_op *sync_ops = ctx->jobs[job_idx].syncops;
+	u32 sync_op_count = ctx->jobs[job_idx].syncop_count;
+
+	for (u32 i = 0; i < sync_op_count; i++) {
+		struct dma_fence *old_fence;
+		struct panthor_sync_signal *sig_sync;
+
+		if (!sync_op_is_signal(&sync_ops[i]))
+			continue;
+
+		sig_sync = panthor_submit_ctx_search_sync_signal(ctx, sync_ops[i].handle,
+								 sync_ops[i].timeline_value);
+		if (drm_WARN_ON(&ptdev->base, !sig_sync))
+			return -EINVAL;
+
+		old_fence = sig_sync->fence;
+		sig_sync->fence = dma_fence_get(done_fence);
+		dma_fence_put(old_fence);
+
+		if (drm_WARN_ON(&ptdev->base, !sig_sync->fence))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * panthor_submit_ctx_collect_job_signal_ops() - Iterate over all job signal operations
+ * and add them to the context.
+ * @ctx: Context to search the signal operation in.
+ * @job_idx: Index of the job to operate on.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int
+panthor_submit_ctx_collect_job_signal_ops(struct panthor_submit_ctx *ctx,
+					  u32 job_idx)
+{
+	const struct drm_panthor_sync_op *sync_ops = ctx->jobs[job_idx].syncops;
+	u32 sync_op_count = ctx->jobs[job_idx].syncop_count;
+
+	for (u32 i = 0; i < sync_op_count; i++) {
+		int ret;
+
+		if (!sync_op_is_signal(&sync_ops[i]))
+			continue;
+
+		ret = panthor_check_sync_op(&sync_ops[i]);
+		if (ret)
+			return ret;
+
+		ret = panthor_submit_ctx_get_sync_signal(ctx,
+							 sync_ops[i].handle,
+							 sync_ops[i].timeline_value);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * panthor_submit_ctx_push_fences() - Iterate over the signal array, and for each entry, push
+ * the currently assigned fence to the associated syncobj.
+ * @ctx: Context to push fences on.
+ *
+ * This is the last step of a submission procedure, and is done once we know the submission
+ * is effective and job fences are guaranteed to be signaled in finite time.
+ */
+static void
+panthor_submit_ctx_push_fences(struct panthor_submit_ctx *ctx)
+{
+	struct panthor_sync_signal *sig_sync;
+
+	list_for_each_entry(sig_sync, &ctx->signals, node) {
+		if (sig_sync->chain) {
+			drm_syncobj_add_point(sig_sync->syncobj, sig_sync->chain,
+					      sig_sync->fence, sig_sync->point);
+			sig_sync->chain = NULL;
+		} else {
+			drm_syncobj_replace_fence(sig_sync->syncobj, sig_sync->fence);
+		}
+	}
+}
+
+/**
+ * panthor_submit_ctx_add_sync_deps_to_job() - Add sync wait operations as
+ * job dependencies.
+ * @ctx: Submit context.
+ * @job_idx: Index of the job to operate on.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int
+panthor_submit_ctx_add_sync_deps_to_job(struct panthor_submit_ctx *ctx,
+					u32 job_idx)
+{
+	struct panthor_device *ptdev = container_of(ctx->file->minor->dev,
+						    struct panthor_device,
+						    base);
+	const struct drm_panthor_sync_op *sync_ops = ctx->jobs[job_idx].syncops;
+	struct drm_sched_job *job = ctx->jobs[job_idx].job;
+	u32 sync_op_count = ctx->jobs[job_idx].syncop_count;
+	int ret = 0;
+
+	for (u32 i = 0; i < sync_op_count; i++) {
+		struct panthor_sync_signal *sig_sync;
+		struct dma_fence *fence;
+
+		if (!sync_op_is_wait(&sync_ops[i]))
+			continue;
+
+		ret = panthor_check_sync_op(&sync_ops[i]);
+		if (ret)
+			return ret;
+
+		sig_sync = panthor_submit_ctx_search_sync_signal(ctx, sync_ops[i].handle,
+								 sync_ops[i].timeline_value);
+		if (sig_sync) {
+			if (drm_WARN_ON(&ptdev->base, !sig_sync->fence))
+				return -EINVAL;
+
+			fence = dma_fence_get(sig_sync->fence);
+		} else {
+			ret = drm_syncobj_find_fence(ctx->file, sync_ops[i].handle,
+						     sync_ops[i].timeline_value,
+						     0, &fence);
+			if (ret)
+				return ret;
+		}
+
+		ret = drm_sched_job_add_dependency(job, fence);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * panthor_submit_ctx_collect_jobs_signal_ops() - Collect all signal operations
+ * and add them to the submit context.
+ * @ctx: Submit context.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int
+panthor_submit_ctx_collect_jobs_signal_ops(struct panthor_submit_ctx *ctx)
+{
+	for (u32 i = 0; i < ctx->job_count; i++) {
+		int ret;
+
+		ret = panthor_submit_ctx_collect_job_signal_ops(ctx, i);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * panthor_submit_ctx_add_deps_and_arm_jobs() - Add jobs dependencies and arm jobs
+ * @ctx: Submit context.
+ *
+ * Must be called after the resv preparation has been taken care of.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int
+panthor_submit_ctx_add_deps_and_arm_jobs(struct panthor_submit_ctx *ctx)
+{
+	for (u32 i = 0; i < ctx->job_count; i++) {
+		int ret;
+
+		ret = panthor_submit_ctx_add_sync_deps_to_job(ctx, i);
+		if (ret)
+			return ret;
+
+		drm_sched_job_arm(ctx->jobs[i].job);
+
+		ret = panthor_submit_ctx_update_job_sync_signal_fences(ctx, i);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * panthor_submit_ctx_push_jobs() - Push jobs to their scheduling entities.
+ * @ctx: Submit context.
+ * @upd_resvs: Callback used to update reservation objects that were previously
+ * preapred.
+ */
+static void
+panthor_submit_ctx_push_jobs(struct panthor_submit_ctx *ctx,
+			     void (*upd_resvs)(struct drm_exec *, struct drm_sched_job *))
+{
+	for (u32 i = 0; i < ctx->job_count; i++) {
+		upd_resvs(&ctx->exec, ctx->jobs[i].job);
+		drm_sched_entity_push_job(ctx->jobs[i].job);
+
+		/* Job is owned by the scheduler now. */
+		ctx->jobs[i].job = NULL;
+	}
+
+	panthor_submit_ctx_push_fences(ctx);
+}
+
+/**
+ * panthor_submit_ctx_init() - Initializes a submission context
+ * @ctx: Submit context to initialize.
+ * @file: drm_file this submission happens on.
+ * @job_count: Number of jobs that will be submitted.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int panthor_submit_ctx_init(struct panthor_submit_ctx *ctx,
+				   struct drm_file *file, u32 job_count)
+{
+	ctx->jobs = kvmalloc_array(job_count, sizeof(*ctx->jobs),
+				   GFP_KERNEL | __GFP_ZERO);
+	if (!ctx->jobs)
+		return -ENOMEM;
+
+	ctx->file = file;
+	ctx->job_count = job_count;
+	INIT_LIST_HEAD(&ctx->signals);
+	drm_exec_init(&ctx->exec,
+		      DRM_EXEC_INTERRUPTIBLE_WAIT | DRM_EXEC_IGNORE_DUPLICATES,
+		      0);
+	return 0;
+}
+
+/**
+ * panthor_submit_ctx_cleanup() - Cleanup a submission context
+ * @ctx: Submit context to cleanup.
+ * @job_put: Job put callback.
+ */
+static void panthor_submit_ctx_cleanup(struct panthor_submit_ctx *ctx,
+				       void (*job_put)(struct drm_sched_job *))
+{
+	struct panthor_sync_signal *sig_sync, *tmp;
+	unsigned long i;
+
+	drm_exec_fini(&ctx->exec);
+
+	list_for_each_entry_safe(sig_sync, tmp, &ctx->signals, node)
+		panthor_sync_signal_free(sig_sync);
+
+	for (i = 0; i < ctx->job_count; i++) {
+		job_put(ctx->jobs[i].job);
+		kvfree(ctx->jobs[i].syncops);
+	}
+
+	kvfree(ctx->jobs);
+}
+
+static int panthor_ioctl_dev_query(struct drm_device *ddev, void *data, struct drm_file *file)
+{
+	struct panthor_device *ptdev = container_of(ddev, struct panthor_device, base);
+	struct drm_panthor_dev_query *args = data;
+
+	if (!args->pointer) {
+		switch (args->type) {
+		case DRM_PANTHOR_DEV_QUERY_GPU_INFO:
+			args->size = sizeof(ptdev->gpu_info);
+			return 0;
+
+		case DRM_PANTHOR_DEV_QUERY_CSIF_INFO:
+			args->size = sizeof(ptdev->csif_info);
+			return 0;
+
+		default:
+			return -EINVAL;
+		}
+	}
+
+	switch (args->type) {
+	case DRM_PANTHOR_DEV_QUERY_GPU_INFO:
+		return PANTHOR_UOBJ_SET(args->pointer, args->size, ptdev->gpu_info);
+
+	case DRM_PANTHOR_DEV_QUERY_CSIF_INFO:
+		return PANTHOR_UOBJ_SET(args->pointer, args->size, ptdev->csif_info);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+#define PANTHOR_VM_CREATE_FLAGS			0
+
+static int panthor_ioctl_vm_create(struct drm_device *ddev, void *data,
+				   struct drm_file *file)
+{
+	struct panthor_device *ptdev = container_of(ddev, struct panthor_device, base);
+	struct panthor_file *pfile = file->driver_priv;
+	struct drm_panthor_vm_create *args = data;
+	int cookie, ret;
+
+	if (!drm_dev_enter(ddev, &cookie))
+		return -ENODEV;
+
+	ret = panthor_vm_pool_create_vm(ptdev, pfile->vms,  args);
+	if (ret >= 0) {
+		args->id = ret;
+		ret = 0;
+	}
+
+	drm_dev_exit(cookie);
+	return ret;
+}
+
+static int panthor_ioctl_vm_destroy(struct drm_device *ddev, void *data,
+				    struct drm_file *file)
+{
+	struct panthor_file *pfile = file->driver_priv;
+	struct drm_panthor_vm_destroy *args = data;
+
+	if (args->pad)
+		return -EINVAL;
+
+	return panthor_vm_pool_destroy_vm(pfile->vms, args->id);
+}
+
+#define PANTHOR_BO_FLAGS		DRM_PANTHOR_BO_NO_MMAP
+
+static int panthor_ioctl_bo_create(struct drm_device *ddev, void *data,
+				   struct drm_file *file)
+{
+	struct panthor_file *pfile = file->driver_priv;
+	struct drm_panthor_bo_create *args = data;
+	struct panthor_vm *vm = NULL;
+	int cookie, ret;
+
+	if (!drm_dev_enter(ddev, &cookie))
+		return -ENODEV;
+
+	if (!args->size || args->pad ||
+	    (args->flags & ~PANTHOR_BO_FLAGS)) {
+		ret = -EINVAL;
+		goto out_dev_exit;
+	}
+
+	if (args->exclusive_vm_id) {
+		vm = panthor_vm_pool_get_vm(pfile->vms, args->exclusive_vm_id);
+		if (!vm) {
+			ret = -EINVAL;
+			goto out_dev_exit;
+		}
+	}
+
+	ret = panthor_gem_create_with_handle(file, ddev, vm, &args->size,
+					     args->flags, &args->handle);
+
+	panthor_vm_put(vm);
+
+out_dev_exit:
+	drm_dev_exit(cookie);
+	return ret;
+}
+
+static int panthor_ioctl_bo_mmap_offset(struct drm_device *ddev, void *data,
+					struct drm_file *file)
+{
+	struct drm_panthor_bo_mmap_offset *args = data;
+	struct drm_gem_object *obj;
+	int ret;
+
+	if (args->pad)
+		return -EINVAL;
+
+	obj = drm_gem_object_lookup(file, args->handle);
+	if (!obj)
+		return -ENOENT;
+
+	ret = drm_gem_create_mmap_offset(obj);
+	if (ret)
+		goto out;
+
+	args->offset = drm_vma_node_offset_addr(&obj->vma_node);
+
+out:
+	drm_gem_object_put(obj);
+	return ret;
+}
+
+static int panthor_ioctl_group_submit(struct drm_device *ddev, void *data,
+				      struct drm_file *file)
+{
+	struct panthor_file *pfile = file->driver_priv;
+	struct drm_panthor_group_submit *args = data;
+	struct drm_panthor_queue_submit *jobs_args;
+	struct panthor_submit_ctx ctx;
+	int ret = 0, cookie;
+
+	if (args->pad)
+		return -EINVAL;
+
+	if (!drm_dev_enter(ddev, &cookie))
+		return -ENODEV;
+
+	ret = PANTHOR_UOBJ_GET_ARRAY(jobs_args, &args->queue_submits);
+	if (ret)
+		goto out_dev_exit;
+
+	ret = panthor_submit_ctx_init(&ctx, file, args->queue_submits.count);
+	if (ret)
+		goto out_free_jobs_args;
+
+	/* Create jobs and attach sync operations */
+	for (u32 i = 0; i < args->queue_submits.count; i++) {
+		const struct drm_panthor_queue_submit *qsubmit = &jobs_args[i];
+		struct drm_sched_job *job;
+
+		job = panthor_job_create(pfile, args->group_handle, qsubmit);
+		if (IS_ERR(job)) {
+			ret = PTR_ERR(job);
+			goto out_cleanup_submit_ctx;
+		}
+
+		ret = panthor_submit_ctx_add_job(&ctx, i, job, &qsubmit->syncs);
+		if (ret)
+			goto out_cleanup_submit_ctx;
+	}
+
+	/*
+	 * Collect signal operations on all jobs, such that each job can pick
+	 * from it for its dependencies and update the fence to signal when the
+	 * job is submitted.
+	 */
+	ret = panthor_submit_ctx_collect_jobs_signal_ops(&ctx);
+	if (ret)
+		goto out_cleanup_submit_ctx;
+
+	/*
+	 * We acquire/prepare revs on all jobs before proceeding with the
+	 * dependency registration.
+	 *
+	 * This is solving two problems:
+	 * 1. drm_sched_job_arm() and drm_sched_entity_push_job() must be
+	 *    protected by a lock to make sure no concurrent access to the same
+	 *    entity get interleaved, which would mess up with the fence seqno
+	 *    ordering. Luckily, one of the resv being acquired is the VM resv,
+	 *    and a scheduling entity is only bound to a single VM. As soon as
+	 *    we acquire the VM resv, we should be safe.
+	 * 2. Jobs might depend on fences that were issued by previous jobs in
+	 *    the same batch, so we can't add dependencies on all jobs before
+	 *    arming previous jobs and registering the fence to the signal
+	 *    array, otherwise we might miss dependencies, or point to an
+	 *    outdated fence.
+	 */
+	if (args->queue_submits.count > 0) {
+		/* All jobs target the same group, so they also point to the same VM. */
+		struct panthor_vm *vm = panthor_job_vm(ctx.jobs[0].job);
+
+		drm_exec_until_all_locked(&ctx.exec) {
+			ret = panthor_vm_prepare_mapped_bos_resvs(&ctx.exec, vm,
+								  args->queue_submits.count);
+		}
+
+		if (ret)
+			goto out_cleanup_submit_ctx;
+	}
+
+	/*
+	 * Now that resvs are locked/prepared, we can iterate over each job to
+	 * add the dependencies, arm the job fence, register the job fence to
+	 * the signal array.
+	 */
+	ret = panthor_submit_ctx_add_deps_and_arm_jobs(&ctx);
+	if (ret)
+		goto out_cleanup_submit_ctx;
+
+	/* Nothing can fail after that point, so we can make our job fences
+	 * visible to the outside world. Push jobs and set the job fences to
+	 * the resv slots we reserved.  This also pushes the fences to the
+	 * syncobjs that are part of the signal array.
+	 */
+	panthor_submit_ctx_push_jobs(&ctx, panthor_job_update_resvs);
+
+out_cleanup_submit_ctx:
+	panthor_submit_ctx_cleanup(&ctx, panthor_job_put);
+
+out_free_jobs_args:
+	kvfree(jobs_args);
+
+out_dev_exit:
+	drm_dev_exit(cookie);
+	return ret;
+}
+
+static int panthor_ioctl_group_destroy(struct drm_device *ddev, void *data,
+				       struct drm_file *file)
+{
+	struct panthor_file *pfile = file->driver_priv;
+	struct drm_panthor_group_destroy *args = data;
+
+	if (args->pad)
+		return -EINVAL;
+
+	return panthor_group_destroy(pfile, args->group_handle);
+}
+
+static int panthor_ioctl_group_create(struct drm_device *ddev, void *data,
+				      struct drm_file *file)
+{
+	struct panthor_file *pfile = file->driver_priv;
+	struct drm_panthor_group_create *args = data;
+	struct drm_panthor_queue_create *queue_args;
+	int ret;
+
+	if (!args->queues.count)
+		return -EINVAL;
+
+	ret = PANTHOR_UOBJ_GET_ARRAY(queue_args, &args->queues);
+	if (ret)
+		return ret;
+
+	ret = panthor_group_create(pfile, args, queue_args);
+	if (ret >= 0) {
+		args->group_handle = ret;
+		ret = 0;
+	}
+
+	kvfree(queue_args);
+	return ret;
+}
+
+static int panthor_ioctl_group_get_state(struct drm_device *ddev, void *data,
+					 struct drm_file *file)
+{
+	struct panthor_file *pfile = file->driver_priv;
+	struct drm_panthor_group_get_state *args = data;
+
+	return panthor_group_get_state(pfile, args);
+}
+
+static int panthor_ioctl_tiler_heap_create(struct drm_device *ddev, void *data,
+					   struct drm_file *file)
+{
+	struct panthor_file *pfile = file->driver_priv;
+	struct drm_panthor_tiler_heap_create *args = data;
+	struct panthor_heap_pool *pool;
+	struct panthor_vm *vm;
+	int ret;
+
+	vm = panthor_vm_pool_get_vm(pfile->vms, args->vm_id);
+	if (!vm)
+		return -EINVAL;
+
+	pool = panthor_vm_get_heap_pool(vm, true);
+	if (IS_ERR(pool)) {
+		ret = PTR_ERR(pool);
+		goto out_put_vm;
+	}
+
+	ret = panthor_heap_create(pool,
+				  args->initial_chunk_count,
+				  args->chunk_size,
+				  args->max_chunks,
+				  args->target_in_flight,
+				  &args->tiler_heap_ctx_gpu_va,
+				  &args->first_heap_chunk_gpu_va);
+	if (ret < 0)
+		goto out_put_heap_pool;
+
+	/* Heap pools are per-VM. We combine the VM and HEAP id to make
+	 * a unique heap handle.
+	 */
+	args->handle = (args->vm_id << 16) | ret;
+	ret = 0;
+
+out_put_heap_pool:
+	panthor_heap_pool_put(pool);
+
+out_put_vm:
+	panthor_vm_put(vm);
+	return ret;
+}
+
+static int panthor_ioctl_tiler_heap_destroy(struct drm_device *ddev, void *data,
+					    struct drm_file *file)
+{
+	struct panthor_file *pfile = file->driver_priv;
+	struct drm_panthor_tiler_heap_destroy *args = data;
+	struct panthor_heap_pool *pool;
+	struct panthor_vm *vm;
+	int ret;
+
+	if (args->pad)
+		return -EINVAL;
+
+	vm = panthor_vm_pool_get_vm(pfile->vms, args->handle >> 16);
+	if (!vm)
+		return -EINVAL;
+
+	pool = panthor_vm_get_heap_pool(vm, false);
+	if (!pool) {
+		ret = -EINVAL;
+		goto out_put_vm;
+	}
+
+	ret = panthor_heap_destroy(pool, args->handle & GENMASK(15, 0));
+	panthor_heap_pool_put(pool);
+
+out_put_vm:
+	panthor_vm_put(vm);
+	return ret;
+}
+
+static int panthor_ioctl_vm_bind_async(struct drm_device *ddev,
+				       struct drm_panthor_vm_bind *args,
+				       struct drm_file *file)
+{
+	struct panthor_file *pfile = file->driver_priv;
+	struct drm_panthor_vm_bind_op *jobs_args;
+	struct panthor_submit_ctx ctx;
+	struct panthor_vm *vm;
+	int ret = 0;
+
+	vm = panthor_vm_pool_get_vm(pfile->vms, args->vm_id);
+	if (!vm)
+		return -EINVAL;
+
+	ret = PANTHOR_UOBJ_GET_ARRAY(jobs_args, &args->ops);
+	if (ret)
+		goto out_put_vm;
+
+	ret = panthor_submit_ctx_init(&ctx, file, args->ops.count);
+	if (ret)
+		goto out_free_jobs_args;
+
+	for (u32 i = 0; i < args->ops.count; i++) {
+		struct drm_panthor_vm_bind_op *op = &jobs_args[i];
+		struct drm_sched_job *job;
+
+		job = panthor_vm_bind_job_create(file, vm, op);
+		if (IS_ERR(job)) {
+			ret = PTR_ERR(job);
+			goto out_cleanup_submit_ctx;
+		}
+
+		ret = panthor_submit_ctx_add_job(&ctx, i, job, &op->syncs);
+		if (ret)
+			goto out_cleanup_submit_ctx;
+	}
+
+	ret = panthor_submit_ctx_collect_jobs_signal_ops(&ctx);
+	if (ret)
+		goto out_cleanup_submit_ctx;
+
+	/* Prepare reservation objects for each VM_BIND job. */
+	drm_exec_until_all_locked(&ctx.exec) {
+		for (u32 i = 0; i < ctx.job_count; i++) {
+			ret = panthor_vm_bind_job_prepare_resvs(&ctx.exec, ctx.jobs[i].job);
+			drm_exec_retry_on_contention(&ctx.exec);
+			if (ret)
+				goto out_cleanup_submit_ctx;
+		}
+	}
+
+	ret = panthor_submit_ctx_add_deps_and_arm_jobs(&ctx);
+	if (ret)
+		goto out_cleanup_submit_ctx;
+
+	/* Nothing can fail after that point. */
+	panthor_submit_ctx_push_jobs(&ctx, panthor_vm_bind_job_update_resvs);
+
+out_cleanup_submit_ctx:
+	panthor_submit_ctx_cleanup(&ctx, panthor_vm_bind_job_put);
+
+out_free_jobs_args:
+	kvfree(jobs_args);
+
+out_put_vm:
+	panthor_vm_put(vm);
+	return ret;
+}
+
+static int panthor_ioctl_vm_bind_sync(struct drm_device *ddev,
+				      struct drm_panthor_vm_bind *args,
+				      struct drm_file *file)
+{
+	struct panthor_file *pfile = file->driver_priv;
+	struct drm_panthor_vm_bind_op *jobs_args;
+	struct panthor_vm *vm;
+	int ret;
+
+	vm = panthor_vm_pool_get_vm(pfile->vms, args->vm_id);
+	if (!vm)
+		return -EINVAL;
+
+	ret = PANTHOR_UOBJ_GET_ARRAY(jobs_args, &args->ops);
+	if (ret)
+		goto out_put_vm;
+
+	for (u32 i = 0; i < args->ops.count; i++) {
+		ret = panthor_vm_bind_exec_sync_op(file, vm, &jobs_args[i]);
+		if (ret) {
+			/* Update ops.count so the user knows where things failed. */
+			args->ops.count = i;
+			break;
+		}
+	}
+
+	kvfree(jobs_args);
+
+out_put_vm:
+	panthor_vm_put(vm);
+	return ret;
+}
+
+#define PANTHOR_VM_BIND_FLAGS DRM_PANTHOR_VM_BIND_ASYNC
+
+static int panthor_ioctl_vm_bind(struct drm_device *ddev, void *data,
+				 struct drm_file *file)
+{
+	struct drm_panthor_vm_bind *args = data;
+	int cookie, ret;
+
+	if (!drm_dev_enter(ddev, &cookie))
+		return -ENODEV;
+
+	if (args->flags & DRM_PANTHOR_VM_BIND_ASYNC)
+		ret = panthor_ioctl_vm_bind_async(ddev, args, file);
+	else
+		ret = panthor_ioctl_vm_bind_sync(ddev, args, file);
+
+	drm_dev_exit(cookie);
+	return ret;
+}
+
+static int panthor_ioctl_vm_get_state(struct drm_device *ddev, void *data,
+				      struct drm_file *file)
+{
+	struct panthor_file *pfile = file->driver_priv;
+	struct drm_panthor_vm_get_state *args = data;
+	struct panthor_vm *vm;
+
+	vm = panthor_vm_pool_get_vm(pfile->vms, args->vm_id);
+	if (!vm)
+		return -EINVAL;
+
+	if (panthor_vm_is_unusable(vm))
+		args->state = DRM_PANTHOR_VM_STATE_UNUSABLE;
+	else
+		args->state = DRM_PANTHOR_VM_STATE_USABLE;
+
+	panthor_vm_put(vm);
+	return 0;
+}
+
+static int
+panthor_open(struct drm_device *ddev, struct drm_file *file)
+{
+	struct panthor_device *ptdev = container_of(ddev, struct panthor_device, base);
+	struct panthor_file *pfile;
+	int ret;
+
+	if (!try_module_get(THIS_MODULE))
+		return -EINVAL;
+
+	pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
+	if (!pfile) {
+		ret = -ENOMEM;
+		goto err_put_mod;
+	}
+
+	pfile->ptdev = ptdev;
+
+	ret = panthor_vm_pool_create(pfile);
+	if (ret)
+		goto err_free_file;
+
+	ret = panthor_group_pool_create(pfile);
+	if (ret)
+		goto err_destroy_vm_pool;
+
+	file->driver_priv = pfile;
+	return 0;
+
+err_destroy_vm_pool:
+	panthor_vm_pool_destroy(pfile);
+
+err_free_file:
+	kfree(pfile);
+
+err_put_mod:
+	module_put(THIS_MODULE);
+	return ret;
+}
+
+static void
+panthor_postclose(struct drm_device *ddev, struct drm_file *file)
+{
+	struct panthor_file *pfile = file->driver_priv;
+
+	panthor_group_pool_destroy(pfile);
+	panthor_vm_pool_destroy(pfile);
+
+	kfree(pfile);
+	module_put(THIS_MODULE);
+}
+
+static const struct drm_ioctl_desc panthor_drm_driver_ioctls[] = {
+#define PANTHOR_IOCTL(n, func, flags) \
+	DRM_IOCTL_DEF_DRV(PANTHOR_##n, panthor_ioctl_##func, flags)
+
+	PANTHOR_IOCTL(DEV_QUERY, dev_query, DRM_RENDER_ALLOW),
+	PANTHOR_IOCTL(VM_CREATE, vm_create, DRM_RENDER_ALLOW),
+	PANTHOR_IOCTL(VM_DESTROY, vm_destroy, DRM_RENDER_ALLOW),
+	PANTHOR_IOCTL(VM_BIND, vm_bind, DRM_RENDER_ALLOW),
+	PANTHOR_IOCTL(VM_GET_STATE, vm_get_state, DRM_RENDER_ALLOW),
+	PANTHOR_IOCTL(BO_CREATE, bo_create, DRM_RENDER_ALLOW),
+	PANTHOR_IOCTL(BO_MMAP_OFFSET, bo_mmap_offset, DRM_RENDER_ALLOW),
+	PANTHOR_IOCTL(GROUP_CREATE, group_create, DRM_RENDER_ALLOW),
+	PANTHOR_IOCTL(GROUP_DESTROY, group_destroy, DRM_RENDER_ALLOW),
+	PANTHOR_IOCTL(GROUP_GET_STATE, group_get_state, DRM_RENDER_ALLOW),
+	PANTHOR_IOCTL(TILER_HEAP_CREATE, tiler_heap_create, DRM_RENDER_ALLOW),
+	PANTHOR_IOCTL(TILER_HEAP_DESTROY, tiler_heap_destroy, DRM_RENDER_ALLOW),
+	PANTHOR_IOCTL(GROUP_SUBMIT, group_submit, DRM_RENDER_ALLOW),
+};
+
+static int panthor_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct drm_file *file = filp->private_data;
+	struct panthor_file *pfile = file->driver_priv;
+	struct panthor_device *ptdev = pfile->ptdev;
+	u64 offset = (u64)vma->vm_pgoff << PAGE_SHIFT;
+	int ret, cookie;
+
+	if (!drm_dev_enter(file->minor->dev, &cookie))
+		return -ENODEV;
+
+#ifdef CONFIG_ARM64
+	/*
+	 * With 32-bit systems being limited by the 32-bit representation of
+	 * mmap2's pgoffset field, we need to make the MMIO offset arch
+	 * specific. This converts a user MMIO offset into something the kernel
+	 * driver understands.
+	 */
+	if (test_tsk_thread_flag(current, TIF_32BIT) &&
+	    offset >= DRM_PANTHOR_USER_MMIO_OFFSET_32BIT) {
+		offset += DRM_PANTHOR_USER_MMIO_OFFSET_64BIT -
+			  DRM_PANTHOR_USER_MMIO_OFFSET_32BIT;
+		vma->vm_pgoff = offset >> PAGE_SHIFT;
+	}
+#endif
+
+	if (offset >= DRM_PANTHOR_USER_MMIO_OFFSET)
+		ret = panthor_device_mmap_io(ptdev, vma);
+	else
+		ret = drm_gem_mmap(filp, vma);
+
+	drm_dev_exit(cookie);
+	return ret;
+}
+
+static const struct file_operations panthor_drm_driver_fops = {
+	.open = drm_open,
+	.release = drm_release,
+	.unlocked_ioctl = drm_ioctl,
+	.compat_ioctl = drm_compat_ioctl,
+	.poll = drm_poll,
+	.read = drm_read,
+	.llseek = noop_llseek,
+	.mmap = panthor_mmap,
+};
+
+/*
+ * PanCSF driver version:
+ * - 1.0 - initial interface
+ */
+static const struct drm_driver panthor_drm_driver = {
+	.driver_features = DRIVER_RENDER | DRIVER_GEM | DRIVER_SYNCOBJ |
+			   DRIVER_SYNCOBJ_TIMELINE | DRIVER_GEM_GPUVA,
+	.open = panthor_open,
+	.postclose = panthor_postclose,
+	.ioctls = panthor_drm_driver_ioctls,
+	.num_ioctls = ARRAY_SIZE(panthor_drm_driver_ioctls),
+	.fops = &panthor_drm_driver_fops,
+	.name = "panthor",
+	.desc = "Panthor DRM driver",
+	.date = "20230801",
+	.major = 1,
+	.minor = 0,
+
+	.gem_create_object = panthor_gem_create_object,
+	.gem_prime_import_sg_table = drm_gem_shmem_prime_import_sg_table,
+};
+
+static int panthor_probe(struct platform_device *pdev)
+{
+	struct panthor_device *ptdev;
+
+	ptdev = devm_drm_dev_alloc(&pdev->dev, &panthor_drm_driver,
+				   struct panthor_device, base);
+	if (!ptdev)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, ptdev);
+
+	return panthor_device_init(ptdev);
+}
+
+static void panthor_remove(struct platform_device *pdev)
+{
+	struct panthor_device *ptdev = platform_get_drvdata(pdev);
+
+	panthor_device_unplug(ptdev);
+}
+
+static const struct of_device_id dt_match[] = {
+	{ .compatible = "rockchip,rk3588-mali" },
+	{ .compatible = "arm,mali-valhall-csf" },
+	{}
+};
+MODULE_DEVICE_TABLE(of, dt_match);
+
+static DEFINE_RUNTIME_DEV_PM_OPS(panthor_pm_ops,
+				 panthor_device_suspend,
+				 panthor_device_resume,
+				 NULL);
+
+static struct platform_driver panthor_driver = {
+	.probe = panthor_probe,
+	.remove_new = panthor_remove,
+	.driver = {
+		.name = "panthor",
+		.pm = &panthor_pm_ops,
+		.of_match_table = dt_match,
+	},
+};
+
+/*
+ * Workqueue used to cleanup stuff.
+ *
+ * We create a dedicated workqueue so we can drain on unplug and
+ * make sure all resources are freed before the module is unloaded.
+ */
+struct workqueue_struct *panthor_cleanup_wq;
+
+static int __init panthor_init(void)
+{
+	int ret;
+
+	ret = panthor_mmu_pt_cache_init();
+	if (ret)
+		return ret;
+
+	panthor_cleanup_wq = alloc_workqueue("panthor-cleanup", WQ_UNBOUND, 0);
+	if (!panthor_cleanup_wq) {
+		pr_err("panthor: Failed to allocate the workqueues");
+		ret = -ENOMEM;
+		goto err_mmu_pt_cache_fini;
+	}
+
+	ret = platform_driver_register(&panthor_driver);
+	if (ret)
+		goto err_destroy_cleanup_wq;
+
+	return 0;
+
+err_destroy_cleanup_wq:
+	destroy_workqueue(panthor_cleanup_wq);
+
+err_mmu_pt_cache_fini:
+	panthor_mmu_pt_cache_fini();
+	return ret;
+}
+module_init(panthor_init);
+
+static void __exit panthor_exit(void)
+{
+	platform_driver_unregister(&panthor_driver);
+	destroy_workqueue(panthor_cleanup_wq);
+	panthor_mmu_pt_cache_fini();
+}
+module_exit(panthor_exit);
+
+MODULE_AUTHOR("Panthor Project Developers");
+MODULE_DESCRIPTION("Panthor DRM Driver");
+MODULE_LICENSE("Dual MIT/GPL");
diff --git a/drivers/gpu/drm/panthor/panthor_fw.c b/drivers/gpu/drm/panthor/panthor_fw.c
new file mode 100644
index 0000000000000..33c87a59834ec
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_fw.c
@@ -0,0 +1,1362 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+/* Copyright 2023 Collabora ltd. */
+
+#ifdef CONFIG_ARM_ARCH_TIMER
+#include <asm/arch_timer.h>
+#endif
+
+#include <linux/clk.h>
+#include <linux/dma-mapping.h>
+#include <linux/firmware.h>
+#include <linux/iopoll.h>
+#include <linux/iosys-map.h>
+#include <linux/mutex.h>
+#include <linux/platform_device.h>
+
+#include <drm/drm_drv.h>
+#include <drm/drm_managed.h>
+
+#include "panthor_device.h"
+#include "panthor_fw.h"
+#include "panthor_gem.h"
+#include "panthor_gpu.h"
+#include "panthor_mmu.h"
+#include "panthor_regs.h"
+#include "panthor_sched.h"
+
+#define CSF_FW_NAME "mali_csffw.bin"
+
+#define PING_INTERVAL_MS			12000
+#define PROGRESS_TIMEOUT_CYCLES			(5ull * 500 * 1024 * 1024)
+#define PROGRESS_TIMEOUT_SCALE_SHIFT		10
+#define IDLE_HYSTERESIS_US			800
+#define PWROFF_HYSTERESIS_US			10000
+
+/**
+ * struct panthor_fw_binary_hdr - Firmware binary header.
+ */
+struct panthor_fw_binary_hdr {
+	/** @magic: Magic value to check binary validity. */
+	u32 magic;
+#define CSF_FW_BINARY_HEADER_MAGIC		0xc3f13a6e
+
+	/** @minor: Minor FW version. */
+	u8 minor;
+
+	/** @major: Major FW version. */
+	u8 major;
+#define CSF_FW_BINARY_HEADER_MAJOR_MAX		0
+
+	/** @padding1: MBZ. */
+	u16 padding1;
+
+	/** @version_hash: FW version hash. */
+	u32 version_hash;
+
+	/** @padding2: MBZ. */
+	u32 padding2;
+
+	/** @size: FW binary size. */
+	u32 size;
+};
+
+/**
+ * enum panthor_fw_binary_entry_type - Firmware binary entry type
+ */
+enum panthor_fw_binary_entry_type {
+	/** @CSF_FW_BINARY_ENTRY_TYPE_IFACE: Host <-> FW interface. */
+	CSF_FW_BINARY_ENTRY_TYPE_IFACE = 0,
+
+	/** @CSF_FW_BINARY_ENTRY_TYPE_CONFIG: FW config. */
+	CSF_FW_BINARY_ENTRY_TYPE_CONFIG = 1,
+
+	/** @CSF_FW_BINARY_ENTRY_TYPE_FUTF_TEST: Unit-tests. */
+	CSF_FW_BINARY_ENTRY_TYPE_FUTF_TEST = 2,
+
+	/** @CSF_FW_BINARY_ENTRY_TYPE_TRACE_BUFFER: Trace buffer interface. */
+	CSF_FW_BINARY_ENTRY_TYPE_TRACE_BUFFER = 3,
+
+	/** @CSF_FW_BINARY_ENTRY_TYPE_TIMELINE_METADATA: Timeline metadata interface. */
+	CSF_FW_BINARY_ENTRY_TYPE_TIMELINE_METADATA = 4,
+};
+
+#define CSF_FW_BINARY_ENTRY_TYPE(ehdr)					((ehdr) & 0xff)
+#define CSF_FW_BINARY_ENTRY_SIZE(ehdr)					(((ehdr) >> 8) & 0xff)
+#define CSF_FW_BINARY_ENTRY_UPDATE					BIT(30)
+#define CSF_FW_BINARY_ENTRY_OPTIONAL					BIT(31)
+
+#define CSF_FW_BINARY_IFACE_ENTRY_RD_RD					BIT(0)
+#define CSF_FW_BINARY_IFACE_ENTRY_RD_WR					BIT(1)
+#define CSF_FW_BINARY_IFACE_ENTRY_RD_EX					BIT(2)
+#define CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_NONE			(0 << 3)
+#define CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_CACHED			(1 << 3)
+#define CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_UNCACHED_COHERENT	(2 << 3)
+#define CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_CACHED_COHERENT		(3 << 3)
+#define CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_MASK			GENMASK(4, 3)
+#define CSF_FW_BINARY_IFACE_ENTRY_RD_PROT				BIT(5)
+#define CSF_FW_BINARY_IFACE_ENTRY_RD_SHARED				BIT(30)
+#define CSF_FW_BINARY_IFACE_ENTRY_RD_ZERO				BIT(31)
+
+#define CSF_FW_BINARY_IFACE_ENTRY_RD_SUPPORTED_FLAGS			\
+	(CSF_FW_BINARY_IFACE_ENTRY_RD_RD |				\
+	 CSF_FW_BINARY_IFACE_ENTRY_RD_WR |				\
+	 CSF_FW_BINARY_IFACE_ENTRY_RD_EX |				\
+	 CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_MASK |			\
+	 CSF_FW_BINARY_IFACE_ENTRY_RD_PROT |				\
+	 CSF_FW_BINARY_IFACE_ENTRY_RD_SHARED  |				\
+	 CSF_FW_BINARY_IFACE_ENTRY_RD_ZERO)
+
+/**
+ * struct panthor_fw_binary_section_entry_hdr - Describes a section of FW binary
+ */
+struct panthor_fw_binary_section_entry_hdr {
+	/** @flags: Section flags. */
+	u32 flags;
+
+	/** @va: MCU virtual range to map this binary section to. */
+	struct {
+		/** @start: Start address. */
+		u32 start;
+
+		/** @end: End address. */
+		u32 end;
+	} va;
+
+	/** @data: Data to initialize the FW section with. */
+	struct {
+		/** @start: Start offset in the FW binary. */
+		u32 start;
+
+		/** @end: End offset in the FW binary. */
+		u32 end;
+	} data;
+};
+
+/**
+ * struct panthor_fw_binary_iter - Firmware binary iterator
+ *
+ * Used to parse a firmware binary.
+ */
+struct panthor_fw_binary_iter {
+	/** @data: FW binary data. */
+	const void *data;
+
+	/** @size: FW binary size. */
+	size_t size;
+
+	/** @offset: Iterator offset. */
+	size_t offset;
+};
+
+/**
+ * struct panthor_fw_section - FW section
+ */
+struct panthor_fw_section {
+	/** @node: Used to keep track of FW sections. */
+	struct list_head node;
+
+	/** @flags: Section flags, as encoded in the FW binary. */
+	u32 flags;
+
+	/** @mem: Section memory. */
+	struct panthor_kernel_bo *mem;
+
+	/**
+	 * @name: Name of the section, as specified in the binary.
+	 *
+	 * Can be NULL.
+	 */
+	const char *name;
+
+	/**
+	 * @data: Initial data copied to the FW memory.
+	 *
+	 * We keep data around so we can reload sections after a reset.
+	 */
+	struct {
+		/** @buf: Buffed used to store init data. */
+		const void *buf;
+
+		/** @size: Size of @buf in bytes. */
+		size_t size;
+	} data;
+};
+
+#define CSF_MCU_SHARED_REGION_START		0x04000000ULL
+#define CSF_MCU_SHARED_REGION_SIZE		0x04000000ULL
+
+#define MIN_CS_PER_CSG				8
+#define MIN_CSGS				3
+#define MAX_CSG_PRIO				0xf
+
+#define CSF_IFACE_VERSION(major, minor, patch)	\
+	(((major) << 24) | ((minor) << 16) | (patch))
+#define CSF_IFACE_VERSION_MAJOR(v)		((v) >> 24)
+#define CSF_IFACE_VERSION_MINOR(v)		(((v) >> 16) & 0xff)
+#define CSF_IFACE_VERSION_PATCH(v)		((v) & 0xffff)
+
+#define CSF_GROUP_CONTROL_OFFSET		0x1000
+#define CSF_STREAM_CONTROL_OFFSET		0x40
+#define CSF_UNPRESERVED_REG_COUNT		4
+
+/**
+ * struct panthor_fw_iface - FW interfaces
+ */
+struct panthor_fw_iface {
+	/** @global: Global interface. */
+	struct panthor_fw_global_iface global;
+
+	/** @groups: Group slot interfaces. */
+	struct panthor_fw_csg_iface groups[MAX_CSGS];
+
+	/** @streams: Command stream slot interfaces. */
+	struct panthor_fw_cs_iface streams[MAX_CSGS][MAX_CS_PER_CSG];
+};
+
+/**
+ * struct panthor_fw - Firmware management
+ */
+struct panthor_fw {
+	/** @vm: MCU VM. */
+	struct panthor_vm *vm;
+
+	/** @sections: List of FW sections. */
+	struct list_head sections;
+
+	/** @shared_section: The section containing the FW interfaces. */
+	struct panthor_fw_section *shared_section;
+
+	/** @iface: FW interfaces. */
+	struct panthor_fw_iface iface;
+
+	/** @watchdog: Collection of fields relating to the FW watchdog. */
+	struct {
+		/** @ping_work: Delayed work used to ping the FW. */
+		struct delayed_work ping_work;
+	} watchdog;
+
+	/**
+	 * @req_waitqueue: FW request waitqueue.
+	 *
+	 * Everytime a request is sent to a command stream group or the global
+	 * interface, the caller will first busy wait for the request to be
+	 * acknowledged, and then fallback to a sleeping wait.
+	 *
+	 * This wait queue is here to support the sleeping wait flavor.
+	 */
+	wait_queue_head_t req_waitqueue;
+
+	/** @booted: True is the FW is booted */
+	bool booted;
+
+	/**
+	 * @fast_reset: True if the post_reset logic can proceed with a fast reset.
+	 *
+	 * A fast reset is just a reset where the driver doesn't reload the FW sections.
+	 *
+	 * Any time the firmware is properly suspended, a fast reset can take place.
+	 * On the other hand, if the halt operation failed, the driver will reload
+	 * all sections to make sure we start from a fresh state.
+	 */
+	bool fast_reset;
+
+	/** @irq: Job irq data. */
+	struct panthor_irq irq;
+};
+
+struct panthor_vm *panthor_fw_vm(struct panthor_device *ptdev)
+{
+	return ptdev->fw->vm;
+}
+
+/**
+ * panthor_fw_get_glb_iface() - Get the global interface
+ * @ptdev: Device.
+ *
+ * Return: The global interface.
+ */
+struct panthor_fw_global_iface *
+panthor_fw_get_glb_iface(struct panthor_device *ptdev)
+{
+	return &ptdev->fw->iface.global;
+}
+
+/**
+ * panthor_fw_get_csg_iface() - Get a command stream group slot interface
+ * @ptdev: Device.
+ * @csg_slot: Index of the command stream group slot.
+ *
+ * Return: The command stream group slot interface.
+ */
+struct panthor_fw_csg_iface *
+panthor_fw_get_csg_iface(struct panthor_device *ptdev, u32 csg_slot)
+{
+	if (drm_WARN_ON(&ptdev->base, csg_slot >= MAX_CSGS))
+		return NULL;
+
+	return &ptdev->fw->iface.groups[csg_slot];
+}
+
+/**
+ * panthor_fw_get_cs_iface() - Get a command stream slot interface
+ * @ptdev: Device.
+ * @csg_slot: Index of the command stream group slot.
+ * @cs_slot: Index of the command stream slot.
+ *
+ * Return: The command stream slot interface.
+ */
+struct panthor_fw_cs_iface *
+panthor_fw_get_cs_iface(struct panthor_device *ptdev, u32 csg_slot, u32 cs_slot)
+{
+	if (drm_WARN_ON(&ptdev->base, csg_slot >= MAX_CSGS || cs_slot > MAX_CS_PER_CSG))
+		return NULL;
+
+	return &ptdev->fw->iface.streams[csg_slot][cs_slot];
+}
+
+/**
+ * panthor_fw_conv_timeout() - Convert a timeout into a cycle-count
+ * @ptdev: Device.
+ * @timeout_us: Timeout expressed in micro-seconds.
+ *
+ * The FW has two timer sources: the GPU counter or arch-timer. We need
+ * to express timeouts in term of number of cycles and specify which
+ * timer source should be used.
+ *
+ * Return: A value suitable for timeout fields in the global interface.
+ */
+static u32 panthor_fw_conv_timeout(struct panthor_device *ptdev, u32 timeout_us)
+{
+	bool use_cycle_counter = false;
+	u32 timer_rate = 0;
+	u64 mod_cycles;
+
+#ifdef CONFIG_ARM_ARCH_TIMER
+	timer_rate = arch_timer_get_cntfrq();
+#endif
+
+	if (!timer_rate) {
+		use_cycle_counter = true;
+		timer_rate = clk_get_rate(ptdev->clks.core);
+	}
+
+	if (drm_WARN_ON(&ptdev->base, !timer_rate)) {
+		/* We couldn't get a valid clock rate, let's just pick the
+		 * maximum value so the FW still handles the core
+		 * power on/off requests.
+		 */
+		return GLB_TIMER_VAL(~0) |
+		       GLB_TIMER_SOURCE_GPU_COUNTER;
+	}
+
+	mod_cycles = DIV_ROUND_UP_ULL((u64)timeout_us * timer_rate,
+				      1000000ull << 10);
+	if (drm_WARN_ON(&ptdev->base, mod_cycles > GLB_TIMER_VAL(~0)))
+		mod_cycles = GLB_TIMER_VAL(~0);
+
+	return GLB_TIMER_VAL(mod_cycles) |
+	       (use_cycle_counter ? GLB_TIMER_SOURCE_GPU_COUNTER : 0);
+}
+
+static int panthor_fw_binary_iter_read(struct panthor_device *ptdev,
+				       struct panthor_fw_binary_iter *iter,
+				       void *out, size_t size)
+{
+	size_t new_offset = iter->offset + size;
+
+	if (new_offset > iter->size || new_offset < iter->offset) {
+		drm_err(&ptdev->base, "Firmware too small\n");
+		return -EINVAL;
+	}
+
+	memcpy(out, iter->data + iter->offset, size);
+	iter->offset = new_offset;
+	return 0;
+}
+
+static int panthor_fw_binary_sub_iter_init(struct panthor_device *ptdev,
+					   struct panthor_fw_binary_iter *iter,
+					   struct panthor_fw_binary_iter *sub_iter,
+					   size_t size)
+{
+	size_t new_offset = iter->offset + size;
+
+	if (new_offset > iter->size || new_offset < iter->offset) {
+		drm_err(&ptdev->base, "Firmware entry too long\n");
+		return -EINVAL;
+	}
+
+	sub_iter->offset = 0;
+	sub_iter->data = iter->data + iter->offset;
+	sub_iter->size = size;
+	iter->offset = new_offset;
+	return 0;
+}
+
+static void panthor_fw_init_section_mem(struct panthor_device *ptdev,
+					struct panthor_fw_section *section)
+{
+	bool was_mapped = !!section->mem->kmap;
+	int ret;
+
+	if (!section->data.size &&
+	    !(section->flags & CSF_FW_BINARY_IFACE_ENTRY_RD_ZERO))
+		return;
+
+	ret = panthor_kernel_bo_vmap(section->mem);
+	if (drm_WARN_ON(&ptdev->base, ret))
+		return;
+
+	memcpy(section->mem->kmap, section->data.buf, section->data.size);
+	if (section->flags & CSF_FW_BINARY_IFACE_ENTRY_RD_ZERO) {
+		memset(section->mem->kmap + section->data.size, 0,
+		       panthor_kernel_bo_size(section->mem) - section->data.size);
+	}
+
+	if (!was_mapped)
+		panthor_kernel_bo_vunmap(section->mem);
+}
+
+/**
+ * panthor_fw_alloc_queue_iface_mem() - Allocate a ring-buffer interfaces.
+ * @ptdev: Device.
+ * @input: Pointer holding the input interface on success.
+ * Should be ignored on failure.
+ * @output: Pointer holding the output interface on success.
+ * Should be ignored on failure.
+ * @input_fw_va: Pointer holding the input interface FW VA on success.
+ * Should be ignored on failure.
+ * @output_fw_va: Pointer holding the output interface FW VA on success.
+ * Should be ignored on failure.
+ *
+ * Allocates panthor_fw_ringbuf_{input,out}_iface interfaces. The input
+ * interface is at offset 0, and the output interface at offset 4096.
+ *
+ * Return: A valid pointer in case of success, an ERR_PTR() otherwise.
+ */
+struct panthor_kernel_bo *
+panthor_fw_alloc_queue_iface_mem(struct panthor_device *ptdev,
+				 struct panthor_fw_ringbuf_input_iface **input,
+				 const struct panthor_fw_ringbuf_output_iface **output,
+				 u32 *input_fw_va, u32 *output_fw_va)
+{
+	struct panthor_kernel_bo *mem;
+	int ret;
+
+	mem = panthor_kernel_bo_create(ptdev, ptdev->fw->vm, SZ_8K,
+				       DRM_PANTHOR_BO_NO_MMAP,
+				       DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
+				       DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
+				       PANTHOR_VM_KERNEL_AUTO_VA);
+	if (IS_ERR(mem))
+		return mem;
+
+	ret = panthor_kernel_bo_vmap(mem);
+	if (ret) {
+		panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), mem);
+		return ERR_PTR(ret);
+	}
+
+	memset(mem->kmap, 0, panthor_kernel_bo_size(mem));
+	*input = mem->kmap;
+	*output = mem->kmap + SZ_4K;
+	*input_fw_va = panthor_kernel_bo_gpuva(mem);
+	*output_fw_va = *input_fw_va + SZ_4K;
+
+	return mem;
+}
+
+/**
+ * panthor_fw_alloc_suspend_buf_mem() - Allocate a suspend buffer for a command stream group.
+ * @ptdev: Device.
+ * @size: Size of the suspend buffer.
+ *
+ * Return: A valid pointer in case of success, an ERR_PTR() otherwise.
+ */
+struct panthor_kernel_bo *
+panthor_fw_alloc_suspend_buf_mem(struct panthor_device *ptdev, size_t size)
+{
+	return panthor_kernel_bo_create(ptdev, panthor_fw_vm(ptdev), size,
+					DRM_PANTHOR_BO_NO_MMAP,
+					DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC,
+					PANTHOR_VM_KERNEL_AUTO_VA);
+}
+
+static int panthor_fw_load_section_entry(struct panthor_device *ptdev,
+					 const struct firmware *fw,
+					 struct panthor_fw_binary_iter *iter,
+					 u32 ehdr)
+{
+	struct panthor_fw_binary_section_entry_hdr hdr;
+	struct panthor_fw_section *section;
+	u32 section_size;
+	u32 name_len;
+	int ret;
+
+	ret = panthor_fw_binary_iter_read(ptdev, iter, &hdr, sizeof(hdr));
+	if (ret)
+		return ret;
+
+	if (hdr.data.end < hdr.data.start) {
+		drm_err(&ptdev->base, "Firmware corrupted, data.end < data.start (0x%x < 0x%x)\n",
+			hdr.data.end, hdr.data.start);
+		return -EINVAL;
+	}
+
+	if (hdr.va.end < hdr.va.start) {
+		drm_err(&ptdev->base, "Firmware corrupted, hdr.va.end < hdr.va.start (0x%x < 0x%x)\n",
+			hdr.va.end, hdr.va.start);
+		return -EINVAL;
+	}
+
+	if (hdr.data.end > fw->size) {
+		drm_err(&ptdev->base, "Firmware corrupted, file truncated? data_end=0x%x > fw size=0x%zx\n",
+			hdr.data.end, fw->size);
+		return -EINVAL;
+	}
+
+	if ((hdr.va.start & ~PAGE_MASK) != 0 ||
+	    (hdr.va.end & ~PAGE_MASK) != 0) {
+		drm_err(&ptdev->base, "Firmware corrupted, virtual addresses not page aligned: 0x%x-0x%x\n",
+			hdr.va.start, hdr.va.end);
+		return -EINVAL;
+	}
+
+	if (hdr.flags & ~CSF_FW_BINARY_IFACE_ENTRY_RD_SUPPORTED_FLAGS) {
+		drm_err(&ptdev->base, "Firmware contains interface with unsupported flags (0x%x)\n",
+			hdr.flags);
+		return -EINVAL;
+	}
+
+	if (hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_PROT) {
+		drm_warn(&ptdev->base,
+			 "Firmware protected mode entry not be supported, ignoring");
+		return 0;
+	}
+
+	if (hdr.va.start == CSF_MCU_SHARED_REGION_START &&
+	    !(hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_SHARED)) {
+		drm_err(&ptdev->base,
+			"Interface at 0x%llx must be shared", CSF_MCU_SHARED_REGION_START);
+		return -EINVAL;
+	}
+
+	name_len = iter->size - iter->offset;
+
+	section = drmm_kzalloc(&ptdev->base, sizeof(*section), GFP_KERNEL);
+	if (!section)
+		return -ENOMEM;
+
+	list_add_tail(&section->node, &ptdev->fw->sections);
+	section->flags = hdr.flags;
+	section->data.size = hdr.data.end - hdr.data.start;
+
+	if (section->data.size > 0) {
+		void *data = drmm_kmalloc(&ptdev->base, section->data.size, GFP_KERNEL);
+
+		if (!data)
+			return -ENOMEM;
+
+		memcpy(data, fw->data + hdr.data.start, section->data.size);
+		section->data.buf = data;
+	}
+
+	if (name_len > 0) {
+		char *name = drmm_kmalloc(&ptdev->base, name_len + 1, GFP_KERNEL);
+
+		if (!name)
+			return -ENOMEM;
+
+		memcpy(name, iter->data + iter->offset, name_len);
+		name[name_len] = '\0';
+		section->name = name;
+	}
+
+	section_size = hdr.va.end - hdr.va.start;
+	if (section_size) {
+		u32 cache_mode = hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_MASK;
+		struct panthor_gem_object *bo;
+		u32 vm_map_flags = 0;
+		struct sg_table *sgt;
+		u64 va = hdr.va.start;
+
+		if (!(hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_WR))
+			vm_map_flags |= DRM_PANTHOR_VM_BIND_OP_MAP_READONLY;
+
+		if (!(hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_EX))
+			vm_map_flags |= DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC;
+
+		/* TODO: CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_*_COHERENT are mapped to
+		 * non-cacheable for now. We might want to introduce a new
+		 * IOMMU_xxx flag (or abuse IOMMU_MMIO, which maps to device
+		 * memory and is currently not used by our driver) for
+		 * AS_MEMATTR_AARCH64_SHARED memory, so we can take benefit
+		 * of IO-coherent systems.
+		 */
+		if (cache_mode != CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_CACHED)
+			vm_map_flags |= DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED;
+
+		section->mem = panthor_kernel_bo_create(ptdev, panthor_fw_vm(ptdev),
+							section_size,
+							DRM_PANTHOR_BO_NO_MMAP,
+							vm_map_flags, va);
+		if (IS_ERR(section->mem))
+			return PTR_ERR(section->mem);
+
+		if (drm_WARN_ON(&ptdev->base, section->mem->va_node.start != hdr.va.start))
+			return -EINVAL;
+
+		if (section->flags & CSF_FW_BINARY_IFACE_ENTRY_RD_SHARED) {
+			ret = panthor_kernel_bo_vmap(section->mem);
+			if (ret)
+				return ret;
+		}
+
+		panthor_fw_init_section_mem(ptdev, section);
+
+		bo = to_panthor_bo(section->mem->obj);
+		sgt = drm_gem_shmem_get_pages_sgt(&bo->base);
+		if (IS_ERR(sgt))
+			return PTR_ERR(sgt);
+
+		dma_sync_sgtable_for_device(ptdev->base.dev, sgt, DMA_TO_DEVICE);
+	}
+
+	if (hdr.va.start == CSF_MCU_SHARED_REGION_START)
+		ptdev->fw->shared_section = section;
+
+	return 0;
+}
+
+static void
+panthor_reload_fw_sections(struct panthor_device *ptdev, bool full_reload)
+{
+	struct panthor_fw_section *section;
+
+	list_for_each_entry(section, &ptdev->fw->sections, node) {
+		struct sg_table *sgt;
+
+		if (!full_reload && !(section->flags & CSF_FW_BINARY_IFACE_ENTRY_RD_WR))
+			continue;
+
+		panthor_fw_init_section_mem(ptdev, section);
+		sgt = drm_gem_shmem_get_pages_sgt(&to_panthor_bo(section->mem->obj)->base);
+		if (!drm_WARN_ON(&ptdev->base, IS_ERR_OR_NULL(sgt)))
+			dma_sync_sgtable_for_device(ptdev->base.dev, sgt, DMA_TO_DEVICE);
+	}
+}
+
+static int panthor_fw_load_entry(struct panthor_device *ptdev,
+				 const struct firmware *fw,
+				 struct panthor_fw_binary_iter *iter)
+{
+	struct panthor_fw_binary_iter eiter;
+	u32 ehdr;
+	int ret;
+
+	ret = panthor_fw_binary_iter_read(ptdev, iter, &ehdr, sizeof(ehdr));
+	if (ret)
+		return ret;
+
+	if ((iter->offset % sizeof(u32)) ||
+	    (CSF_FW_BINARY_ENTRY_SIZE(ehdr) % sizeof(u32))) {
+		drm_err(&ptdev->base, "Firmware entry isn't 32 bit aligned, offset=0x%x size=0x%x\n",
+			(u32)(iter->offset - sizeof(u32)), CSF_FW_BINARY_ENTRY_SIZE(ehdr));
+		return -EINVAL;
+	}
+
+	if (panthor_fw_binary_sub_iter_init(ptdev, iter, &eiter,
+					    CSF_FW_BINARY_ENTRY_SIZE(ehdr) - sizeof(ehdr)))
+		return -EINVAL;
+
+	switch (CSF_FW_BINARY_ENTRY_TYPE(ehdr)) {
+	case CSF_FW_BINARY_ENTRY_TYPE_IFACE:
+		return panthor_fw_load_section_entry(ptdev, fw, &eiter, ehdr);
+
+	/* FIXME: handle those entry types? */
+	case CSF_FW_BINARY_ENTRY_TYPE_CONFIG:
+	case CSF_FW_BINARY_ENTRY_TYPE_FUTF_TEST:
+	case CSF_FW_BINARY_ENTRY_TYPE_TRACE_BUFFER:
+	case CSF_FW_BINARY_ENTRY_TYPE_TIMELINE_METADATA:
+		return 0;
+	default:
+		break;
+	}
+
+	if (ehdr & CSF_FW_BINARY_ENTRY_OPTIONAL)
+		return 0;
+
+	drm_err(&ptdev->base,
+		"Unsupported non-optional entry type %u in firmware\n",
+		CSF_FW_BINARY_ENTRY_TYPE(ehdr));
+	return -EINVAL;
+}
+
+static int panthor_fw_load(struct panthor_device *ptdev)
+{
+	const struct firmware *fw = NULL;
+	struct panthor_fw_binary_iter iter = {};
+	struct panthor_fw_binary_hdr hdr;
+	char fw_path[128];
+	int ret;
+
+	snprintf(fw_path, sizeof(fw_path), "arm/mali/arch%d.%d/%s",
+		 (u32)GPU_ARCH_MAJOR(ptdev->gpu_info.gpu_id),
+		 (u32)GPU_ARCH_MINOR(ptdev->gpu_info.gpu_id),
+		 CSF_FW_NAME);
+
+	ret = request_firmware(&fw, fw_path, ptdev->base.dev);
+	if (ret) {
+		drm_err(&ptdev->base, "Failed to load firmware image '%s'\n",
+			CSF_FW_NAME);
+		return ret;
+	}
+
+	iter.data = fw->data;
+	iter.size = fw->size;
+	ret = panthor_fw_binary_iter_read(ptdev, &iter, &hdr, sizeof(hdr));
+	if (ret)
+		goto out;
+
+	if (hdr.magic != CSF_FW_BINARY_HEADER_MAGIC) {
+		ret = -EINVAL;
+		drm_err(&ptdev->base, "Invalid firmware magic\n");
+		goto out;
+	}
+
+	if (hdr.major != CSF_FW_BINARY_HEADER_MAJOR_MAX) {
+		ret = -EINVAL;
+		drm_err(&ptdev->base, "Unsupported firmware binary header version %d.%d (expected %d.x)\n",
+			hdr.major, hdr.minor, CSF_FW_BINARY_HEADER_MAJOR_MAX);
+		goto out;
+	}
+
+	if (hdr.size > iter.size) {
+		drm_err(&ptdev->base, "Firmware image is truncated\n");
+		goto out;
+	}
+
+	iter.size = hdr.size;
+
+	while (iter.offset < hdr.size) {
+		ret = panthor_fw_load_entry(ptdev, fw, &iter);
+		if (ret)
+			goto out;
+	}
+
+	if (!ptdev->fw->shared_section) {
+		drm_err(&ptdev->base, "Shared interface region not found\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+out:
+	release_firmware(fw);
+	return ret;
+}
+
+/**
+ * iface_fw_to_cpu_addr() - Turn an MCU address into a CPU address
+ * @ptdev: Device.
+ * @mcu_va: MCU address.
+ *
+ * Return: NULL if the address is not part of the shared section, non-NULL otherwise.
+ */
+static void *iface_fw_to_cpu_addr(struct panthor_device *ptdev, u32 mcu_va)
+{
+	u64 shared_mem_start = panthor_kernel_bo_gpuva(ptdev->fw->shared_section->mem);
+	u64 shared_mem_end = shared_mem_start +
+			     panthor_kernel_bo_size(ptdev->fw->shared_section->mem);
+	if (mcu_va < shared_mem_start || mcu_va >= shared_mem_end)
+		return NULL;
+
+	return ptdev->fw->shared_section->mem->kmap + (mcu_va - shared_mem_start);
+}
+
+static int panthor_init_cs_iface(struct panthor_device *ptdev,
+				 unsigned int csg_idx, unsigned int cs_idx)
+{
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+	struct panthor_fw_csg_iface *csg_iface = panthor_fw_get_csg_iface(ptdev, csg_idx);
+	struct panthor_fw_cs_iface *cs_iface = &ptdev->fw->iface.streams[csg_idx][cs_idx];
+	u64 shared_section_sz = panthor_kernel_bo_size(ptdev->fw->shared_section->mem);
+	u32 iface_offset = CSF_GROUP_CONTROL_OFFSET +
+			   (csg_idx * glb_iface->control->group_stride) +
+			   CSF_STREAM_CONTROL_OFFSET +
+			   (cs_idx * csg_iface->control->stream_stride);
+	struct panthor_fw_cs_iface *first_cs_iface =
+		panthor_fw_get_cs_iface(ptdev, 0, 0);
+
+	if (iface_offset + sizeof(*cs_iface) >= shared_section_sz)
+		return -EINVAL;
+
+	spin_lock_init(&cs_iface->lock);
+	cs_iface->control = ptdev->fw->shared_section->mem->kmap + iface_offset;
+	cs_iface->input = iface_fw_to_cpu_addr(ptdev, cs_iface->control->input_va);
+	cs_iface->output = iface_fw_to_cpu_addr(ptdev, cs_iface->control->output_va);
+
+	if (!cs_iface->input || !cs_iface->output) {
+		drm_err(&ptdev->base, "Invalid stream control interface input/output VA");
+		return -EINVAL;
+	}
+
+	if (cs_iface != first_cs_iface) {
+		if (cs_iface->control->features != first_cs_iface->control->features) {
+			drm_err(&ptdev->base, "Expecting identical CS slots");
+			return -EINVAL;
+		}
+	} else {
+		u32 reg_count = CS_FEATURES_WORK_REGS(cs_iface->control->features);
+
+		ptdev->csif_info.cs_reg_count = reg_count;
+		ptdev->csif_info.unpreserved_cs_reg_count = CSF_UNPRESERVED_REG_COUNT;
+	}
+
+	return 0;
+}
+
+static bool compare_csg(const struct panthor_fw_csg_control_iface *a,
+			const struct panthor_fw_csg_control_iface *b)
+{
+	if (a->features != b->features)
+		return false;
+	if (a->suspend_size != b->suspend_size)
+		return false;
+	if (a->protm_suspend_size != b->protm_suspend_size)
+		return false;
+	if (a->stream_num != b->stream_num)
+		return false;
+	return true;
+}
+
+static int panthor_init_csg_iface(struct panthor_device *ptdev,
+				  unsigned int csg_idx)
+{
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+	struct panthor_fw_csg_iface *csg_iface = &ptdev->fw->iface.groups[csg_idx];
+	u64 shared_section_sz = panthor_kernel_bo_size(ptdev->fw->shared_section->mem);
+	u32 iface_offset = CSF_GROUP_CONTROL_OFFSET + (csg_idx * glb_iface->control->group_stride);
+	unsigned int i;
+
+	if (iface_offset + sizeof(*csg_iface) >= shared_section_sz)
+		return -EINVAL;
+
+	spin_lock_init(&csg_iface->lock);
+	csg_iface->control = ptdev->fw->shared_section->mem->kmap + iface_offset;
+	csg_iface->input = iface_fw_to_cpu_addr(ptdev, csg_iface->control->input_va);
+	csg_iface->output = iface_fw_to_cpu_addr(ptdev, csg_iface->control->output_va);
+
+	if (csg_iface->control->stream_num < MIN_CS_PER_CSG ||
+	    csg_iface->control->stream_num > MAX_CS_PER_CSG)
+		return -EINVAL;
+
+	if (!csg_iface->input || !csg_iface->output) {
+		drm_err(&ptdev->base, "Invalid group control interface input/output VA");
+		return -EINVAL;
+	}
+
+	if (csg_idx > 0) {
+		struct panthor_fw_csg_iface *first_csg_iface =
+			panthor_fw_get_csg_iface(ptdev, 0);
+
+		if (!compare_csg(first_csg_iface->control, csg_iface->control)) {
+			drm_err(&ptdev->base, "Expecting identical CSG slots");
+			return -EINVAL;
+		}
+	}
+
+	for (i = 0; i < csg_iface->control->stream_num; i++) {
+		int ret = panthor_init_cs_iface(ptdev, csg_idx, i);
+
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static u32 panthor_get_instr_features(struct panthor_device *ptdev)
+{
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+
+	if (glb_iface->control->version < CSF_IFACE_VERSION(1, 1, 0))
+		return 0;
+
+	return glb_iface->control->instr_features;
+}
+
+static int panthor_fw_init_ifaces(struct panthor_device *ptdev)
+{
+	struct panthor_fw_global_iface *glb_iface = &ptdev->fw->iface.global;
+	unsigned int i;
+
+	if (!ptdev->fw->shared_section->mem->kmap)
+		return -EINVAL;
+
+	spin_lock_init(&glb_iface->lock);
+	glb_iface->control = ptdev->fw->shared_section->mem->kmap;
+
+	if (!glb_iface->control->version) {
+		drm_err(&ptdev->base, "Firmware version is 0. Firmware may have failed to boot");
+		return -EINVAL;
+	}
+
+	glb_iface->input = iface_fw_to_cpu_addr(ptdev, glb_iface->control->input_va);
+	glb_iface->output = iface_fw_to_cpu_addr(ptdev, glb_iface->control->output_va);
+	if (!glb_iface->input || !glb_iface->output) {
+		drm_err(&ptdev->base, "Invalid global control interface input/output VA");
+		return -EINVAL;
+	}
+
+	if (glb_iface->control->group_num > MAX_CSGS ||
+	    glb_iface->control->group_num < MIN_CSGS) {
+		drm_err(&ptdev->base, "Invalid number of control groups");
+		return -EINVAL;
+	}
+
+	for (i = 0; i < glb_iface->control->group_num; i++) {
+		int ret = panthor_init_csg_iface(ptdev, i);
+
+		if (ret)
+			return ret;
+	}
+
+	drm_info(&ptdev->base, "CSF FW v%d.%d.%d, Features %#x Instrumentation features %#x",
+		 CSF_IFACE_VERSION_MAJOR(glb_iface->control->version),
+		 CSF_IFACE_VERSION_MINOR(glb_iface->control->version),
+		 CSF_IFACE_VERSION_PATCH(glb_iface->control->version),
+		 glb_iface->control->features,
+		 panthor_get_instr_features(ptdev));
+	return 0;
+}
+
+static void panthor_fw_init_global_iface(struct panthor_device *ptdev)
+{
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+
+	/* Enable all cores. */
+	glb_iface->input->core_en_mask = ptdev->gpu_info.shader_present;
+
+	/* Setup timers. */
+	glb_iface->input->poweroff_timer = panthor_fw_conv_timeout(ptdev, PWROFF_HYSTERESIS_US);
+	glb_iface->input->progress_timer = PROGRESS_TIMEOUT_CYCLES >> PROGRESS_TIMEOUT_SCALE_SHIFT;
+	glb_iface->input->idle_timer = panthor_fw_conv_timeout(ptdev, IDLE_HYSTERESIS_US);
+
+	/* Enable interrupts we care about. */
+	glb_iface->input->ack_irq_mask = GLB_CFG_ALLOC_EN |
+					 GLB_PING |
+					 GLB_CFG_PROGRESS_TIMER |
+					 GLB_CFG_POWEROFF_TIMER |
+					 GLB_IDLE_EN |
+					 GLB_IDLE;
+
+	panthor_fw_update_reqs(glb_iface, req, GLB_IDLE_EN, GLB_IDLE_EN);
+	panthor_fw_toggle_reqs(glb_iface, req, ack,
+			       GLB_CFG_ALLOC_EN |
+			       GLB_CFG_POWEROFF_TIMER |
+			       GLB_CFG_PROGRESS_TIMER);
+
+	gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
+
+	/* Kick the watchdog. */
+	mod_delayed_work(ptdev->reset.wq, &ptdev->fw->watchdog.ping_work,
+			 msecs_to_jiffies(PING_INTERVAL_MS));
+}
+
+static void panthor_job_irq_handler(struct panthor_device *ptdev, u32 status)
+{
+	if (!ptdev->fw->booted && (status & JOB_INT_GLOBAL_IF))
+		ptdev->fw->booted = true;
+
+	wake_up_all(&ptdev->fw->req_waitqueue);
+
+	/* If the FW is not booted, don't process IRQs, just flag the FW as booted. */
+	if (!ptdev->fw->booted)
+		return;
+
+	panthor_sched_report_fw_events(ptdev, status);
+}
+PANTHOR_IRQ_HANDLER(job, JOB, panthor_job_irq_handler);
+
+static int panthor_fw_start(struct panthor_device *ptdev)
+{
+	bool timedout = false;
+
+	ptdev->fw->booted = false;
+	panthor_job_irq_resume(&ptdev->fw->irq, ~0);
+	gpu_write(ptdev, MCU_CONTROL, MCU_CONTROL_AUTO);
+
+	if (!wait_event_timeout(ptdev->fw->req_waitqueue,
+				ptdev->fw->booted,
+				msecs_to_jiffies(1000))) {
+		if (!ptdev->fw->booted &&
+		    !(gpu_read(ptdev, JOB_INT_STAT) & JOB_INT_GLOBAL_IF))
+			timedout = true;
+	}
+
+	if (timedout) {
+		static const char * const status_str[] = {
+			[MCU_STATUS_DISABLED] = "disabled",
+			[MCU_STATUS_ENABLED] = "enabled",
+			[MCU_STATUS_HALT] = "halt",
+			[MCU_STATUS_FATAL] = "fatal",
+		};
+		u32 status = gpu_read(ptdev, MCU_STATUS);
+
+		drm_err(&ptdev->base, "Failed to boot MCU (status=%s)",
+			status < ARRAY_SIZE(status_str) ? status_str[status] : "unknown");
+		return -ETIMEDOUT;
+	}
+
+	return 0;
+}
+
+static void panthor_fw_stop(struct panthor_device *ptdev)
+{
+	u32 status;
+
+	gpu_write(ptdev, MCU_CONTROL, MCU_CONTROL_DISABLE);
+	if (readl_poll_timeout(ptdev->iomem + MCU_STATUS, status,
+			       status == MCU_STATUS_DISABLED, 10, 100000))
+		drm_err(&ptdev->base, "Failed to stop MCU");
+}
+
+/**
+ * panthor_fw_pre_reset() - Call before a reset.
+ * @ptdev: Device.
+ * @on_hang: true if the reset was triggered on a GPU hang.
+ *
+ * If the reset is not triggered on a hang, we try to gracefully halt the
+ * MCU, so we can do a fast-reset when panthor_fw_post_reset() is called.
+ */
+void panthor_fw_pre_reset(struct panthor_device *ptdev, bool on_hang)
+{
+	/* Make sure we won't be woken up by a ping. */
+	cancel_delayed_work_sync(&ptdev->fw->watchdog.ping_work);
+
+	ptdev->fw->fast_reset = false;
+
+	if (!on_hang) {
+		struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+		u32 status;
+
+		panthor_fw_update_reqs(glb_iface, req, GLB_HALT, GLB_HALT);
+		gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
+		if (!readl_poll_timeout(ptdev->iomem + MCU_STATUS, status,
+					status == MCU_STATUS_HALT, 10, 100000) &&
+		    glb_iface->output->halt_status == PANTHOR_FW_HALT_OK) {
+			ptdev->fw->fast_reset = true;
+		} else {
+			drm_warn(&ptdev->base, "Failed to cleanly suspend MCU");
+		}
+
+		/* The FW detects 0 -> 1 transitions. Make sure we reset
+		 * the HALT bit before the FW is rebooted.
+		 */
+		panthor_fw_update_reqs(glb_iface, req, 0, GLB_HALT);
+	}
+
+	panthor_job_irq_suspend(&ptdev->fw->irq);
+}
+
+/**
+ * panthor_fw_post_reset() - Call after a reset.
+ * @ptdev: Device.
+ *
+ * Start the FW. If this is not a fast reset, all FW sections are reloaded to
+ * make sure we can recover from a memory corruption.
+ */
+int panthor_fw_post_reset(struct panthor_device *ptdev)
+{
+	int ret;
+
+	/* Make the MCU VM active. */
+	ret = panthor_vm_active(ptdev->fw->vm);
+	if (ret)
+		return ret;
+
+	/* If this is a fast reset, try to start the MCU without reloading
+	 * the FW sections. If it fails, go for a full reset.
+	 */
+	if (ptdev->fw->fast_reset) {
+		ret = panthor_fw_start(ptdev);
+		if (!ret)
+			goto out;
+
+		/* Force a disable, so we get a fresh boot on the next
+		 * panthor_fw_start() call.
+		 */
+		gpu_write(ptdev, MCU_CONTROL, MCU_CONTROL_DISABLE);
+		drm_err(&ptdev->base, "FW fast reset failed, trying a slow reset");
+	}
+
+	/* Reload all sections, including RO ones. We're not supposed
+	 * to end up here anyway, let's just assume the overhead of
+	 * reloading everything is acceptable.
+	 */
+	panthor_reload_fw_sections(ptdev, true);
+
+	ret = panthor_fw_start(ptdev);
+	if (ret) {
+		drm_err(&ptdev->base, "FW slow reset failed");
+		return ret;
+	}
+
+out:
+	/* We must re-initialize the global interface even on fast-reset. */
+	panthor_fw_init_global_iface(ptdev);
+	return 0;
+}
+
+/**
+ * panthor_fw_unplug() - Called when the device is unplugged.
+ * @ptdev: Device.
+ *
+ * This function must make sure all pending operations are flushed before
+ * will release device resources, thus preventing any interaction with
+ * the HW.
+ *
+ * If there is still FW-related work running after this function returns,
+ * they must use drm_dev_{enter,exit}() and skip any HW access when
+ * drm_dev_enter() returns false.
+ */
+void panthor_fw_unplug(struct panthor_device *ptdev)
+{
+	struct panthor_fw_section *section;
+
+	cancel_delayed_work_sync(&ptdev->fw->watchdog.ping_work);
+
+	/* Make sure the IRQ handler can be called after that point. */
+	if (ptdev->fw->irq.irq)
+		panthor_job_irq_suspend(&ptdev->fw->irq);
+
+	panthor_fw_stop(ptdev);
+
+	list_for_each_entry(section, &ptdev->fw->sections, node)
+		panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), section->mem);
+
+	/* We intentionally don't call panthor_vm_idle() and let
+	 * panthor_mmu_unplug() release the AS we acquired with
+	 * panthor_vm_active() so we don't have to track the VM active/idle
+	 * state to keep the active_refcnt balanced.
+	 */
+	panthor_vm_put(ptdev->fw->vm);
+
+	panthor_gpu_power_off(ptdev, L2, ptdev->gpu_info.l2_present, 20000);
+}
+
+/**
+ * panthor_fw_wait_acks() - Wait for requests to be acknowledged by the FW.
+ * @req_ptr: Pointer to the req register.
+ * @ack_ptr: Pointer to the ack register.
+ * @wq: Wait queue to use for the sleeping wait.
+ * @req_mask: Mask of requests to wait for.
+ * @acked: Pointer to field that's updated with the acked requests.
+ * If the function returns 0, *acked == req_mask.
+ * @timeout_ms: Timeout expressed in milliseconds.
+ *
+ * Return: 0 on success, -ETIMEDOUT otherwise.
+ */
+static int panthor_fw_wait_acks(const u32 *req_ptr, const u32 *ack_ptr,
+				wait_queue_head_t *wq,
+				u32 req_mask, u32 *acked,
+				u32 timeout_ms)
+{
+	u32 ack, req = READ_ONCE(*req_ptr) & req_mask;
+	int ret;
+
+	/* Busy wait for a few µsecs before falling back to a sleeping wait. */
+	*acked = req_mask;
+	ret = read_poll_timeout_atomic(READ_ONCE, ack,
+				       (ack & req_mask) == req,
+				       0, 10, 0,
+				       *ack_ptr);
+	if (!ret)
+		return 0;
+
+	if (wait_event_timeout(*wq, (READ_ONCE(*ack_ptr) & req_mask) == req,
+			       msecs_to_jiffies(timeout_ms)))
+		return 0;
+
+	/* Check one last time, in case we were not woken up for some reason. */
+	ack = READ_ONCE(*ack_ptr);
+	if ((ack & req_mask) == req)
+		return 0;
+
+	*acked = ~(req ^ ack) & req_mask;
+	return -ETIMEDOUT;
+}
+
+/**
+ * panthor_fw_glb_wait_acks() - Wait for global requests to be acknowledged.
+ * @ptdev: Device.
+ * @req_mask: Mask of requests to wait for.
+ * @acked: Pointer to field that's updated with the acked requests.
+ * If the function returns 0, *acked == req_mask.
+ * @timeout_ms: Timeout expressed in milliseconds.
+ *
+ * Return: 0 on success, -ETIMEDOUT otherwise.
+ */
+int panthor_fw_glb_wait_acks(struct panthor_device *ptdev,
+			     u32 req_mask, u32 *acked,
+			     u32 timeout_ms)
+{
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+
+	/* GLB_HALT doesn't get acked through the FW interface. */
+	if (drm_WARN_ON(&ptdev->base, req_mask & (~GLB_REQ_MASK | GLB_HALT)))
+		return -EINVAL;
+
+	return panthor_fw_wait_acks(&glb_iface->input->req,
+				    &glb_iface->output->ack,
+				    &ptdev->fw->req_waitqueue,
+				    req_mask, acked, timeout_ms);
+}
+
+/**
+ * panthor_fw_csg_wait_acks() - Wait for command stream group requests to be acknowledged.
+ * @ptdev: Device.
+ * @csg_slot: CSG slot ID.
+ * @req_mask: Mask of requests to wait for.
+ * @acked: Pointer to field that's updated with the acked requests.
+ * If the function returns 0, *acked == req_mask.
+ * @timeout_ms: Timeout expressed in milliseconds.
+ *
+ * Return: 0 on success, -ETIMEDOUT otherwise.
+ */
+int panthor_fw_csg_wait_acks(struct panthor_device *ptdev, u32 csg_slot,
+			     u32 req_mask, u32 *acked, u32 timeout_ms)
+{
+	struct panthor_fw_csg_iface *csg_iface = panthor_fw_get_csg_iface(ptdev, csg_slot);
+	int ret;
+
+	if (drm_WARN_ON(&ptdev->base, req_mask & ~CSG_REQ_MASK))
+		return -EINVAL;
+
+	ret = panthor_fw_wait_acks(&csg_iface->input->req,
+				   &csg_iface->output->ack,
+				   &ptdev->fw->req_waitqueue,
+				   req_mask, acked, timeout_ms);
+
+	/*
+	 * Check that all bits in the state field were updated, if any mismatch
+	 * then clear all bits in the state field. This allows code to do
+	 * (acked & CSG_STATE_MASK) and get the right value.
+	 */
+
+	if ((*acked & CSG_STATE_MASK) != CSG_STATE_MASK)
+		*acked &= ~CSG_STATE_MASK;
+
+	return ret;
+}
+
+/**
+ * panthor_fw_ring_csg_doorbells() - Ring command stream group doorbells.
+ * @ptdev: Device.
+ * @csg_mask: Bitmask encoding the command stream group doorbells to ring.
+ *
+ * This function is toggling bits in the doorbell_req and ringing the
+ * global doorbell. It doesn't require a user doorbell to be attached to
+ * the group.
+ */
+void panthor_fw_ring_csg_doorbells(struct panthor_device *ptdev, u32 csg_mask)
+{
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+
+	panthor_fw_toggle_reqs(glb_iface, doorbell_req, doorbell_ack, csg_mask);
+	gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
+}
+
+static void panthor_fw_ping_work(struct work_struct *work)
+{
+	struct panthor_fw *fw = container_of(work, struct panthor_fw, watchdog.ping_work.work);
+	struct panthor_device *ptdev = fw->irq.ptdev;
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+	u32 acked;
+	int ret;
+
+	if (panthor_device_reset_is_pending(ptdev))
+		return;
+
+	panthor_fw_toggle_reqs(glb_iface, req, ack, GLB_PING);
+	gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
+
+	ret = panthor_fw_glb_wait_acks(ptdev, GLB_PING, &acked, 100);
+	if (ret) {
+		panthor_device_schedule_reset(ptdev);
+		drm_err(&ptdev->base, "FW ping timeout, scheduling a reset");
+	} else {
+		mod_delayed_work(ptdev->reset.wq, &fw->watchdog.ping_work,
+				 msecs_to_jiffies(PING_INTERVAL_MS));
+	}
+}
+
+/**
+ * panthor_fw_init() - Initialize FW related data.
+ * @ptdev: Device.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int panthor_fw_init(struct panthor_device *ptdev)
+{
+	struct panthor_fw *fw;
+	int ret, irq;
+
+	fw = drmm_kzalloc(&ptdev->base, sizeof(*fw), GFP_KERNEL);
+	if (!fw)
+		return -ENOMEM;
+
+	ptdev->fw = fw;
+	init_waitqueue_head(&fw->req_waitqueue);
+	INIT_LIST_HEAD(&fw->sections);
+	INIT_DELAYED_WORK(&fw->watchdog.ping_work, panthor_fw_ping_work);
+
+	irq = platform_get_irq_byname(to_platform_device(ptdev->base.dev), "job");
+	if (irq <= 0)
+		return -ENODEV;
+
+	ret = panthor_request_job_irq(ptdev, &fw->irq, irq, 0);
+	if (ret) {
+		drm_err(&ptdev->base, "failed to request job irq");
+		return ret;
+	}
+
+	ret = panthor_gpu_l2_power_on(ptdev);
+	if (ret)
+		return ret;
+
+	fw->vm = panthor_vm_create(ptdev, true,
+				   0, SZ_4G,
+				   CSF_MCU_SHARED_REGION_START,
+				   CSF_MCU_SHARED_REGION_SIZE);
+	if (IS_ERR(fw->vm)) {
+		ret = PTR_ERR(fw->vm);
+		fw->vm = NULL;
+		goto err_unplug_fw;
+	}
+
+	ret = panthor_fw_load(ptdev);
+	if (ret)
+		goto err_unplug_fw;
+
+	ret = panthor_vm_active(fw->vm);
+	if (ret)
+		goto err_unplug_fw;
+
+	ret = panthor_fw_start(ptdev);
+	if (ret)
+		goto err_unplug_fw;
+
+	ret = panthor_fw_init_ifaces(ptdev);
+	if (ret)
+		goto err_unplug_fw;
+
+	panthor_fw_init_global_iface(ptdev);
+	return 0;
+
+err_unplug_fw:
+	panthor_fw_unplug(ptdev);
+	return ret;
+}
+
+MODULE_FIRMWARE("arm/mali/arch10.8/mali_csffw.bin");
diff --git a/drivers/gpu/drm/panthor/panthor_fw.h b/drivers/gpu/drm/panthor/panthor_fw.h
new file mode 100644
index 0000000000000..22448abde9923
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_fw.h
@@ -0,0 +1,503 @@
+/* SPDX-License-Identifier: GPL-2.0 or MIT */
+/* Copyright 2023 Collabora ltd. */
+
+#ifndef __PANTHOR_MCU_H__
+#define __PANTHOR_MCU_H__
+
+#include <linux/types.h>
+
+struct panthor_device;
+struct panthor_kernel_bo;
+
+#define MAX_CSGS				31
+#define MAX_CS_PER_CSG                          32
+
+struct panthor_fw_ringbuf_input_iface {
+	u64 insert;
+	u64 extract;
+};
+
+struct panthor_fw_ringbuf_output_iface {
+	u64 extract;
+	u32 active;
+};
+
+struct panthor_fw_cs_control_iface {
+#define CS_FEATURES_WORK_REGS(x)		(((x) & GENMASK(7, 0)) + 1)
+#define CS_FEATURES_SCOREBOARDS(x)		(((x) & GENMASK(15, 8)) >> 8)
+#define CS_FEATURES_COMPUTE			BIT(16)
+#define CS_FEATURES_FRAGMENT			BIT(17)
+#define CS_FEATURES_TILER			BIT(18)
+	u32 features;
+	u32 input_va;
+	u32 output_va;
+};
+
+struct panthor_fw_cs_input_iface {
+#define CS_STATE_MASK				GENMASK(2, 0)
+#define CS_STATE_STOP				0
+#define CS_STATE_START				1
+#define CS_EXTRACT_EVENT			BIT(4)
+#define CS_IDLE_SYNC_WAIT			BIT(8)
+#define CS_IDLE_PROTM_PENDING			BIT(9)
+#define CS_IDLE_EMPTY				BIT(10)
+#define CS_IDLE_RESOURCE_REQ			BIT(11)
+#define CS_TILER_OOM				BIT(26)
+#define CS_PROTM_PENDING			BIT(27)
+#define CS_FATAL				BIT(30)
+#define CS_FAULT				BIT(31)
+#define CS_REQ_MASK				(CS_STATE_MASK | \
+						 CS_EXTRACT_EVENT | \
+						 CS_IDLE_SYNC_WAIT | \
+						 CS_IDLE_PROTM_PENDING | \
+						 CS_IDLE_EMPTY | \
+						 CS_IDLE_RESOURCE_REQ)
+#define CS_EVT_MASK				(CS_TILER_OOM | \
+						 CS_PROTM_PENDING | \
+						 CS_FATAL | \
+						 CS_FAULT)
+	u32 req;
+
+#define CS_CONFIG_PRIORITY(x)			((x) & GENMASK(3, 0))
+#define CS_CONFIG_DOORBELL(x)			(((x) << 8) & GENMASK(15, 8))
+	u32 config;
+	u32 reserved1;
+	u32 ack_irq_mask;
+	u64 ringbuf_base;
+	u32 ringbuf_size;
+	u32 reserved2;
+	u64 heap_start;
+	u64 heap_end;
+	u64 ringbuf_input;
+	u64 ringbuf_output;
+	u32 instr_config;
+	u32 instrbuf_size;
+	u64 instrbuf_base;
+	u64 instrbuf_offset_ptr;
+};
+
+struct panthor_fw_cs_output_iface {
+	u32 ack;
+	u32 reserved1[15];
+	u64 status_cmd_ptr;
+
+#define CS_STATUS_WAIT_SB_MASK			GENMASK(15, 0)
+#define CS_STATUS_WAIT_SB_SRC_MASK		GENMASK(19, 16)
+#define CS_STATUS_WAIT_SB_SRC_NONE		(0 << 16)
+#define CS_STATUS_WAIT_SB_SRC_WAIT		(8 << 16)
+#define CS_STATUS_WAIT_SYNC_COND_LE		(0 << 24)
+#define CS_STATUS_WAIT_SYNC_COND_GT		(1 << 24)
+#define CS_STATUS_WAIT_SYNC_COND_MASK		GENMASK(27, 24)
+#define CS_STATUS_WAIT_PROGRESS			BIT(28)
+#define CS_STATUS_WAIT_PROTM			BIT(29)
+#define CS_STATUS_WAIT_SYNC_64B			BIT(30)
+#define CS_STATUS_WAIT_SYNC			BIT(31)
+	u32 status_wait;
+	u32 status_req_resource;
+	u64 status_wait_sync_ptr;
+	u32 status_wait_sync_value;
+	u32 status_scoreboards;
+
+#define CS_STATUS_BLOCKED_REASON_UNBLOCKED	0
+#define CS_STATUS_BLOCKED_REASON_SB_WAIT	1
+#define CS_STATUS_BLOCKED_REASON_PROGRESS_WAIT	2
+#define CS_STATUS_BLOCKED_REASON_SYNC_WAIT	3
+#define CS_STATUS_BLOCKED_REASON_DEFERRED	5
+#define CS_STATUS_BLOCKED_REASON_RES		6
+#define CS_STATUS_BLOCKED_REASON_FLUSH		7
+#define CS_STATUS_BLOCKED_REASON_MASK		GENMASK(3, 0)
+	u32 status_blocked_reason;
+	u32 status_wait_sync_value_hi;
+	u32 reserved2[6];
+
+#define CS_EXCEPTION_TYPE(x)			((x) & GENMASK(7, 0))
+#define CS_EXCEPTION_DATA(x)			(((x) >> 8) & GENMASK(23, 0))
+	u32 fault;
+	u32 fatal;
+	u64 fault_info;
+	u64 fatal_info;
+	u32 reserved3[10];
+	u32 heap_vt_start;
+	u32 heap_vt_end;
+	u32 reserved4;
+	u32 heap_frag_end;
+	u64 heap_address;
+};
+
+struct panthor_fw_csg_control_iface {
+	u32 features;
+	u32 input_va;
+	u32 output_va;
+	u32 suspend_size;
+	u32 protm_suspend_size;
+	u32 stream_num;
+	u32 stream_stride;
+};
+
+struct panthor_fw_csg_input_iface {
+#define CSG_STATE_MASK				GENMASK(2, 0)
+#define CSG_STATE_TERMINATE			0
+#define CSG_STATE_START				1
+#define CSG_STATE_SUSPEND			2
+#define CSG_STATE_RESUME			3
+#define CSG_ENDPOINT_CONFIG			BIT(4)
+#define CSG_STATUS_UPDATE			BIT(5)
+#define CSG_SYNC_UPDATE				BIT(28)
+#define CSG_IDLE				BIT(29)
+#define CSG_DOORBELL				BIT(30)
+#define CSG_PROGRESS_TIMER_EVENT		BIT(31)
+#define CSG_REQ_MASK				(CSG_STATE_MASK | \
+						 CSG_ENDPOINT_CONFIG | \
+						 CSG_STATUS_UPDATE)
+#define CSG_EVT_MASK				(CSG_SYNC_UPDATE | \
+						 CSG_IDLE | \
+						 CSG_PROGRESS_TIMER_EVENT)
+	u32 req;
+	u32 ack_irq_mask;
+
+	u32 doorbell_req;
+	u32 cs_irq_ack;
+	u32 reserved1[4];
+	u64 allow_compute;
+	u64 allow_fragment;
+	u32 allow_other;
+
+#define CSG_EP_REQ_COMPUTE(x)			((x) & GENMASK(7, 0))
+#define CSG_EP_REQ_FRAGMENT(x)			(((x) << 8) & GENMASK(15, 8))
+#define CSG_EP_REQ_TILER(x)			(((x) << 16) & GENMASK(19, 16))
+#define CSG_EP_REQ_EXCL_COMPUTE			BIT(20)
+#define CSG_EP_REQ_EXCL_FRAGMENT		BIT(21)
+#define CSG_EP_REQ_PRIORITY(x)			(((x) << 28) & GENMASK(31, 28))
+#define CSG_EP_REQ_PRIORITY_MASK		GENMASK(31, 28)
+	u32 endpoint_req;
+	u32 reserved2[2];
+	u64 suspend_buf;
+	u64 protm_suspend_buf;
+	u32 config;
+	u32 iter_trace_config;
+};
+
+struct panthor_fw_csg_output_iface {
+	u32 ack;
+	u32 reserved1;
+	u32 doorbell_ack;
+	u32 cs_irq_req;
+	u32 status_endpoint_current;
+	u32 status_endpoint_req;
+
+#define CSG_STATUS_STATE_IS_IDLE		BIT(0)
+	u32 status_state;
+	u32 resource_dep;
+};
+
+struct panthor_fw_global_control_iface {
+	u32 version;
+	u32 features;
+	u32 input_va;
+	u32 output_va;
+	u32 group_num;
+	u32 group_stride;
+	u32 perfcnt_size;
+	u32 instr_features;
+};
+
+struct panthor_fw_global_input_iface {
+#define GLB_HALT				BIT(0)
+#define GLB_CFG_PROGRESS_TIMER			BIT(1)
+#define GLB_CFG_ALLOC_EN			BIT(2)
+#define GLB_CFG_POWEROFF_TIMER			BIT(3)
+#define GLB_PROTM_ENTER				BIT(4)
+#define GLB_PERFCNT_EN				BIT(5)
+#define GLB_PERFCNT_SAMPLE			BIT(6)
+#define GLB_COUNTER_EN				BIT(7)
+#define GLB_PING				BIT(8)
+#define GLB_FWCFG_UPDATE			BIT(9)
+#define GLB_IDLE_EN				BIT(10)
+#define GLB_SLEEP				BIT(12)
+#define GLB_INACTIVE_COMPUTE			BIT(20)
+#define GLB_INACTIVE_FRAGMENT			BIT(21)
+#define GLB_INACTIVE_TILER			BIT(22)
+#define GLB_PROTM_EXIT				BIT(23)
+#define GLB_PERFCNT_THRESHOLD			BIT(24)
+#define GLB_PERFCNT_OVERFLOW			BIT(25)
+#define GLB_IDLE				BIT(26)
+#define GLB_DBG_CSF				BIT(30)
+#define GLB_DBG_HOST				BIT(31)
+#define GLB_REQ_MASK				GENMASK(10, 0)
+#define GLB_EVT_MASK				GENMASK(26, 20)
+	u32 req;
+	u32 ack_irq_mask;
+	u32 doorbell_req;
+	u32 reserved1;
+	u32 progress_timer;
+
+#define GLB_TIMER_VAL(x)			((x) & GENMASK(30, 0))
+#define GLB_TIMER_SOURCE_GPU_COUNTER		BIT(31)
+	u32 poweroff_timer;
+	u64 core_en_mask;
+	u32 reserved2;
+	u32 perfcnt_as;
+	u64 perfcnt_base;
+	u32 perfcnt_extract;
+	u32 reserved3[3];
+	u32 perfcnt_config;
+	u32 perfcnt_csg_select;
+	u32 perfcnt_fw_enable;
+	u32 perfcnt_csg_enable;
+	u32 perfcnt_csf_enable;
+	u32 perfcnt_shader_enable;
+	u32 perfcnt_tiler_enable;
+	u32 perfcnt_mmu_l2_enable;
+	u32 reserved4[8];
+	u32 idle_timer;
+};
+
+enum panthor_fw_halt_status {
+	PANTHOR_FW_HALT_OK = 0,
+	PANTHOR_FW_HALT_ON_PANIC = 0x4e,
+	PANTHOR_FW_HALT_ON_WATCHDOG_EXPIRATION = 0x4f,
+};
+
+struct panthor_fw_global_output_iface {
+	u32 ack;
+	u32 reserved1;
+	u32 doorbell_ack;
+	u32 reserved2;
+	u32 halt_status;
+	u32 perfcnt_status;
+	u32 perfcnt_insert;
+};
+
+/**
+ * struct panthor_fw_cs_iface - Firmware command stream slot interface
+ */
+struct panthor_fw_cs_iface {
+	/**
+	 * @lock: Lock protecting access to the panthor_fw_cs_input_iface::req
+	 * field.
+	 *
+	 * Needed so we can update the req field concurrently from the interrupt
+	 * handler and the scheduler logic.
+	 *
+	 * TODO: Ideally we'd want to use a cmpxchg() to update the req, but FW
+	 * interface sections are mapped uncached/write-combined right now, and
+	 * using cmpxchg() on such mappings leads to SError faults. Revisit when
+	 * we have 'SHARED' GPU mappings hooked up.
+	 */
+	spinlock_t lock;
+
+	/**
+	 * @control: Command stream slot control interface.
+	 *
+	 * Used to expose command stream slot properties.
+	 *
+	 * This interface is read-only.
+	 */
+	struct panthor_fw_cs_control_iface *control;
+
+	/**
+	 * @input: Command stream slot input interface.
+	 *
+	 * Used for host updates/events.
+	 */
+	struct panthor_fw_cs_input_iface *input;
+
+	/**
+	 * @output: Command stream slot output interface.
+	 *
+	 * Used for FW updates/events.
+	 *
+	 * This interface is read-only.
+	 */
+	const struct panthor_fw_cs_output_iface *output;
+};
+
+/**
+ * struct panthor_fw_csg_iface - Firmware command stream group slot interface
+ */
+struct panthor_fw_csg_iface {
+	/**
+	 * @lock: Lock protecting access to the panthor_fw_csg_input_iface::req
+	 * field.
+	 *
+	 * Needed so we can update the req field concurrently from the interrupt
+	 * handler and the scheduler logic.
+	 *
+	 * TODO: Ideally we'd want to use a cmpxchg() to update the req, but FW
+	 * interface sections are mapped uncached/write-combined right now, and
+	 * using cmpxchg() on such mappings leads to SError faults. Revisit when
+	 * we have 'SHARED' GPU mappings hooked up.
+	 */
+	spinlock_t lock;
+
+	/**
+	 * @control: Command stream group slot control interface.
+	 *
+	 * Used to expose command stream group slot properties.
+	 *
+	 * This interface is read-only.
+	 */
+	const struct panthor_fw_csg_control_iface *control;
+
+	/**
+	 * @input: Command stream slot input interface.
+	 *
+	 * Used for host updates/events.
+	 */
+	struct panthor_fw_csg_input_iface *input;
+
+	/**
+	 * @output: Command stream group slot output interface.
+	 *
+	 * Used for FW updates/events.
+	 *
+	 * This interface is read-only.
+	 */
+	const struct panthor_fw_csg_output_iface *output;
+};
+
+/**
+ * struct panthor_fw_global_iface - Firmware global interface
+ */
+struct panthor_fw_global_iface {
+	/**
+	 * @lock: Lock protecting access to the panthor_fw_global_input_iface::req
+	 * field.
+	 *
+	 * Needed so we can update the req field concurrently from the interrupt
+	 * handler and the scheduler/FW management logic.
+	 *
+	 * TODO: Ideally we'd want to use a cmpxchg() to update the req, but FW
+	 * interface sections are mapped uncached/write-combined right now, and
+	 * using cmpxchg() on such mappings leads to SError faults. Revisit when
+	 * we have 'SHARED' GPU mappings hooked up.
+	 */
+	spinlock_t lock;
+
+	/**
+	 * @control: Command stream group slot control interface.
+	 *
+	 * Used to expose global FW properties.
+	 *
+	 * This interface is read-only.
+	 */
+	const struct panthor_fw_global_control_iface *control;
+
+	/**
+	 * @input: Global input interface.
+	 *
+	 * Used for host updates/events.
+	 */
+	struct panthor_fw_global_input_iface *input;
+
+	/**
+	 * @output: Global output interface.
+	 *
+	 * Used for FW updates/events.
+	 *
+	 * This interface is read-only.
+	 */
+	const struct panthor_fw_global_output_iface *output;
+};
+
+/**
+ * panthor_fw_toggle_reqs() - Toggle acknowledge bits to send an event to the FW
+ * @__iface: The interface to operate on.
+ * @__in_reg: Name of the register to update in the input section of the interface.
+ * @__out_reg: Name of the register to take as a reference in the output section of the
+ * interface.
+ * @__mask: Mask to apply to the update.
+ *
+ * The Host -> FW event/message passing was designed to be lockless, with each side of
+ * the channel having its writeable section. Events are signaled as a difference between
+ * the host and FW side in the req/ack registers (when a bit differs, there's an event
+ * pending, when they are the same, nothing needs attention).
+ *
+ * This helper allows one to update the req register based on the current value of the
+ * ack register managed by the FW. Toggling a specific bit will flag an event. In order
+ * for events to be re-evaluated, the interface doorbell needs to be rung.
+ *
+ * Concurrent accesses to the same req register is covered.
+ *
+ * Anything requiring atomic updates to multiple registers requires a dedicated lock.
+ */
+#define panthor_fw_toggle_reqs(__iface, __in_reg, __out_reg, __mask) \
+	do { \
+		u32 __cur_val, __new_val, __out_val; \
+		spin_lock(&(__iface)->lock); \
+		__cur_val = READ_ONCE((__iface)->input->__in_reg); \
+		__out_val = READ_ONCE((__iface)->output->__out_reg); \
+		__new_val = ((__out_val ^ (__mask)) & (__mask)) | (__cur_val & ~(__mask)); \
+		WRITE_ONCE((__iface)->input->__in_reg, __new_val); \
+		spin_unlock(&(__iface)->lock); \
+	} while (0)
+
+/**
+ * panthor_fw_update_reqs() - Update bits to reflect a configuration change
+ * @__iface: The interface to operate on.
+ * @__in_reg: Name of the register to update in the input section of the interface.
+ * @__val: Value to set.
+ * @__mask: Mask to apply to the update.
+ *
+ * Some configuration get passed through req registers that are also used to
+ * send events to the FW. Those req registers being updated from the interrupt
+ * handler, they require special helpers to update the configuration part as well.
+ *
+ * Concurrent accesses to the same req register is covered.
+ *
+ * Anything requiring atomic updates to multiple registers requires a dedicated lock.
+ */
+#define panthor_fw_update_reqs(__iface, __in_reg, __val, __mask) \
+	do { \
+		u32 __cur_val, __new_val; \
+		spin_lock(&(__iface)->lock); \
+		__cur_val = READ_ONCE((__iface)->input->__in_reg); \
+		__new_val = (__cur_val & ~(__mask)) | ((__val) & (__mask)); \
+		WRITE_ONCE((__iface)->input->__in_reg, __new_val); \
+		spin_unlock(&(__iface)->lock); \
+	} while (0)
+
+struct panthor_fw_global_iface *
+panthor_fw_get_glb_iface(struct panthor_device *ptdev);
+
+struct panthor_fw_csg_iface *
+panthor_fw_get_csg_iface(struct panthor_device *ptdev, u32 csg_slot);
+
+struct panthor_fw_cs_iface *
+panthor_fw_get_cs_iface(struct panthor_device *ptdev, u32 csg_slot, u32 cs_slot);
+
+int panthor_fw_csg_wait_acks(struct panthor_device *ptdev, u32 csg_id, u32 req_mask,
+			     u32 *acked, u32 timeout_ms);
+
+int panthor_fw_glb_wait_acks(struct panthor_device *ptdev, u32 req_mask, u32 *acked,
+			     u32 timeout_ms);
+
+void panthor_fw_ring_csg_doorbells(struct panthor_device *ptdev, u32 csg_slot);
+
+struct panthor_kernel_bo *
+panthor_fw_alloc_queue_iface_mem(struct panthor_device *ptdev,
+				 struct panthor_fw_ringbuf_input_iface **input,
+				 const struct panthor_fw_ringbuf_output_iface **output,
+				 u32 *input_fw_va, u32 *output_fw_va);
+struct panthor_kernel_bo *
+panthor_fw_alloc_suspend_buf_mem(struct panthor_device *ptdev, size_t size);
+
+struct panthor_vm *panthor_fw_vm(struct panthor_device *ptdev);
+
+void panthor_fw_pre_reset(struct panthor_device *ptdev, bool on_hang);
+int panthor_fw_post_reset(struct panthor_device *ptdev);
+
+static inline void panthor_fw_suspend(struct panthor_device *ptdev)
+{
+	panthor_fw_pre_reset(ptdev, false);
+}
+
+static inline int panthor_fw_resume(struct panthor_device *ptdev)
+{
+	return panthor_fw_post_reset(ptdev);
+}
+
+int panthor_fw_init(struct panthor_device *ptdev);
+void panthor_fw_unplug(struct panthor_device *ptdev);
+
+#endif
diff --git a/drivers/gpu/drm/panthor/panthor_gem.c b/drivers/gpu/drm/panthor/panthor_gem.c
new file mode 100644
index 0000000000000..d6483266d0c20
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_gem.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+/* Copyright 2019 Linaro, Ltd, Rob Herring <robh@kernel.org> */
+/* Copyright 2023 Collabora ltd. */
+
+#include <linux/dma-buf.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+#include <drm/panthor_drm.h>
+
+#include "panthor_device.h"
+#include "panthor_gem.h"
+#include "panthor_mmu.h"
+
+static void panthor_gem_free_object(struct drm_gem_object *obj)
+{
+	struct panthor_gem_object *bo = to_panthor_bo(obj);
+	struct drm_gem_object *vm_root_gem = bo->exclusive_vm_root_gem;
+
+	drm_gem_free_mmap_offset(&bo->base.base);
+	mutex_destroy(&bo->gpuva_list_lock);
+	drm_gem_shmem_free(&bo->base);
+	drm_gem_object_put(vm_root_gem);
+}
+
+/**
+ * panthor_kernel_bo_destroy() - Destroy a kernel buffer object
+ * @vm: The VM this BO was mapped to.
+ * @bo: Kernel buffer object to destroy. If NULL or an ERR_PTR(), the destruction
+ * is skipped.
+ */
+void panthor_kernel_bo_destroy(struct panthor_vm *vm,
+			       struct panthor_kernel_bo *bo)
+{
+	int ret;
+
+	if (IS_ERR_OR_NULL(bo))
+		return;
+
+	panthor_kernel_bo_vunmap(bo);
+
+	if (drm_WARN_ON(bo->obj->dev,
+			to_panthor_bo(bo->obj)->exclusive_vm_root_gem != panthor_vm_root_gem(vm)))
+		goto out_free_bo;
+
+	ret = panthor_vm_unmap_range(vm, bo->va_node.start,
+				     panthor_kernel_bo_size(bo));
+	if (ret)
+		goto out_free_bo;
+
+	panthor_vm_free_va(vm, &bo->va_node);
+	drm_gem_object_put(bo->obj);
+
+out_free_bo:
+	kfree(bo);
+}
+
+/**
+ * panthor_kernel_bo_create() - Create and map a GEM object to a VM
+ * @ptdev: Device.
+ * @vm: VM to map the GEM to. If NULL, the kernel object is not GPU mapped.
+ * @size: Size of the buffer object.
+ * @bo_flags: Combination of drm_panthor_bo_flags flags.
+ * @vm_map_flags: Combination of drm_panthor_vm_bind_op_flags (only those
+ * that are related to map operations).
+ * @gpu_va: GPU address assigned when mapping to the VM.
+ * If gpu_va == PANTHOR_VM_KERNEL_AUTO_VA, the virtual address will be
+ * automatically allocated.
+ *
+ * Return: A valid pointer in case of success, an ERR_PTR() otherwise.
+ */
+struct panthor_kernel_bo *
+panthor_kernel_bo_create(struct panthor_device *ptdev, struct panthor_vm *vm,
+			 size_t size, u32 bo_flags, u32 vm_map_flags,
+			 u64 gpu_va)
+{
+	struct drm_gem_shmem_object *obj;
+	struct panthor_kernel_bo *kbo;
+	struct panthor_gem_object *bo;
+	int ret;
+
+	if (drm_WARN_ON(&ptdev->base, !vm))
+		return ERR_PTR(-EINVAL);
+
+	kbo = kzalloc(sizeof(*kbo), GFP_KERNEL);
+	if (!kbo)
+		return ERR_PTR(-ENOMEM);
+
+	obj = drm_gem_shmem_create(&ptdev->base, size);
+	if (IS_ERR(obj)) {
+		ret = PTR_ERR(obj);
+		goto err_free_bo;
+	}
+
+	bo = to_panthor_bo(&obj->base);
+	size = obj->base.size;
+	kbo->obj = &obj->base;
+	bo->flags = bo_flags;
+
+	ret = panthor_vm_alloc_va(vm, gpu_va, size, &kbo->va_node);
+	if (ret)
+		goto err_put_obj;
+
+	ret = panthor_vm_map_bo_range(vm, bo, 0, size, kbo->va_node.start, vm_map_flags);
+	if (ret)
+		goto err_free_va;
+
+	bo->exclusive_vm_root_gem = panthor_vm_root_gem(vm);
+	drm_gem_object_get(bo->exclusive_vm_root_gem);
+	bo->base.base.resv = bo->exclusive_vm_root_gem->resv;
+	return kbo;
+
+err_free_va:
+	panthor_vm_free_va(vm, &kbo->va_node);
+
+err_put_obj:
+	drm_gem_object_put(&obj->base);
+
+err_free_bo:
+	kfree(kbo);
+	return ERR_PTR(ret);
+}
+
+static int panthor_gem_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma)
+{
+	struct panthor_gem_object *bo = to_panthor_bo(obj);
+
+	/* Don't allow mmap on objects that have the NO_MMAP flag set. */
+	if (bo->flags & DRM_PANTHOR_BO_NO_MMAP)
+		return -EINVAL;
+
+	return drm_gem_shmem_object_mmap(obj, vma);
+}
+
+static struct dma_buf *
+panthor_gem_prime_export(struct drm_gem_object *obj, int flags)
+{
+	/* We can't export GEMs that have an exclusive VM. */
+	if (to_panthor_bo(obj)->exclusive_vm_root_gem)
+		return ERR_PTR(-EINVAL);
+
+	return drm_gem_prime_export(obj, flags);
+}
+
+static const struct drm_gem_object_funcs panthor_gem_funcs = {
+	.free = panthor_gem_free_object,
+	.print_info = drm_gem_shmem_object_print_info,
+	.pin = drm_gem_shmem_object_pin,
+	.unpin = drm_gem_shmem_object_unpin,
+	.get_sg_table = drm_gem_shmem_object_get_sg_table,
+	.vmap = drm_gem_shmem_object_vmap,
+	.vunmap = drm_gem_shmem_object_vunmap,
+	.mmap = panthor_gem_mmap,
+	.export = panthor_gem_prime_export,
+	.vm_ops = &drm_gem_shmem_vm_ops,
+};
+
+/**
+ * panthor_gem_create_object - Implementation of driver->gem_create_object.
+ * @ddev: DRM device
+ * @size: Size in bytes of the memory the object will reference
+ *
+ * This lets the GEM helpers allocate object structs for us, and keep
+ * our BO stats correct.
+ */
+struct drm_gem_object *panthor_gem_create_object(struct drm_device *ddev, size_t size)
+{
+	struct panthor_device *ptdev = container_of(ddev, struct panthor_device, base);
+	struct panthor_gem_object *obj;
+
+	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+	if (!obj)
+		return ERR_PTR(-ENOMEM);
+
+	obj->base.base.funcs = &panthor_gem_funcs;
+	obj->base.map_wc = !ptdev->coherent;
+	mutex_init(&obj->gpuva_list_lock);
+	drm_gem_gpuva_set_lock(&obj->base.base, &obj->gpuva_list_lock);
+
+	return &obj->base.base;
+}
+
+/**
+ * panthor_gem_create_with_handle() - Create a GEM object and attach it to a handle.
+ * @file: DRM file.
+ * @ddev: DRM device.
+ * @exclusive_vm: Exclusive VM. Not NULL if the GEM object can't be shared.
+ * @size: Size of the GEM object to allocate.
+ * @flags: Combination of drm_panthor_bo_flags flags.
+ * @handle: Pointer holding the handle pointing to the new GEM object.
+ *
+ * Return: Zero on success
+ */
+int
+panthor_gem_create_with_handle(struct drm_file *file,
+			       struct drm_device *ddev,
+			       struct panthor_vm *exclusive_vm,
+			       u64 *size, u32 flags, u32 *handle)
+{
+	int ret;
+	struct drm_gem_shmem_object *shmem;
+	struct panthor_gem_object *bo;
+
+	shmem = drm_gem_shmem_create(ddev, *size);
+	if (IS_ERR(shmem))
+		return PTR_ERR(shmem);
+
+	bo = to_panthor_bo(&shmem->base);
+	bo->flags = flags;
+
+	if (exclusive_vm) {
+		bo->exclusive_vm_root_gem = panthor_vm_root_gem(exclusive_vm);
+		drm_gem_object_get(bo->exclusive_vm_root_gem);
+		bo->base.base.resv = bo->exclusive_vm_root_gem->resv;
+	}
+
+	/*
+	 * Allocate an id of idr table where the obj is registered
+	 * and handle has the id what user can see.
+	 */
+	ret = drm_gem_handle_create(file, &shmem->base, handle);
+	if (!ret)
+		*size = bo->base.base.size;
+
+	/* drop reference from allocate - handle holds it now. */
+	drm_gem_object_put(&shmem->base);
+
+	return ret;
+}
diff --git a/drivers/gpu/drm/panthor/panthor_gem.h b/drivers/gpu/drm/panthor/panthor_gem.h
new file mode 100644
index 0000000000000..3bccba394d00a
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_gem.h
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0 or MIT */
+/* Copyright 2019 Linaro, Ltd, Rob Herring <robh@kernel.org> */
+/* Copyright 2023 Collabora ltd. */
+
+#ifndef __PANTHOR_GEM_H__
+#define __PANTHOR_GEM_H__
+
+#include <drm/drm_gem_shmem_helper.h>
+#include <drm/drm_mm.h>
+
+#include <linux/iosys-map.h>
+#include <linux/rwsem.h>
+
+struct panthor_vm;
+
+/**
+ * struct panthor_gem_object - Driver specific GEM object.
+ */
+struct panthor_gem_object {
+	/** @base: Inherit from drm_gem_shmem_object. */
+	struct drm_gem_shmem_object base;
+
+	/**
+	 * @exclusive_vm_root_gem: Root GEM of the exclusive VM this GEM object
+	 * is attached to.
+	 *
+	 * If @exclusive_vm_root_gem != NULL, any attempt to bind the GEM to a
+	 * different VM will fail.
+	 *
+	 * All FW memory objects have this field set to the root GEM of the MCU
+	 * VM.
+	 */
+	struct drm_gem_object *exclusive_vm_root_gem;
+
+	/**
+	 * @gpuva_list_lock: Custom GPUVA lock.
+	 *
+	 * Used to protect insertion of drm_gpuva elements to the
+	 * drm_gem_object.gpuva.list list.
+	 *
+	 * We can't use the GEM resv for that, because drm_gpuva_link() is
+	 * called in a dma-signaling path, where we're not allowed to take
+	 * resv locks.
+	 */
+	struct mutex gpuva_list_lock;
+
+	/** @flags: Combination of drm_panthor_bo_flags flags. */
+	u32 flags;
+};
+
+/**
+ * struct panthor_kernel_bo - Kernel buffer object.
+ *
+ * These objects are only manipulated by the kernel driver and not
+ * directly exposed to the userspace. The GPU address of a kernel
+ * BO might be passed to userspace though.
+ */
+struct panthor_kernel_bo {
+	/**
+	 * @obj: The GEM object backing this kernel buffer object.
+	 */
+	struct drm_gem_object *obj;
+
+	/**
+	 * @va_node: VA space allocated to this GEM.
+	 */
+	struct drm_mm_node va_node;
+
+	/**
+	 * @kmap: Kernel CPU mapping of @gem.
+	 */
+	void *kmap;
+};
+
+static inline
+struct panthor_gem_object *to_panthor_bo(struct drm_gem_object *obj)
+{
+	return container_of(to_drm_gem_shmem_obj(obj), struct panthor_gem_object, base);
+}
+
+struct drm_gem_object *panthor_gem_create_object(struct drm_device *ddev, size_t size);
+
+struct drm_gem_object *
+panthor_gem_prime_import_sg_table(struct drm_device *ddev,
+				  struct dma_buf_attachment *attach,
+				  struct sg_table *sgt);
+
+int
+panthor_gem_create_with_handle(struct drm_file *file,
+			       struct drm_device *ddev,
+			       struct panthor_vm *exclusive_vm,
+			       u64 *size, u32 flags, uint32_t *handle);
+
+static inline u64
+panthor_kernel_bo_gpuva(struct panthor_kernel_bo *bo)
+{
+	return bo->va_node.start;
+}
+
+static inline size_t
+panthor_kernel_bo_size(struct panthor_kernel_bo *bo)
+{
+	return bo->obj->size;
+}
+
+static inline int
+panthor_kernel_bo_vmap(struct panthor_kernel_bo *bo)
+{
+	struct iosys_map map;
+	int ret;
+
+	if (bo->kmap)
+		return 0;
+
+	ret = drm_gem_vmap_unlocked(bo->obj, &map);
+	if (ret)
+		return ret;
+
+	bo->kmap = map.vaddr;
+	return 0;
+}
+
+static inline void
+panthor_kernel_bo_vunmap(struct panthor_kernel_bo *bo)
+{
+	if (bo->kmap) {
+		struct iosys_map map = IOSYS_MAP_INIT_VADDR(bo->kmap);
+
+		drm_gem_vunmap_unlocked(bo->obj, &map);
+		bo->kmap = NULL;
+	}
+}
+
+struct panthor_kernel_bo *
+panthor_kernel_bo_create(struct panthor_device *ptdev, struct panthor_vm *vm,
+			 size_t size, u32 bo_flags, u32 vm_map_flags,
+			 u64 gpu_va);
+
+void panthor_kernel_bo_destroy(struct panthor_vm *vm,
+			       struct panthor_kernel_bo *bo);
+
+#endif /* __PANTHOR_GEM_H__ */
diff --git a/drivers/gpu/drm/panthor/panthor_gpu.c b/drivers/gpu/drm/panthor/panthor_gpu.c
new file mode 100644
index 0000000000000..6dbbc4cfbe7e6
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_gpu.c
@@ -0,0 +1,482 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+/* Copyright 2018 Marty E. Plummer <hanetzer@startmail.com> */
+/* Copyright 2019 Linaro, Ltd., Rob Herring <robh@kernel.org> */
+/* Copyright 2019 Collabora ltd. */
+
+#include <linux/bitfield.h>
+#include <linux/bitmap.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/iopoll.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+
+#include <drm/drm_drv.h>
+#include <drm/drm_managed.h>
+
+#include "panthor_device.h"
+#include "panthor_gpu.h"
+#include "panthor_regs.h"
+
+/**
+ * struct panthor_gpu - GPU block management data.
+ */
+struct panthor_gpu {
+	/** @irq: GPU irq. */
+	struct panthor_irq irq;
+
+	/** @reqs_lock: Lock protecting access to pending_reqs. */
+	spinlock_t reqs_lock;
+
+	/** @pending_reqs: Pending GPU requests. */
+	u32 pending_reqs;
+
+	/** @reqs_acked: GPU request wait queue. */
+	wait_queue_head_t reqs_acked;
+};
+
+/**
+ * struct panthor_model - GPU model description
+ */
+struct panthor_model {
+	/** @name: Model name. */
+	const char *name;
+
+	/** @arch_major: Major version number of architecture. */
+	u8 arch_major;
+
+	/** @product_major: Major version number of product. */
+	u8 product_major;
+};
+
+/**
+ * GPU_MODEL() - Define a GPU model. A GPU product can be uniquely identified
+ * by a combination of the major architecture version and the major product
+ * version.
+ * @_name: Name for the GPU model.
+ * @_arch_major: Architecture major.
+ * @_product_major: Product major.
+ */
+#define GPU_MODEL(_name, _arch_major, _product_major) \
+{\
+	.name = __stringify(_name),				\
+	.arch_major = _arch_major,				\
+	.product_major = _product_major,			\
+}
+
+static const struct panthor_model gpu_models[] = {
+	GPU_MODEL(g610, 10, 7),
+	{},
+};
+
+#define GPU_INTERRUPTS_MASK	\
+	(GPU_IRQ_FAULT | \
+	 GPU_IRQ_PROTM_FAULT | \
+	 GPU_IRQ_RESET_COMPLETED | \
+	 GPU_IRQ_CLEAN_CACHES_COMPLETED)
+
+static void panthor_gpu_init_info(struct panthor_device *ptdev)
+{
+	const struct panthor_model *model;
+	u32 arch_major, product_major;
+	u32 major, minor, status;
+	unsigned int i;
+
+	ptdev->gpu_info.gpu_id = gpu_read(ptdev, GPU_ID);
+	ptdev->gpu_info.csf_id = gpu_read(ptdev, GPU_CSF_ID);
+	ptdev->gpu_info.gpu_rev = gpu_read(ptdev, GPU_REVID);
+	ptdev->gpu_info.core_features = gpu_read(ptdev, GPU_CORE_FEATURES);
+	ptdev->gpu_info.l2_features = gpu_read(ptdev, GPU_L2_FEATURES);
+	ptdev->gpu_info.tiler_features = gpu_read(ptdev, GPU_TILER_FEATURES);
+	ptdev->gpu_info.mem_features = gpu_read(ptdev, GPU_MEM_FEATURES);
+	ptdev->gpu_info.mmu_features = gpu_read(ptdev, GPU_MMU_FEATURES);
+	ptdev->gpu_info.thread_features = gpu_read(ptdev, GPU_THREAD_FEATURES);
+	ptdev->gpu_info.max_threads = gpu_read(ptdev, GPU_THREAD_MAX_THREADS);
+	ptdev->gpu_info.thread_max_workgroup_size = gpu_read(ptdev, GPU_THREAD_MAX_WORKGROUP_SIZE);
+	ptdev->gpu_info.thread_max_barrier_size = gpu_read(ptdev, GPU_THREAD_MAX_BARRIER_SIZE);
+	ptdev->gpu_info.coherency_features = gpu_read(ptdev, GPU_COHERENCY_FEATURES);
+	for (i = 0; i < 4; i++)
+		ptdev->gpu_info.texture_features[i] = gpu_read(ptdev, GPU_TEXTURE_FEATURES(i));
+
+	ptdev->gpu_info.as_present = gpu_read(ptdev, GPU_AS_PRESENT);
+
+	ptdev->gpu_info.shader_present = gpu_read(ptdev, GPU_SHADER_PRESENT_LO);
+	ptdev->gpu_info.shader_present |= (u64)gpu_read(ptdev, GPU_SHADER_PRESENT_HI) << 32;
+
+	ptdev->gpu_info.tiler_present = gpu_read(ptdev, GPU_TILER_PRESENT_LO);
+	ptdev->gpu_info.tiler_present |= (u64)gpu_read(ptdev, GPU_TILER_PRESENT_HI) << 32;
+
+	ptdev->gpu_info.l2_present = gpu_read(ptdev, GPU_L2_PRESENT_LO);
+	ptdev->gpu_info.l2_present |= (u64)gpu_read(ptdev, GPU_L2_PRESENT_HI) << 32;
+
+	arch_major = GPU_ARCH_MAJOR(ptdev->gpu_info.gpu_id);
+	product_major = GPU_PROD_MAJOR(ptdev->gpu_info.gpu_id);
+	major = GPU_VER_MAJOR(ptdev->gpu_info.gpu_id);
+	minor = GPU_VER_MINOR(ptdev->gpu_info.gpu_id);
+	status = GPU_VER_STATUS(ptdev->gpu_info.gpu_id);
+
+	for (model = gpu_models; model->name; model++) {
+		if (model->arch_major == arch_major &&
+		    model->product_major == product_major)
+			break;
+	}
+
+	drm_info(&ptdev->base,
+		 "mali-%s id 0x%x major 0x%x minor 0x%x status 0x%x",
+		 model->name ?: "unknown", ptdev->gpu_info.gpu_id >> 16,
+		 major, minor, status);
+
+	drm_info(&ptdev->base,
+		 "Features: L2:%#x Tiler:%#x Mem:%#x MMU:%#x AS:%#x",
+		 ptdev->gpu_info.l2_features,
+		 ptdev->gpu_info.tiler_features,
+		 ptdev->gpu_info.mem_features,
+		 ptdev->gpu_info.mmu_features,
+		 ptdev->gpu_info.as_present);
+
+	drm_info(&ptdev->base,
+		 "shader_present=0x%0llx l2_present=0x%0llx tiler_present=0x%0llx",
+		 ptdev->gpu_info.shader_present, ptdev->gpu_info.l2_present,
+		 ptdev->gpu_info.tiler_present);
+}
+
+static void panthor_gpu_irq_handler(struct panthor_device *ptdev, u32 status)
+{
+	if (status & GPU_IRQ_FAULT) {
+		u32 fault_status = gpu_read(ptdev, GPU_FAULT_STATUS);
+		u64 address = ((u64)gpu_read(ptdev, GPU_FAULT_ADDR_HI) << 32) |
+			      gpu_read(ptdev, GPU_FAULT_ADDR_LO);
+
+		drm_warn(&ptdev->base, "GPU Fault 0x%08x (%s) at 0x%016llx\n",
+			 fault_status, panthor_exception_name(ptdev, fault_status & 0xFF),
+			 address);
+	}
+	if (status & GPU_IRQ_PROTM_FAULT)
+		drm_warn(&ptdev->base, "GPU Fault in protected mode\n");
+
+	spin_lock(&ptdev->gpu->reqs_lock);
+	if (status & ptdev->gpu->pending_reqs) {
+		ptdev->gpu->pending_reqs &= ~status;
+		wake_up_all(&ptdev->gpu->reqs_acked);
+	}
+	spin_unlock(&ptdev->gpu->reqs_lock);
+}
+PANTHOR_IRQ_HANDLER(gpu, GPU, panthor_gpu_irq_handler);
+
+/**
+ * panthor_gpu_unplug() - Called when the GPU is unplugged.
+ * @ptdev: Device to unplug.
+ */
+void panthor_gpu_unplug(struct panthor_device *ptdev)
+{
+	unsigned long flags;
+
+	/* Make sure the IRQ handler is not running after that point. */
+	panthor_gpu_irq_suspend(&ptdev->gpu->irq);
+
+	/* Wake-up all waiters. */
+	spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
+	ptdev->gpu->pending_reqs = 0;
+	wake_up_all(&ptdev->gpu->reqs_acked);
+	spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
+}
+
+/**
+ * panthor_gpu_init() - Initialize the GPU block
+ * @ptdev: Device.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int panthor_gpu_init(struct panthor_device *ptdev)
+{
+	struct panthor_gpu *gpu;
+	u32 pa_bits;
+	int ret, irq;
+
+	gpu = drmm_kzalloc(&ptdev->base, sizeof(*gpu), GFP_KERNEL);
+	if (!gpu)
+		return -ENOMEM;
+
+	spin_lock_init(&gpu->reqs_lock);
+	init_waitqueue_head(&gpu->reqs_acked);
+	ptdev->gpu = gpu;
+	panthor_gpu_init_info(ptdev);
+
+	dma_set_max_seg_size(ptdev->base.dev, UINT_MAX);
+	pa_bits = GPU_MMU_FEATURES_PA_BITS(ptdev->gpu_info.mmu_features);
+	ret = dma_set_mask_and_coherent(ptdev->base.dev, DMA_BIT_MASK(pa_bits));
+	if (ret)
+		return ret;
+
+	irq = platform_get_irq_byname(to_platform_device(ptdev->base.dev), "gpu");
+	if (irq <= 0)
+		return ret;
+
+	ret = panthor_request_gpu_irq(ptdev, &ptdev->gpu->irq, irq, GPU_INTERRUPTS_MASK);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+/**
+ * panthor_gpu_block_power_off() - Power-off a specific block of the GPU
+ * @ptdev: Device.
+ * @blk_name: Block name.
+ * @pwroff_reg: Power-off register for this block.
+ * @pwrtrans_reg: Power transition register for this block.
+ * @mask: Sub-elements to power-off.
+ * @timeout_us: Timeout in microseconds.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int panthor_gpu_block_power_off(struct panthor_device *ptdev,
+				const char *blk_name,
+				u32 pwroff_reg, u32 pwrtrans_reg,
+				u64 mask, u32 timeout_us)
+{
+	u32 val, i;
+	int ret;
+
+	for (i = 0; i < 2; i++) {
+		u32 mask32 = mask >> (i * 32);
+
+		if (!mask32)
+			continue;
+
+		ret = readl_relaxed_poll_timeout(ptdev->iomem + pwrtrans_reg + (i * 4),
+						 val, !(mask32 & val),
+						 100, timeout_us);
+		if (ret) {
+			drm_err(&ptdev->base, "timeout waiting on %s:%llx power transition",
+				blk_name, mask);
+			return ret;
+		}
+	}
+
+	if (mask & GENMASK(31, 0))
+		gpu_write(ptdev, pwroff_reg, mask);
+
+	if (mask >> 32)
+		gpu_write(ptdev, pwroff_reg + 4, mask >> 32);
+
+	for (i = 0; i < 2; i++) {
+		u32 mask32 = mask >> (i * 32);
+
+		if (!mask32)
+			continue;
+
+		ret = readl_relaxed_poll_timeout(ptdev->iomem + pwrtrans_reg + (i * 4),
+						 val, !(mask32 & val),
+						 100, timeout_us);
+		if (ret) {
+			drm_err(&ptdev->base, "timeout waiting on %s:%llx power transition",
+				blk_name, mask);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * panthor_gpu_block_power_on() - Power-on a specific block of the GPU
+ * @ptdev: Device.
+ * @blk_name: Block name.
+ * @pwron_reg: Power-on register for this block.
+ * @pwrtrans_reg: Power transition register for this block.
+ * @rdy_reg: Power transition ready register.
+ * @mask: Sub-elements to power-on.
+ * @timeout_us: Timeout in microseconds.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int panthor_gpu_block_power_on(struct panthor_device *ptdev,
+			       const char *blk_name,
+			       u32 pwron_reg, u32 pwrtrans_reg,
+			       u32 rdy_reg, u64 mask, u32 timeout_us)
+{
+	u32 val, i;
+	int ret;
+
+	for (i = 0; i < 2; i++) {
+		u32 mask32 = mask >> (i * 32);
+
+		if (!mask32)
+			continue;
+
+		ret = readl_relaxed_poll_timeout(ptdev->iomem + pwrtrans_reg + (i * 4),
+						 val, !(mask32 & val),
+						 100, timeout_us);
+		if (ret) {
+			drm_err(&ptdev->base, "timeout waiting on %s:%llx power transition",
+				blk_name, mask);
+			return ret;
+		}
+	}
+
+	if (mask & GENMASK(31, 0))
+		gpu_write(ptdev, pwron_reg, mask);
+
+	if (mask >> 32)
+		gpu_write(ptdev, pwron_reg + 4, mask >> 32);
+
+	for (i = 0; i < 2; i++) {
+		u32 mask32 = mask >> (i * 32);
+
+		if (!mask32)
+			continue;
+
+		ret = readl_relaxed_poll_timeout(ptdev->iomem + rdy_reg + (i * 4),
+						 val, (mask32 & val) == mask32,
+						 100, timeout_us);
+		if (ret) {
+			drm_err(&ptdev->base, "timeout waiting on %s:%llx readyness",
+				blk_name, mask);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * panthor_gpu_l2_power_on() - Power-on the L2-cache
+ * @ptdev: Device.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int panthor_gpu_l2_power_on(struct panthor_device *ptdev)
+{
+	if (ptdev->gpu_info.l2_present != 1) {
+		/*
+		 * Only support one core group now.
+		 * ~(l2_present - 1) unsets all bits in l2_present except
+		 * the bottom bit. (l2_present - 2) has all the bits in
+		 * the first core group set. AND them together to generate
+		 * a mask of cores in the first core group.
+		 */
+		u64 core_mask = ~(ptdev->gpu_info.l2_present - 1) &
+				(ptdev->gpu_info.l2_present - 2);
+		drm_info_once(&ptdev->base, "using only 1st core group (%lu cores from %lu)\n",
+			      hweight64(core_mask),
+			      hweight64(ptdev->gpu_info.shader_present));
+	}
+
+	return panthor_gpu_power_on(ptdev, L2, 1, 20000);
+}
+
+/**
+ * panthor_gpu_flush_caches() - Flush caches
+ * @ptdev: Device.
+ * @l2: L2 flush type.
+ * @lsc: LSC flush type.
+ * @other: Other flush type.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int panthor_gpu_flush_caches(struct panthor_device *ptdev,
+			     u32 l2, u32 lsc, u32 other)
+{
+	bool timedout = false;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
+	if (!drm_WARN_ON(&ptdev->base,
+			 ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED)) {
+		ptdev->gpu->pending_reqs |= GPU_IRQ_CLEAN_CACHES_COMPLETED;
+		gpu_write(ptdev, GPU_CMD, GPU_FLUSH_CACHES(l2, lsc, other));
+	}
+	spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
+
+	if (!wait_event_timeout(ptdev->gpu->reqs_acked,
+				!(ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED),
+				msecs_to_jiffies(100))) {
+		spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
+		if ((ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED) != 0 &&
+		    !(gpu_read(ptdev, GPU_INT_RAWSTAT) & GPU_IRQ_CLEAN_CACHES_COMPLETED))
+			timedout = true;
+		else
+			ptdev->gpu->pending_reqs &= ~GPU_IRQ_CLEAN_CACHES_COMPLETED;
+		spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
+	}
+
+	if (timedout) {
+		drm_err(&ptdev->base, "Flush caches timeout");
+		return -ETIMEDOUT;
+	}
+
+	return 0;
+}
+
+/**
+ * panthor_gpu_soft_reset() - Issue a soft-reset
+ * @ptdev: Device.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int panthor_gpu_soft_reset(struct panthor_device *ptdev)
+{
+	bool timedout = false;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
+	if (!drm_WARN_ON(&ptdev->base,
+			 ptdev->gpu->pending_reqs & GPU_IRQ_RESET_COMPLETED)) {
+		ptdev->gpu->pending_reqs |= GPU_IRQ_RESET_COMPLETED;
+		gpu_write(ptdev, GPU_INT_CLEAR, GPU_IRQ_RESET_COMPLETED);
+		gpu_write(ptdev, GPU_CMD, GPU_SOFT_RESET);
+	}
+	spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
+
+	if (!wait_event_timeout(ptdev->gpu->reqs_acked,
+				!(ptdev->gpu->pending_reqs & GPU_IRQ_RESET_COMPLETED),
+				msecs_to_jiffies(100))) {
+		spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
+		if ((ptdev->gpu->pending_reqs & GPU_IRQ_RESET_COMPLETED) != 0 &&
+		    !(gpu_read(ptdev, GPU_INT_RAWSTAT) & GPU_IRQ_RESET_COMPLETED))
+			timedout = true;
+		else
+			ptdev->gpu->pending_reqs &= ~GPU_IRQ_RESET_COMPLETED;
+		spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
+	}
+
+	if (timedout) {
+		drm_err(&ptdev->base, "Soft reset timeout");
+		return -ETIMEDOUT;
+	}
+
+	return 0;
+}
+
+/**
+ * panthor_gpu_suspend() - Suspend the GPU block.
+ * @ptdev: Device.
+ *
+ * Suspend the GPU irq. This should be called last in the suspend procedure,
+ * after all other blocks have been suspented.
+ */
+void panthor_gpu_suspend(struct panthor_device *ptdev)
+{
+	/*
+	 * It may be preferable to simply power down the L2, but for now just
+	 * soft-reset which will leave the L2 powered down.
+	 */
+	panthor_gpu_soft_reset(ptdev);
+	panthor_gpu_irq_suspend(&ptdev->gpu->irq);
+}
+
+/**
+ * panthor_gpu_resume() - Resume the GPU block.
+ * @ptdev: Device.
+ *
+ * Resume the IRQ handler and power-on the L2-cache.
+ * The FW takes care of powering the other blocks.
+ */
+void panthor_gpu_resume(struct panthor_device *ptdev)
+{
+	panthor_gpu_irq_resume(&ptdev->gpu->irq, GPU_INTERRUPTS_MASK);
+	panthor_gpu_l2_power_on(ptdev);
+}
diff --git a/drivers/gpu/drm/panthor/panthor_gpu.h b/drivers/gpu/drm/panthor/panthor_gpu.h
new file mode 100644
index 0000000000000..bba7555dd3c64
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_gpu.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 or MIT */
+/* Copyright 2018 Marty E. Plummer <hanetzer@startmail.com> */
+/* Copyright 2019 Collabora ltd. */
+
+#ifndef __PANTHOR_GPU_H__
+#define __PANTHOR_GPU_H__
+
+struct panthor_device;
+
+int panthor_gpu_init(struct panthor_device *ptdev);
+void panthor_gpu_unplug(struct panthor_device *ptdev);
+void panthor_gpu_suspend(struct panthor_device *ptdev);
+void panthor_gpu_resume(struct panthor_device *ptdev);
+
+int panthor_gpu_block_power_on(struct panthor_device *ptdev,
+			       const char *blk_name,
+			       u32 pwron_reg, u32 pwrtrans_reg,
+			       u32 rdy_reg, u64 mask, u32 timeout_us);
+int panthor_gpu_block_power_off(struct panthor_device *ptdev,
+				const char *blk_name,
+				u32 pwroff_reg, u32 pwrtrans_reg,
+				u64 mask, u32 timeout_us);
+
+/**
+ * panthor_gpu_power_on() - Power on the GPU block.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+#define panthor_gpu_power_on(ptdev, type, mask, timeout_us) \
+	panthor_gpu_block_power_on(ptdev, #type, \
+				  type ## _PWRON_LO, \
+				  type ## _PWRTRANS_LO, \
+				  type ## _READY_LO, \
+				  mask, timeout_us)
+
+/**
+ * panthor_gpu_power_off() - Power off the GPU block.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+#define panthor_gpu_power_off(ptdev, type, mask, timeout_us) \
+	panthor_gpu_block_power_off(ptdev, #type, \
+				   type ## _PWROFF_LO, \
+				   type ## _PWRTRANS_LO, \
+				   mask, timeout_us)
+
+int panthor_gpu_l2_power_on(struct panthor_device *ptdev);
+int panthor_gpu_flush_caches(struct panthor_device *ptdev,
+			     u32 l2, u32 lsc, u32 other);
+int panthor_gpu_soft_reset(struct panthor_device *ptdev);
+
+#endif
diff --git a/drivers/gpu/drm/panthor/panthor_heap.c b/drivers/gpu/drm/panthor/panthor_heap.c
new file mode 100644
index 0000000000000..143fa35f2e746
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_heap.c
@@ -0,0 +1,597 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+/* Copyright 2023 Collabora ltd. */
+
+#include <linux/iosys-map.h>
+#include <linux/rwsem.h>
+
+#include <drm/panthor_drm.h>
+
+#include "panthor_device.h"
+#include "panthor_gem.h"
+#include "panthor_heap.h"
+#include "panthor_mmu.h"
+#include "panthor_regs.h"
+
+/*
+ * The GPU heap context is an opaque structure used by the GPU to track the
+ * heap allocations. The driver should only touch it to initialize it (zero all
+ * fields). Because the CPU and GPU can both access this structure it is
+ * required to be GPU cache line aligned.
+ */
+#define HEAP_CONTEXT_SIZE	32
+
+/**
+ * struct panthor_heap_chunk_header - Heap chunk header
+ */
+struct panthor_heap_chunk_header {
+	/**
+	 * @next: Next heap chunk in the list.
+	 *
+	 * This is a GPU VA.
+	 */
+	u64 next;
+
+	/** @unknown: MBZ. */
+	u32 unknown[14];
+};
+
+/**
+ * struct panthor_heap_chunk - Structure used to keep track of allocated heap chunks.
+ */
+struct panthor_heap_chunk {
+	/** @node: Used to insert the heap chunk in panthor_heap::chunks. */
+	struct list_head node;
+
+	/** @bo: Buffer object backing the heap chunk. */
+	struct panthor_kernel_bo *bo;
+};
+
+/**
+ * struct panthor_heap - Structure used to manage tiler heap contexts.
+ */
+struct panthor_heap {
+	/** @chunks: List containing all heap chunks allocated so far. */
+	struct list_head chunks;
+
+	/** @lock: Lock protecting insertion in the chunks list. */
+	struct mutex lock;
+
+	/** @chunk_size: Size of each chunk. */
+	u32 chunk_size;
+
+	/** @max_chunks: Maximum number of chunks. */
+	u32 max_chunks;
+
+	/**
+	 * @target_in_flight: Number of in-flight render passes after which
+	 * we'd let the FW wait for fragment job to finish instead of allocating new chunks.
+	 */
+	u32 target_in_flight;
+
+	/** @chunk_count: Number of heap chunks currently allocated. */
+	u32 chunk_count;
+};
+
+#define MAX_HEAPS_PER_POOL    128
+
+/**
+ * struct panthor_heap_pool - Pool of heap contexts
+ *
+ * The pool is attached to a panthor_file and can't be shared across processes.
+ */
+struct panthor_heap_pool {
+	/** @refcount: Reference count. */
+	struct kref refcount;
+
+	/** @ptdev: Device. */
+	struct panthor_device *ptdev;
+
+	/** @vm: VM this pool is bound to. */
+	struct panthor_vm *vm;
+
+	/** @lock: Lock protecting access to @xa. */
+	struct rw_semaphore lock;
+
+	/** @xa: Array storing panthor_heap objects. */
+	struct xarray xa;
+
+	/** @gpu_contexts: Buffer object containing the GPU heap contexts. */
+	struct panthor_kernel_bo *gpu_contexts;
+};
+
+static int panthor_heap_ctx_stride(struct panthor_device *ptdev)
+{
+	u32 l2_features = ptdev->gpu_info.l2_features;
+	u32 gpu_cache_line_size = GPU_L2_FEATURES_LINE_SIZE(l2_features);
+
+	return ALIGN(HEAP_CONTEXT_SIZE, gpu_cache_line_size);
+}
+
+static int panthor_get_heap_ctx_offset(struct panthor_heap_pool *pool, int id)
+{
+	return panthor_heap_ctx_stride(pool->ptdev) * id;
+}
+
+static void *panthor_get_heap_ctx(struct panthor_heap_pool *pool, int id)
+{
+	return pool->gpu_contexts->kmap +
+	       panthor_get_heap_ctx_offset(pool, id);
+}
+
+static void panthor_free_heap_chunk(struct panthor_vm *vm,
+				    struct panthor_heap *heap,
+				    struct panthor_heap_chunk *chunk)
+{
+	mutex_lock(&heap->lock);
+	list_del(&chunk->node);
+	heap->chunk_count--;
+	mutex_unlock(&heap->lock);
+
+	panthor_kernel_bo_destroy(vm, chunk->bo);
+	kfree(chunk);
+}
+
+static int panthor_alloc_heap_chunk(struct panthor_device *ptdev,
+				    struct panthor_vm *vm,
+				    struct panthor_heap *heap,
+				    bool initial_chunk)
+{
+	struct panthor_heap_chunk *chunk;
+	struct panthor_heap_chunk_header *hdr;
+	int ret;
+
+	chunk = kmalloc(sizeof(*chunk), GFP_KERNEL);
+	if (!chunk)
+		return -ENOMEM;
+
+	chunk->bo = panthor_kernel_bo_create(ptdev, vm, heap->chunk_size,
+					     DRM_PANTHOR_BO_NO_MMAP,
+					     DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC,
+					     PANTHOR_VM_KERNEL_AUTO_VA);
+	if (IS_ERR(chunk->bo)) {
+		ret = PTR_ERR(chunk->bo);
+		goto err_free_chunk;
+	}
+
+	ret = panthor_kernel_bo_vmap(chunk->bo);
+	if (ret)
+		goto err_destroy_bo;
+
+	hdr = chunk->bo->kmap;
+	memset(hdr, 0, sizeof(*hdr));
+
+	if (initial_chunk && !list_empty(&heap->chunks)) {
+		struct panthor_heap_chunk *prev_chunk;
+		u64 prev_gpuva;
+
+		prev_chunk = list_first_entry(&heap->chunks,
+					      struct panthor_heap_chunk,
+					      node);
+
+		prev_gpuva = panthor_kernel_bo_gpuva(prev_chunk->bo);
+		hdr->next = (prev_gpuva & GENMASK_ULL(63, 12)) |
+			    (heap->chunk_size >> 12);
+	}
+
+	panthor_kernel_bo_vunmap(chunk->bo);
+
+	mutex_lock(&heap->lock);
+	list_add(&chunk->node, &heap->chunks);
+	heap->chunk_count++;
+	mutex_unlock(&heap->lock);
+
+	return 0;
+
+err_destroy_bo:
+	panthor_kernel_bo_destroy(vm, chunk->bo);
+
+err_free_chunk:
+	kfree(chunk);
+
+	return ret;
+}
+
+static void panthor_free_heap_chunks(struct panthor_vm *vm,
+				     struct panthor_heap *heap)
+{
+	struct panthor_heap_chunk *chunk, *tmp;
+
+	list_for_each_entry_safe(chunk, tmp, &heap->chunks, node)
+		panthor_free_heap_chunk(vm, heap, chunk);
+}
+
+static int panthor_alloc_heap_chunks(struct panthor_device *ptdev,
+				     struct panthor_vm *vm,
+				     struct panthor_heap *heap,
+				     u32 chunk_count)
+{
+	int ret;
+	u32 i;
+
+	for (i = 0; i < chunk_count; i++) {
+		ret = panthor_alloc_heap_chunk(ptdev, vm, heap, true);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int
+panthor_heap_destroy_locked(struct panthor_heap_pool *pool, u32 handle)
+{
+	struct panthor_heap *heap;
+
+	heap = xa_erase(&pool->xa, handle);
+	if (!heap)
+		return -EINVAL;
+
+	panthor_free_heap_chunks(pool->vm, heap);
+	mutex_destroy(&heap->lock);
+	kfree(heap);
+	return 0;
+}
+
+/**
+ * panthor_heap_destroy() - Destroy a heap context
+ * @pool: Pool this context belongs to.
+ * @handle: Handle returned by panthor_heap_create().
+ */
+int panthor_heap_destroy(struct panthor_heap_pool *pool, u32 handle)
+{
+	int ret;
+
+	down_write(&pool->lock);
+	ret = panthor_heap_destroy_locked(pool, handle);
+	up_write(&pool->lock);
+
+	return ret;
+}
+
+/**
+ * panthor_heap_create() - Create a heap context
+ * @pool: Pool to instantiate the heap context from.
+ * @initial_chunk_count: Number of chunk allocated at initialization time.
+ * Must be at least 1.
+ * @chunk_size: The size of each chunk. Must be a power of two between 256k
+ * and 2M.
+ * @max_chunks: Maximum number of chunks that can be allocated.
+ * @target_in_flight: Maximum number of in-flight render passes.
+ * @heap_ctx_gpu_va: Pointer holding the GPU address of the allocated heap
+ * context.
+ * @first_chunk_gpu_va: Pointer holding the GPU address of the first chunk
+ * assigned to the heap context.
+ *
+ * Return: a positive handle on success, a negative error otherwise.
+ */
+int panthor_heap_create(struct panthor_heap_pool *pool,
+			u32 initial_chunk_count,
+			u32 chunk_size,
+			u32 max_chunks,
+			u32 target_in_flight,
+			u64 *heap_ctx_gpu_va,
+			u64 *first_chunk_gpu_va)
+{
+	struct panthor_heap *heap;
+	struct panthor_heap_chunk *first_chunk;
+	struct panthor_vm *vm;
+	int ret = 0;
+	u32 id;
+
+	if (initial_chunk_count == 0)
+		return -EINVAL;
+
+	if (hweight32(chunk_size) != 1 ||
+	    chunk_size < SZ_256K || chunk_size > SZ_2M)
+		return -EINVAL;
+
+	down_read(&pool->lock);
+	vm = panthor_vm_get(pool->vm);
+	up_read(&pool->lock);
+
+	/* The pool has been destroyed, we can't create a new heap. */
+	if (!vm)
+		return -EINVAL;
+
+	heap = kzalloc(sizeof(*heap), GFP_KERNEL);
+	if (!heap) {
+		ret = -ENOMEM;
+		goto err_put_vm;
+	}
+
+	mutex_init(&heap->lock);
+	INIT_LIST_HEAD(&heap->chunks);
+	heap->chunk_size = chunk_size;
+	heap->max_chunks = max_chunks;
+	heap->target_in_flight = target_in_flight;
+
+	ret = panthor_alloc_heap_chunks(pool->ptdev, vm, heap,
+					initial_chunk_count);
+	if (ret)
+		goto err_free_heap;
+
+	first_chunk = list_first_entry(&heap->chunks,
+				       struct panthor_heap_chunk,
+				       node);
+	*first_chunk_gpu_va = panthor_kernel_bo_gpuva(first_chunk->bo);
+
+	down_write(&pool->lock);
+	/* The pool has been destroyed, we can't create a new heap. */
+	if (!pool->vm) {
+		ret = -EINVAL;
+	} else {
+		ret = xa_alloc(&pool->xa, &id, heap, XA_LIMIT(1, MAX_HEAPS_PER_POOL), GFP_KERNEL);
+		if (!ret) {
+			void *gpu_ctx = panthor_get_heap_ctx(pool, id);
+
+			memset(gpu_ctx, 0, panthor_heap_ctx_stride(pool->ptdev));
+			*heap_ctx_gpu_va = panthor_kernel_bo_gpuva(pool->gpu_contexts) +
+					   panthor_get_heap_ctx_offset(pool, id);
+		}
+	}
+	up_write(&pool->lock);
+
+	if (ret)
+		goto err_free_heap;
+
+	panthor_vm_put(vm);
+	return id;
+
+err_free_heap:
+	panthor_free_heap_chunks(pool->vm, heap);
+	mutex_destroy(&heap->lock);
+	kfree(heap);
+
+err_put_vm:
+	panthor_vm_put(vm);
+	return ret;
+}
+
+/**
+ * panthor_heap_return_chunk() - Return an unused heap chunk
+ * @pool: The pool this heap belongs to.
+ * @heap_gpu_va: The GPU address of the heap context.
+ * @chunk_gpu_va: The chunk VA to return.
+ *
+ * This function is used when a chunk allocated with panthor_heap_grow()
+ * couldn't be linked to the heap context through the FW interface because
+ * the group requesting the allocation was scheduled out in the meantime.
+ */
+int panthor_heap_return_chunk(struct panthor_heap_pool *pool,
+			      u64 heap_gpu_va,
+			      u64 chunk_gpu_va)
+{
+	u64 offset = heap_gpu_va - panthor_kernel_bo_gpuva(pool->gpu_contexts);
+	u32 heap_id = (u32)offset / panthor_heap_ctx_stride(pool->ptdev);
+	struct panthor_heap_chunk *chunk, *tmp, *removed = NULL;
+	struct panthor_heap *heap;
+	int ret;
+
+	if (offset > U32_MAX || heap_id >= MAX_HEAPS_PER_POOL)
+		return -EINVAL;
+
+	down_read(&pool->lock);
+	heap = xa_load(&pool->xa, heap_id);
+	if (!heap) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	chunk_gpu_va &= GENMASK_ULL(63, 12);
+
+	mutex_lock(&heap->lock);
+	list_for_each_entry_safe(chunk, tmp, &heap->chunks, node) {
+		if (panthor_kernel_bo_gpuva(chunk->bo) == chunk_gpu_va) {
+			removed = chunk;
+			list_del(&chunk->node);
+			heap->chunk_count--;
+			break;
+		}
+	}
+	mutex_unlock(&heap->lock);
+
+	if (removed) {
+		panthor_kernel_bo_destroy(pool->vm, chunk->bo);
+		kfree(chunk);
+		ret = 0;
+	} else {
+		ret = -EINVAL;
+	}
+
+out_unlock:
+	up_read(&pool->lock);
+	return ret;
+}
+
+/**
+ * panthor_heap_grow() - Make a heap context grow.
+ * @pool: The pool this heap belongs to.
+ * @heap_gpu_va: The GPU address of the heap context.
+ * @renderpasses_in_flight: Number of render passes currently in-flight.
+ * @pending_frag_count: Number of fragment jobs waiting for execution/completion.
+ * @new_chunk_gpu_va: Pointer used to return the chunk VA.
+ */
+int panthor_heap_grow(struct panthor_heap_pool *pool,
+		      u64 heap_gpu_va,
+		      u32 renderpasses_in_flight,
+		      u32 pending_frag_count,
+		      u64 *new_chunk_gpu_va)
+{
+	u64 offset = heap_gpu_va - panthor_kernel_bo_gpuva(pool->gpu_contexts);
+	u32 heap_id = (u32)offset / panthor_heap_ctx_stride(pool->ptdev);
+	struct panthor_heap_chunk *chunk;
+	struct panthor_heap *heap;
+	int ret;
+
+	if (offset > U32_MAX || heap_id >= MAX_HEAPS_PER_POOL)
+		return -EINVAL;
+
+	down_read(&pool->lock);
+	heap = xa_load(&pool->xa, heap_id);
+	if (!heap) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* If we reached the target in-flight render passes, or if we
+	 * reached the maximum number of chunks, let the FW figure another way to
+	 * find some memory (wait for render passes to finish, or call the exception
+	 * handler provided by the userspace driver, if any).
+	 */
+	if (renderpasses_in_flight > heap->target_in_flight ||
+	    (pending_frag_count > 0 && heap->chunk_count >= heap->max_chunks)) {
+		ret = -EBUSY;
+		goto out_unlock;
+	} else if (heap->chunk_count >= heap->max_chunks) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	/* FIXME: panthor_alloc_heap_chunk() triggers a kernel BO creation,
+	 * which goes through the blocking allocation path. Ultimately, we
+	 * want a non-blocking allocation, so we can immediately report to the
+	 * FW when the system is running out of memory. In that case, the FW
+	 * can call a user-provided exception handler, which might try to free
+	 * some tiler memory by issuing an intermediate fragment job. If the
+	 * exception handler can't do anything, it will flag the queue as
+	 * faulty so the job that triggered this tiler chunk allocation and all
+	 * further jobs in this queue fail immediately instead of having to
+	 * wait for the job timeout.
+	 */
+	ret = panthor_alloc_heap_chunk(pool->ptdev, pool->vm, heap, false);
+	if (ret)
+		goto out_unlock;
+
+	chunk = list_first_entry(&heap->chunks,
+				 struct panthor_heap_chunk,
+				 node);
+	*new_chunk_gpu_va = (panthor_kernel_bo_gpuva(chunk->bo) & GENMASK_ULL(63, 12)) |
+			    (heap->chunk_size >> 12);
+	ret = 0;
+
+out_unlock:
+	up_read(&pool->lock);
+	return ret;
+}
+
+static void panthor_heap_pool_release(struct kref *refcount)
+{
+	struct panthor_heap_pool *pool =
+		container_of(refcount, struct panthor_heap_pool, refcount);
+
+	xa_destroy(&pool->xa);
+	kfree(pool);
+}
+
+/**
+ * panthor_heap_pool_put() - Release a heap pool reference
+ * @pool: Pool to release the reference on. Can be NULL.
+ */
+void panthor_heap_pool_put(struct panthor_heap_pool *pool)
+{
+	if (pool)
+		kref_put(&pool->refcount, panthor_heap_pool_release);
+}
+
+/**
+ * panthor_heap_pool_get() - Get a heap pool reference
+ * @pool: Pool to get the reference on. Can be NULL.
+ *
+ * Return: @pool.
+ */
+struct panthor_heap_pool *
+panthor_heap_pool_get(struct panthor_heap_pool *pool)
+{
+	if (pool)
+		kref_get(&pool->refcount);
+
+	return pool;
+}
+
+/**
+ * panthor_heap_pool_create() - Create a heap pool
+ * @ptdev: Device.
+ * @vm: The VM this heap pool will be attached to.
+ *
+ * Heap pools might contain up to 128 heap contexts, and are per-VM.
+ *
+ * Return: A valid pointer on success, a negative error code otherwise.
+ */
+struct panthor_heap_pool *
+panthor_heap_pool_create(struct panthor_device *ptdev, struct panthor_vm *vm)
+{
+	size_t bosize = ALIGN(MAX_HEAPS_PER_POOL *
+			      panthor_heap_ctx_stride(ptdev),
+			      4096);
+	struct panthor_heap_pool *pool;
+	int ret = 0;
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool)
+		return ERR_PTR(-ENOMEM);
+
+	/* We want a weak ref here: the heap pool belongs to the VM, so we're
+	 * sure that, as long as the heap pool exists, the VM exists too.
+	 */
+	pool->vm = vm;
+	pool->ptdev = ptdev;
+	init_rwsem(&pool->lock);
+	xa_init_flags(&pool->xa, XA_FLAGS_ALLOC1);
+	kref_init(&pool->refcount);
+
+	pool->gpu_contexts = panthor_kernel_bo_create(ptdev, vm, bosize,
+						      DRM_PANTHOR_BO_NO_MMAP,
+						      DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC,
+						      PANTHOR_VM_KERNEL_AUTO_VA);
+	if (IS_ERR(pool->gpu_contexts)) {
+		ret = PTR_ERR(pool->gpu_contexts);
+		goto err_destroy_pool;
+	}
+
+	ret = panthor_kernel_bo_vmap(pool->gpu_contexts);
+	if (ret)
+		goto err_destroy_pool;
+
+	return pool;
+
+err_destroy_pool:
+	panthor_heap_pool_destroy(pool);
+	return ERR_PTR(ret);
+}
+
+/**
+ * panthor_heap_pool_destroy() - Destroy a heap pool.
+ * @pool: Pool to destroy.
+ *
+ * This function destroys all heap contexts and their resources. Thus
+ * preventing any use of the heap context or the chunk attached to them
+ * after that point.
+ *
+ * If the GPU still has access to some heap contexts, a fault should be
+ * triggered, which should flag the command stream groups using these
+ * context as faulty.
+ *
+ * The heap pool object is only released when all references to this pool
+ * are released.
+ */
+void panthor_heap_pool_destroy(struct panthor_heap_pool *pool)
+{
+	struct panthor_heap *heap;
+	unsigned long i;
+
+	if (!pool)
+		return;
+
+	down_write(&pool->lock);
+	xa_for_each(&pool->xa, i, heap)
+		drm_WARN_ON(&pool->ptdev->base, panthor_heap_destroy_locked(pool, i));
+
+	if (!IS_ERR_OR_NULL(pool->gpu_contexts))
+		panthor_kernel_bo_destroy(pool->vm, pool->gpu_contexts);
+
+	/* Reflects the fact the pool has been destroyed. */
+	pool->vm = NULL;
+	up_write(&pool->lock);
+
+	panthor_heap_pool_put(pool);
+}
diff --git a/drivers/gpu/drm/panthor/panthor_heap.h b/drivers/gpu/drm/panthor/panthor_heap.h
new file mode 100644
index 0000000000000..25a5f2bba4457
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_heap.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 or MIT */
+/* Copyright 2023 Collabora ltd. */
+
+#ifndef __PANTHOR_HEAP_H__
+#define __PANTHOR_HEAP_H__
+
+#include <linux/types.h>
+
+struct panthor_device;
+struct panthor_heap_pool;
+struct panthor_vm;
+
+int panthor_heap_create(struct panthor_heap_pool *pool,
+			u32 initial_chunk_count,
+			u32 chunk_size,
+			u32 max_chunks,
+			u32 target_in_flight,
+			u64 *heap_ctx_gpu_va,
+			u64 *first_chunk_gpu_va);
+int panthor_heap_destroy(struct panthor_heap_pool *pool, u32 handle);
+
+struct panthor_heap_pool *
+panthor_heap_pool_create(struct panthor_device *ptdev, struct panthor_vm *vm);
+void panthor_heap_pool_destroy(struct panthor_heap_pool *pool);
+
+struct panthor_heap_pool *
+panthor_heap_pool_get(struct panthor_heap_pool *pool);
+void panthor_heap_pool_put(struct panthor_heap_pool *pool);
+
+int panthor_heap_grow(struct panthor_heap_pool *pool,
+		      u64 heap_gpu_va,
+		      u32 renderpasses_in_flight,
+		      u32 pending_frag_count,
+		      u64 *new_chunk_gpu_va);
+int panthor_heap_return_chunk(struct panthor_heap_pool *pool,
+			      u64 heap_gpu_va,
+			      u64 chunk_gpu_va);
+
+#endif
diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
new file mode 100644
index 0000000000000..1ca4e6adc99aa
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -0,0 +1,2717 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+/* Copyright 2019 Linaro, Ltd, Rob Herring <robh@kernel.org> */
+/* Copyright 2023 Collabora ltd. */
+
+#include <drm/drm_drv.h>
+#include <drm/drm_exec.h>
+#include <drm/drm_gpuvm.h>
+#include <drm/drm_managed.h>
+#include <drm/gpu_scheduler.h>
+#include <drm/panthor_drm.h>
+
+#include <linux/atomic.h>
+#include <linux/bitfield.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/iopoll.h>
+#include <linux/io-pgtable.h>
+#include <linux/iommu.h>
+#include <linux/kmemleak.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/shmem_fs.h>
+#include <linux/sizes.h>
+
+#include "panthor_device.h"
+#include "panthor_gem.h"
+#include "panthor_heap.h"
+#include "panthor_mmu.h"
+#include "panthor_regs.h"
+#include "panthor_sched.h"
+
+#define MAX_AS_SLOTS			32
+
+struct panthor_vm;
+
+/**
+ * struct panthor_as_slot - Address space slot
+ */
+struct panthor_as_slot {
+	/** @vm: VM bound to this slot. NULL is no VM is bound. */
+	struct panthor_vm *vm;
+};
+
+/**
+ * struct panthor_mmu - MMU related data
+ */
+struct panthor_mmu {
+	/** @irq: The MMU irq. */
+	struct panthor_irq irq;
+
+	/** @as: Address space related fields.
+	 *
+	 * The GPU has a limited number of address spaces (AS) slots, forcing
+	 * us to re-assign them to re-assign slots on-demand.
+	 */
+	struct {
+		/** @slots_lock: Lock protecting access to all other AS fields. */
+		struct mutex slots_lock;
+
+		/** @alloc_mask: Bitmask encoding the allocated slots. */
+		unsigned long alloc_mask;
+
+		/** @faulty_mask: Bitmask encoding the faulty slots. */
+		unsigned long faulty_mask;
+
+		/** @slots: VMs currently bound to the AS slots. */
+		struct panthor_as_slot slots[MAX_AS_SLOTS];
+
+		/**
+		 * @lru_list: List of least recently used VMs.
+		 *
+		 * We use this list to pick a VM to evict when all slots are
+		 * used.
+		 *
+		 * There should be no more active VMs than there are AS slots,
+		 * so this LRU is just here to keep VMs bound until there's
+		 * a need to release a slot, thus avoid unnecessary TLB/cache
+		 * flushes.
+		 */
+		struct list_head lru_list;
+	} as;
+
+	/** @vm: VMs management fields */
+	struct {
+		/** @lock: Lock protecting access to list. */
+		struct mutex lock;
+
+		/** @list: List containing all VMs. */
+		struct list_head list;
+
+		/** @reset_in_progress: True if a reset is in progress. */
+		bool reset_in_progress;
+
+		/** @wq: Workqueue used for the VM_BIND queues. */
+		struct workqueue_struct *wq;
+	} vm;
+};
+
+/**
+ * struct panthor_vm_pool - VM pool object
+ */
+struct panthor_vm_pool {
+	/** @xa: Array used for VM handle tracking. */
+	struct xarray xa;
+};
+
+/**
+ * struct panthor_vma - GPU mapping object
+ *
+ * This is used to track GEM mappings in GPU space.
+ */
+struct panthor_vma {
+	/** @base: Inherits from drm_gpuva. */
+	struct drm_gpuva base;
+
+	/** @node: Used to implement deferred release of VMAs. */
+	struct list_head node;
+
+	/**
+	 * @flags: Combination of drm_panthor_vm_bind_op_flags.
+	 *
+	 * Only map related flags are accepted.
+	 */
+	u32 flags;
+};
+
+/**
+ * struct panthor_vm_op_ctx - VM operation context
+ *
+ * With VM operations potentially taking place in a dma-signaling path, we
+ * need to make sure everything that might require resource allocation is
+ * pre-allocated upfront. This is what this operation context is far.
+ *
+ * We also collect resources that have been freed, so we can release them
+ * asynchronously, and let the VM_BIND scheduler process the next VM_BIND
+ * request.
+ */
+struct panthor_vm_op_ctx {
+	/** @rsvd_page_tables: Pages reserved for the MMU page table update. */
+	struct {
+		/** @count: Number of pages reserved. */
+		u32 count;
+
+		/** @ptr: Point to the first unused page in the @pages table. */
+		u32 ptr;
+
+		/**
+		 * @page: Array of pages that can be used for an MMU page table update.
+		 *
+		 * After an VM operation, there might be free pages left in this array.
+		 * They should be returned to the pt_cache as part of the op_ctx cleanup.
+		 */
+		void **pages;
+	} rsvd_page_tables;
+
+	/**
+	 * @preallocated_vmas: Pre-allocated VMAs to handle the remap case.
+	 *
+	 * Partial unmap requests or map requests overlapping existing mappings will
+	 * trigger a remap call, which need to register up to three panthor_vma objects
+	 * (one for the new mapping, and two for the previous and next mappings).
+	 */
+	struct panthor_vma *preallocated_vmas[3];
+
+	/** @flags: Combination of drm_panthor_vm_bind_op_flags. */
+	u32 flags;
+
+	/** @va: Virtual range targeted by the VM operation. */
+	struct {
+		/** @addr: Start address. */
+		u64 addr;
+
+		/** @range: Range size. */
+		u64 range;
+	} va;
+
+	/**
+	 * @returned_vmas: List of panthor_vma objects returned after a VM operation.
+	 *
+	 * For unmap operations, this will contain all VMAs that were covered by the
+	 * specified VA range.
+	 *
+	 * For map operations, this will contain all VMAs that previously mapped to
+	 * the specified VA range.
+	 *
+	 * Those VMAs, and the resources they point to will be released as part of
+	 * the op_ctx cleanup operation.
+	 */
+	struct list_head returned_vmas;
+
+	/** @map: Fields specific to a map operation. */
+	struct {
+		/** @vm_bo: Buffer object to map. */
+		struct drm_gpuvm_bo *vm_bo;
+
+		/** @bo_offset: Offset in the buffer object. */
+		u64 bo_offset;
+
+		/**
+		 * @sgt: sg-table pointing to pages backing the GEM object.
+		 *
+		 * This is gathered at job creation time, such that we don't have
+		 * to allocate in ::run_job().
+		 */
+		struct sg_table *sgt;
+
+		/**
+		 * @new_vma: The new VMA object that will be inserted to the VA tree.
+		 */
+		struct panthor_vma *new_vma;
+	} map;
+};
+
+/**
+ * struct panthor_vm - VM object
+ *
+ * A VM is an object representing a GPU (or MCU) virtual address space.
+ * It embeds the MMU page table for this address space, a tree containing
+ * all the virtual mappings of GEM objects, and other things needed to manage
+ * the VM.
+ *
+ * Except for the MCU VM, which is managed by the kernel, all other VMs are
+ * created by userspace and mostly managed by userspace, using the
+ * %DRM_IOCTL_PANTHOR_VM_BIND ioctl.
+ *
+ * A portion of the virtual address space is reserved for kernel objects,
+ * like heap chunks, and userspace gets to decide how much of the virtual
+ * address space is left to the kernel (half of the virtual address space
+ * by default).
+ */
+struct panthor_vm {
+	/**
+	 * @base: Inherit from drm_gpuvm.
+	 *
+	 * We delegate all the VA management to the common drm_gpuvm framework
+	 * and only implement hooks to update the MMU page table.
+	 */
+	struct drm_gpuvm base;
+
+	/**
+	 * @sched: Scheduler used for asynchronous VM_BIND request.
+	 *
+	 * We use a 1:1 scheduler here.
+	 */
+	struct drm_gpu_scheduler sched;
+
+	/**
+	 * @entity: Scheduling entity representing the VM_BIND queue.
+	 *
+	 * There's currently one bind queue per VM. It doesn't make sense to
+	 * allow more given the VM operations are serialized anyway.
+	 */
+	struct drm_sched_entity entity;
+
+	/** @ptdev: Device. */
+	struct panthor_device *ptdev;
+
+	/** @memattr: Value to program to the AS_MEMATTR register. */
+	u64 memattr;
+
+	/** @pgtbl_ops: Page table operations. */
+	struct io_pgtable_ops *pgtbl_ops;
+
+	/** @root_page_table: Stores the root page table pointer. */
+	void *root_page_table;
+
+	/**
+	 * @op_lock: Lock used to serialize operations on a VM.
+	 *
+	 * The serialization of jobs queued to the VM_BIND queue is already
+	 * taken care of by drm_sched, but we need to serialize synchronous
+	 * and asynchronous VM_BIND request. This is what this lock is for.
+	 */
+	struct mutex op_lock;
+
+	/**
+	 * @op_ctx: The context attached to the currently executing VM operation.
+	 *
+	 * NULL when no operation is in progress.
+	 */
+	struct panthor_vm_op_ctx *op_ctx;
+
+	/**
+	 * @mm: Memory management object representing the auto-VA/kernel-VA.
+	 *
+	 * Used to auto-allocate VA space for kernel-managed objects (tiler
+	 * heaps, ...).
+	 *
+	 * For the MCU VM, this is managing the VA range that's used to map
+	 * all shared interfaces.
+	 *
+	 * For user VMs, the range is specified by userspace, and must not
+	 * exceed half of the VA space addressable.
+	 */
+	struct drm_mm mm;
+
+	/** @mm_lock: Lock protecting the @mm field. */
+	struct mutex mm_lock;
+
+	/** @kernel_auto_va: Automatic VA-range for kernel BOs. */
+	struct {
+		/** @start: Start of the automatic VA-range for kernel BOs. */
+		u64 start;
+
+		/** @size: Size of the automatic VA-range for kernel BOs. */
+		u64 end;
+	} kernel_auto_va;
+
+	/** @as: Address space related fields. */
+	struct {
+		/**
+		 * @id: ID of the address space this VM is bound to.
+		 *
+		 * A value of -1 means the VM is inactive/not bound.
+		 */
+		int id;
+
+		/** @active_cnt: Number of active users of this VM. */
+		refcount_t active_cnt;
+
+		/**
+		 * @lru_node: Used to instead the VM in the panthor_mmu::as::lru_list.
+		 *
+		 * Active VMs should not be inserted in the LRU list.
+		 */
+		struct list_head lru_node;
+	} as;
+
+	/**
+	 * @heaps: Tiler heap related fields.
+	 */
+	struct {
+		/**
+		 * @pool: The heap pool attached to this VM.
+		 *
+		 * Will stay NULL until someone creates a heap context on this VM.
+		 */
+		struct panthor_heap_pool *pool;
+
+		/** @lock: Lock used to protect access to @pool. */
+		struct mutex lock;
+	} heaps;
+
+	/** @node: Used to insert the VM in the panthor_mmu::vm::list. */
+	struct list_head node;
+
+	/** @for_mcu: True if this is the MCU VM. */
+	bool for_mcu;
+
+	/**
+	 * @destroyed: True if the VM was destroyed.
+	 *
+	 * No further bind requests should be queued to a destroyed VM.
+	 */
+	bool destroyed;
+
+	/**
+	 * @unusable: True if the VM has turned unusable because something
+	 * bad happened during an asynchronous request.
+	 *
+	 * We don't try to recover from such failures, because this implies
+	 * informing userspace about the specific operation that failed, and
+	 * hoping the userspace driver can replay things from there. This all
+	 * sounds very complicated for little gain.
+	 *
+	 * Instead, we should just flag the VM as unusable, and fail any
+	 * further request targeting this VM.
+	 *
+	 * We also provide a way to query a VM state, so userspace can destroy
+	 * it and create a new one.
+	 *
+	 * As an analogy, this would be mapped to a VK_ERROR_DEVICE_LOST
+	 * situation, where the logical device needs to be re-created.
+	 */
+	bool unusable;
+
+	/**
+	 * @unhandled_fault: Unhandled fault happened.
+	 *
+	 * This should be reported to the scheduler, and the queue/group be
+	 * flagged as faulty as a result.
+	 */
+	bool unhandled_fault;
+};
+
+/**
+ * struct panthor_vm_bind_job - VM bind job
+ */
+struct panthor_vm_bind_job {
+	/** @base: Inherit from drm_sched_job. */
+	struct drm_sched_job base;
+
+	/** @refcount: Reference count. */
+	struct kref refcount;
+
+	/** @cleanup_op_ctx_work: Work used to cleanup the VM operation context. */
+	struct work_struct cleanup_op_ctx_work;
+
+	/** @vm: VM targeted by the VM operation. */
+	struct panthor_vm *vm;
+
+	/** @ctx: Operation context. */
+	struct panthor_vm_op_ctx ctx;
+};
+
+/**
+ * @pt_cache: Cache used to allocate MMU page tables.
+ *
+ * The pre-allocation pattern forces us to over-allocate to plan for
+ * the worst case scenario, and return the pages we didn't use.
+ *
+ * Having a kmem_cache allows us to speed allocations.
+ */
+static struct kmem_cache *pt_cache;
+
+/**
+ * alloc_pt() - Custom page table allocator
+ * @cookie: Cookie passed at page table allocation time.
+ * @size: Size of the page table. This size should be fixed,
+ * and determined at creation time based on the granule size.
+ * @gfp: GFP flags.
+ *
+ * We want a custom allocator so we can use a cache for page table
+ * allocations and amortize the cost of the over-reservation that's
+ * done to allow asynchronous VM operations.
+ *
+ * Return: non-NULL on success, NULL if the allocation failed for any
+ * reason.
+ */
+static void *alloc_pt(void *cookie, size_t size, gfp_t gfp)
+{
+	struct panthor_vm *vm = cookie;
+	void *page;
+
+	/* Allocation of the root page table happening during init. */
+	if (unlikely(!vm->root_page_table)) {
+		struct page *p;
+
+		drm_WARN_ON(&vm->ptdev->base, vm->op_ctx);
+		p = alloc_pages_node(dev_to_node(vm->ptdev->base.dev),
+				     gfp | __GFP_ZERO, get_order(size));
+		page = p ? page_address(p) : NULL;
+		vm->root_page_table = page;
+		return page;
+	}
+
+	/* We're not supposed to have anything bigger than 4k here, because we picked a
+	 * 4k granule size at init time.
+	 */
+	if (drm_WARN_ON(&vm->ptdev->base, size != SZ_4K))
+		return NULL;
+
+	/* We must have some op_ctx attached to the VM and it must have at least one
+	 * free page.
+	 */
+	if (drm_WARN_ON(&vm->ptdev->base, !vm->op_ctx) ||
+	    drm_WARN_ON(&vm->ptdev->base,
+			vm->op_ctx->rsvd_page_tables.ptr >= vm->op_ctx->rsvd_page_tables.count))
+		return NULL;
+
+	page = vm->op_ctx->rsvd_page_tables.pages[vm->op_ctx->rsvd_page_tables.ptr++];
+	memset(page, 0, SZ_4K);
+
+	/* Page table entries don't use virtual addresses, which trips out
+	 * kmemleak. kmemleak_alloc_phys() might work, but physical addresses
+	 * are mixed with other fields, and I fear kmemleak won't detect that
+	 * either.
+	 *
+	 * Let's just ignore memory passed to the page-table driver for now.
+	 */
+	kmemleak_ignore(page);
+	return page;
+}
+
+/**
+ * @free_pt() - Custom page table free function
+ * @cookie: Cookie passed at page table allocation time.
+ * @data: Page table to free.
+ * @size: Size of the page table. This size should be fixed,
+ * and determined at creation time based on the granule size.
+ */
+static void free_pt(void *cookie, void *data, size_t size)
+{
+	struct panthor_vm *vm = cookie;
+
+	if (unlikely(vm->root_page_table == data)) {
+		free_pages((unsigned long)data, get_order(size));
+		vm->root_page_table = NULL;
+		return;
+	}
+
+	if (drm_WARN_ON(&vm->ptdev->base, size != SZ_4K))
+		return;
+
+	/* Return the page to the pt_cache. */
+	kmem_cache_free(pt_cache, data);
+}
+
+static int wait_ready(struct panthor_device *ptdev, u32 as_nr)
+{
+	int ret;
+	u32 val;
+
+	/* Wait for the MMU status to indicate there is no active command, in
+	 * case one is pending.
+	 */
+	ret = readl_relaxed_poll_timeout_atomic(ptdev->iomem + AS_STATUS(as_nr),
+						val, !(val & AS_STATUS_AS_ACTIVE),
+						10, 100000);
+
+	if (ret) {
+		panthor_device_schedule_reset(ptdev);
+		drm_err(&ptdev->base, "AS_ACTIVE bit stuck\n");
+	}
+
+	return ret;
+}
+
+static int write_cmd(struct panthor_device *ptdev, u32 as_nr, u32 cmd)
+{
+	int status;
+
+	/* write AS_COMMAND when MMU is ready to accept another command */
+	status = wait_ready(ptdev, as_nr);
+	if (!status)
+		gpu_write(ptdev, AS_COMMAND(as_nr), cmd);
+
+	return status;
+}
+
+static void lock_region(struct panthor_device *ptdev, u32 as_nr,
+			u64 region_start, u64 size)
+{
+	u8 region_width;
+	u64 region;
+	u64 region_end = region_start + size;
+
+	if (!size)
+		return;
+
+	/*
+	 * The locked region is a naturally aligned power of 2 block encoded as
+	 * log2 minus(1).
+	 * Calculate the desired start/end and look for the highest bit which
+	 * differs. The smallest naturally aligned block must include this bit
+	 * change, the desired region starts with this bit (and subsequent bits)
+	 * zeroed and ends with the bit (and subsequent bits) set to one.
+	 */
+	region_width = max(fls64(region_start ^ (region_end - 1)),
+			   const_ilog2(AS_LOCK_REGION_MIN_SIZE)) - 1;
+
+	/*
+	 * Mask off the low bits of region_start (which would be ignored by
+	 * the hardware anyway)
+	 */
+	region_start &= GENMASK_ULL(63, region_width);
+
+	region = region_width | region_start;
+
+	/* Lock the region that needs to be updated */
+	gpu_write(ptdev, AS_LOCKADDR_LO(as_nr), lower_32_bits(region));
+	gpu_write(ptdev, AS_LOCKADDR_HI(as_nr), upper_32_bits(region));
+	write_cmd(ptdev, as_nr, AS_COMMAND_LOCK);
+}
+
+static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
+				      u64 iova, u64 size, u32 op)
+{
+	lockdep_assert_held(&ptdev->mmu->as.slots_lock);
+
+	if (as_nr < 0)
+		return 0;
+
+	if (op != AS_COMMAND_UNLOCK)
+		lock_region(ptdev, as_nr, iova, size);
+
+	/* Run the MMU operation */
+	write_cmd(ptdev, as_nr, op);
+
+	/* Wait for the flush to complete */
+	return wait_ready(ptdev, as_nr);
+}
+
+static int mmu_hw_do_operation(struct panthor_vm *vm,
+			       u64 iova, u64 size, u32 op)
+{
+	struct panthor_device *ptdev = vm->ptdev;
+	int ret;
+
+	mutex_lock(&ptdev->mmu->as.slots_lock);
+	ret = mmu_hw_do_operation_locked(ptdev, vm->as.id, iova, size, op);
+	mutex_unlock(&ptdev->mmu->as.slots_lock);
+
+	return ret;
+}
+
+static int panthor_mmu_as_enable(struct panthor_device *ptdev, u32 as_nr,
+				 u64 transtab, u64 transcfg, u64 memattr)
+{
+	int ret;
+
+	ret = mmu_hw_do_operation_locked(ptdev, as_nr, 0, ~0ULL, AS_COMMAND_FLUSH_MEM);
+	if (ret)
+		return ret;
+
+	gpu_write(ptdev, AS_TRANSTAB_LO(as_nr), lower_32_bits(transtab));
+	gpu_write(ptdev, AS_TRANSTAB_HI(as_nr), upper_32_bits(transtab));
+
+	gpu_write(ptdev, AS_MEMATTR_LO(as_nr), lower_32_bits(memattr));
+	gpu_write(ptdev, AS_MEMATTR_HI(as_nr), upper_32_bits(memattr));
+
+	gpu_write(ptdev, AS_TRANSCFG_LO(as_nr), lower_32_bits(transcfg));
+	gpu_write(ptdev, AS_TRANSCFG_HI(as_nr), upper_32_bits(transcfg));
+
+	return write_cmd(ptdev, as_nr, AS_COMMAND_UPDATE);
+}
+
+static int panthor_mmu_as_disable(struct panthor_device *ptdev, u32 as_nr)
+{
+	int ret;
+
+	ret = mmu_hw_do_operation_locked(ptdev, as_nr, 0, ~0ULL, AS_COMMAND_FLUSH_MEM);
+	if (ret)
+		return ret;
+
+	gpu_write(ptdev, AS_TRANSTAB_LO(as_nr), 0);
+	gpu_write(ptdev, AS_TRANSTAB_HI(as_nr), 0);
+
+	gpu_write(ptdev, AS_MEMATTR_LO(as_nr), 0);
+	gpu_write(ptdev, AS_MEMATTR_HI(as_nr), 0);
+
+	gpu_write(ptdev, AS_TRANSCFG_LO(as_nr), AS_TRANSCFG_ADRMODE_UNMAPPED);
+	gpu_write(ptdev, AS_TRANSCFG_HI(as_nr), 0);
+
+	return write_cmd(ptdev, as_nr, AS_COMMAND_UPDATE);
+}
+
+static u32 panthor_mmu_fault_mask(struct panthor_device *ptdev, u32 value)
+{
+	/* Bits 16 to 31 mean REQ_COMPLETE. */
+	return value & GENMASK(15, 0);
+}
+
+static u32 panthor_mmu_as_fault_mask(struct panthor_device *ptdev, u32 as)
+{
+	return BIT(as);
+}
+
+/**
+ * panthor_vm_has_unhandled_faults() - Check if a VM has unhandled faults
+ * @vm: VM to check.
+ *
+ * Return: true if the VM has unhandled faults, false otherwise.
+ */
+bool panthor_vm_has_unhandled_faults(struct panthor_vm *vm)
+{
+	return vm->unhandled_fault;
+}
+
+/**
+ * panthor_vm_is_unusable() - Check if the VM is still usable
+ * @vm: VM to check.
+ *
+ * Return: true if the VM is unusable, false otherwise.
+ */
+bool panthor_vm_is_unusable(struct panthor_vm *vm)
+{
+	return vm->unusable;
+}
+
+static void panthor_vm_release_as_locked(struct panthor_vm *vm)
+{
+	struct panthor_device *ptdev = vm->ptdev;
+
+	lockdep_assert_held(&ptdev->mmu->as.slots_lock);
+
+	if (drm_WARN_ON(&ptdev->base, vm->as.id < 0))
+		return;
+
+	ptdev->mmu->as.slots[vm->as.id].vm = NULL;
+	clear_bit(vm->as.id, &ptdev->mmu->as.alloc_mask);
+	refcount_set(&vm->as.active_cnt, 0);
+	list_del_init(&vm->as.lru_node);
+	vm->as.id = -1;
+}
+
+/**
+ * panthor_vm_active() - Flag a VM as active
+ * @VM: VM to flag as active.
+ *
+ * Assigns an address space to a VM so it can be used by the GPU/MCU.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int panthor_vm_active(struct panthor_vm *vm)
+{
+	struct panthor_device *ptdev = vm->ptdev;
+	u32 va_bits = GPU_MMU_FEATURES_VA_BITS(ptdev->gpu_info.mmu_features);
+	struct io_pgtable_cfg *cfg = &io_pgtable_ops_to_pgtable(vm->pgtbl_ops)->cfg;
+	int ret = 0, as, cookie;
+	u64 transtab, transcfg;
+
+	if (!drm_dev_enter(&ptdev->base, &cookie))
+		return -ENODEV;
+
+	if (refcount_inc_not_zero(&vm->as.active_cnt))
+		goto out_dev_exit;
+
+	mutex_lock(&ptdev->mmu->as.slots_lock);
+
+	if (refcount_inc_not_zero(&vm->as.active_cnt))
+		goto out_unlock;
+
+	as = vm->as.id;
+	if (as >= 0) {
+		/* Unhandled pagefault on this AS, the MMU was disabled. We need to
+		 * re-enable the MMU after clearing+unmasking the AS interrupts.
+		 */
+		if (ptdev->mmu->as.faulty_mask & panthor_mmu_as_fault_mask(ptdev, as))
+			goto out_enable_as;
+
+		goto out_make_active;
+	}
+
+	/* Check for a free AS */
+	if (vm->for_mcu) {
+		drm_WARN_ON(&ptdev->base, ptdev->mmu->as.alloc_mask & BIT(0));
+		as = 0;
+	} else {
+		as = ffz(ptdev->mmu->as.alloc_mask | BIT(0));
+	}
+
+	if (!(BIT(as) & ptdev->gpu_info.as_present)) {
+		struct panthor_vm *lru_vm;
+
+		lru_vm = list_first_entry_or_null(&ptdev->mmu->as.lru_list,
+						  struct panthor_vm,
+						  as.lru_node);
+		if (drm_WARN_ON(&ptdev->base, !lru_vm)) {
+			ret = -EBUSY;
+			goto out_unlock;
+		}
+
+		drm_WARN_ON(&ptdev->base, refcount_read(&lru_vm->as.active_cnt));
+		as = lru_vm->as.id;
+		panthor_vm_release_as_locked(lru_vm);
+	}
+
+	/* Assign the free or reclaimed AS to the FD */
+	vm->as.id = as;
+	set_bit(as, &ptdev->mmu->as.alloc_mask);
+	ptdev->mmu->as.slots[as].vm = vm;
+
+out_enable_as:
+	transtab = cfg->arm_lpae_s1_cfg.ttbr;
+	transcfg = AS_TRANSCFG_PTW_MEMATTR_WB |
+		   AS_TRANSCFG_PTW_RA |
+		   AS_TRANSCFG_ADRMODE_AARCH64_4K |
+		   AS_TRANSCFG_INA_BITS(55 - va_bits);
+	if (ptdev->coherent)
+		transcfg |= AS_TRANSCFG_PTW_SH_OS;
+
+	/* If the VM is re-activated, we clear the fault. */
+	vm->unhandled_fault = false;
+
+	/* Unhandled pagefault on this AS, clear the fault and re-enable interrupts
+	 * before enabling the AS.
+	 */
+	if (ptdev->mmu->as.faulty_mask & panthor_mmu_as_fault_mask(ptdev, as)) {
+		gpu_write(ptdev, MMU_INT_CLEAR, panthor_mmu_as_fault_mask(ptdev, as));
+		ptdev->mmu->as.faulty_mask &= ~panthor_mmu_as_fault_mask(ptdev, as);
+		gpu_write(ptdev, MMU_INT_MASK, ~ptdev->mmu->as.faulty_mask);
+	}
+
+	ret = panthor_mmu_as_enable(vm->ptdev, vm->as.id, transtab, transcfg, vm->memattr);
+
+out_make_active:
+	if (!ret) {
+		refcount_set(&vm->as.active_cnt, 1);
+		list_del_init(&vm->as.lru_node);
+	}
+
+out_unlock:
+	mutex_unlock(&ptdev->mmu->as.slots_lock);
+
+out_dev_exit:
+	drm_dev_exit(cookie);
+	return ret;
+}
+
+/**
+ * panthor_vm_idle() - Flag a VM idle
+ * @VM: VM to flag as idle.
+ *
+ * When we know the GPU is done with the VM (no more jobs to process),
+ * we can relinquish the AS slot attached to this VM, if any.
+ *
+ * We don't release the slot immediately, but instead place the VM in
+ * the LRU list, so it can be evicted if another VM needs an AS slot.
+ * This way, VMs keep attached to the AS they were given until we run
+ * out of free slot, limiting the number of MMU operations (TLB flush
+ * and other AS updates).
+ */
+void panthor_vm_idle(struct panthor_vm *vm)
+{
+	struct panthor_device *ptdev = vm->ptdev;
+
+	if (!refcount_dec_and_mutex_lock(&vm->as.active_cnt, &ptdev->mmu->as.slots_lock))
+		return;
+
+	if (!drm_WARN_ON(&ptdev->base, vm->as.id == -1 || !list_empty(&vm->as.lru_node)))
+		list_add_tail(&vm->as.lru_node, &ptdev->mmu->as.lru_list);
+
+	refcount_set(&vm->as.active_cnt, 0);
+	mutex_unlock(&ptdev->mmu->as.slots_lock);
+}
+
+static void panthor_vm_stop(struct panthor_vm *vm)
+{
+	drm_sched_stop(&vm->sched, NULL);
+}
+
+static void panthor_vm_start(struct panthor_vm *vm)
+{
+	drm_sched_start(&vm->sched, true);
+}
+
+/**
+ * panthor_vm_as() - Get the AS slot attached to a VM
+ * @vm: VM to get the AS slot of.
+ *
+ * Return: -1 if the VM is not assigned an AS slot yet, >= 0 otherwise.
+ */
+int panthor_vm_as(struct panthor_vm *vm)
+{
+	return vm->as.id;
+}
+
+static size_t get_pgsize(u64 addr, size_t size, size_t *count)
+{
+	/*
+	 * io-pgtable only operates on multiple pages within a single table
+	 * entry, so we need to split at boundaries of the table size, i.e.
+	 * the next block size up. The distance from address A to the next
+	 * boundary of block size B is logically B - A % B, but in unsigned
+	 * two's complement where B is a power of two we get the equivalence
+	 * B - A % B == (B - A) % B == (n * B - A) % B, and choose n = 0 :)
+	 */
+	size_t blk_offset = -addr % SZ_2M;
+
+	if (blk_offset || size < SZ_2M) {
+		*count = min_not_zero(blk_offset, size) / SZ_4K;
+		return SZ_4K;
+	}
+	blk_offset = -addr % SZ_1G ?: SZ_1G;
+	*count = min(blk_offset, size) / SZ_2M;
+	return SZ_2M;
+}
+
+static int panthor_vm_flush_range(struct panthor_vm *vm, u64 iova, u64 size)
+{
+	struct panthor_device *ptdev = vm->ptdev;
+	int ret = 0, cookie;
+
+	if (vm->as.id < 0)
+		return 0;
+
+	/* If the device is unplugged, we just silently skip the flush. */
+	if (!drm_dev_enter(&ptdev->base, &cookie))
+		return 0;
+
+	/* Flush the PTs only if we're already awake */
+	if (pm_runtime_active(ptdev->base.dev))
+		ret = mmu_hw_do_operation(vm, iova, size, AS_COMMAND_FLUSH_PT);
+
+	drm_dev_exit(cookie);
+	return ret;
+}
+
+static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
+{
+	struct panthor_device *ptdev = vm->ptdev;
+	struct io_pgtable_ops *ops = vm->pgtbl_ops;
+	u64 offset = 0;
+
+	drm_dbg(&ptdev->base, "unmap: as=%d, iova=%llx, len=%llx", vm->as.id, iova, size);
+
+	while (offset < size) {
+		size_t unmapped_sz = 0, pgcount;
+		size_t pgsize = get_pgsize(iova + offset, size - offset, &pgcount);
+
+		unmapped_sz = ops->unmap_pages(ops, iova + offset, pgsize, pgcount, NULL);
+
+		if (drm_WARN_ON(&ptdev->base, unmapped_sz != pgsize * pgcount)) {
+			drm_err(&ptdev->base, "failed to unmap range %llx-%llx (requested range %llx-%llx)\n",
+				iova + offset + unmapped_sz,
+				iova + offset + pgsize * pgcount,
+				iova, iova + size);
+			panthor_vm_flush_range(vm, iova, offset + unmapped_sz);
+			return  -EINVAL;
+		}
+		offset += unmapped_sz;
+	}
+
+	return panthor_vm_flush_range(vm, iova, size);
+}
+
+static int
+panthor_vm_map_pages(struct panthor_vm *vm, u64 iova, int prot,
+		     struct sg_table *sgt, u64 offset, u64 size)
+{
+	struct panthor_device *ptdev = vm->ptdev;
+	unsigned int count;
+	struct scatterlist *sgl;
+	struct io_pgtable_ops *ops = vm->pgtbl_ops;
+	u64 start_iova = iova;
+	int ret;
+
+	if (!size)
+		return 0;
+
+	for_each_sgtable_dma_sg(sgt, sgl, count) {
+		dma_addr_t paddr = sg_dma_address(sgl);
+		size_t len = sg_dma_len(sgl);
+
+		if (len <= offset) {
+			offset -= len;
+			continue;
+		}
+
+		paddr += offset;
+		len -= offset;
+		len = min_t(size_t, len, size);
+		size -= len;
+
+		drm_dbg(&ptdev->base, "map: as=%d, iova=%llx, paddr=%pad, len=%zx",
+			vm->as.id, iova, &paddr, len);
+
+		while (len) {
+			size_t pgcount, mapped = 0;
+			size_t pgsize = get_pgsize(iova | paddr, len, &pgcount);
+
+			ret = ops->map_pages(ops, iova, paddr, pgsize, pgcount, prot,
+					     GFP_KERNEL, &mapped);
+			iova += mapped;
+			paddr += mapped;
+			len -= mapped;
+
+			if (drm_WARN_ON(&ptdev->base, !ret && !mapped))
+				ret = -ENOMEM;
+
+			if (ret) {
+				/* If something failed, unmap what we've already mapped before
+				 * returning. The unmap call is not supposed to fail.
+				 */
+				drm_WARN_ON(&ptdev->base,
+					    panthor_vm_unmap_pages(vm, start_iova,
+								   iova - start_iova));
+				return ret;
+			}
+		}
+
+		if (!size)
+			break;
+	}
+
+	return panthor_vm_flush_range(vm, start_iova, iova - start_iova);
+}
+
+static int flags_to_prot(u32 flags)
+{
+	int prot = 0;
+
+	if (flags & DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC)
+		prot |= IOMMU_NOEXEC;
+
+	if (!(flags & DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED))
+		prot |= IOMMU_CACHE;
+
+	if (flags & DRM_PANTHOR_VM_BIND_OP_MAP_READONLY)
+		prot |= IOMMU_READ;
+	else
+		prot |= IOMMU_READ | IOMMU_WRITE;
+
+	return prot;
+}
+
+/**
+ * panthor_vm_alloc_va() - Allocate a region in the auto-va space
+ * @VM: VM to allocate a region on.
+ * @va: start of the VA range. Can be PANTHOR_VM_KERNEL_AUTO_VA if the user
+ * wants the VA to be automatically allocated from the auto-VA range.
+ * @size: size of the VA range.
+ * @va_node: drm_mm_node to initialize. Must be zero-initialized.
+ *
+ * Some GPU objects, like heap chunks, are fully managed by the kernel and
+ * need to be mapped to the userspace VM, in the region reserved for kernel
+ * objects.
+ *
+ * This function takes care of allocating a region in the kernel auto-VA space.
+ *
+ * Return: 0 on success, an error code otherwise.
+ */
+int
+panthor_vm_alloc_va(struct panthor_vm *vm, u64 va, u64 size,
+		    struct drm_mm_node *va_node)
+{
+	int ret;
+
+	if (!size || (size & ~PAGE_MASK))
+		return -EINVAL;
+
+	if (va != PANTHOR_VM_KERNEL_AUTO_VA && (va & ~PAGE_MASK))
+		return -EINVAL;
+
+	mutex_lock(&vm->mm_lock);
+	if (va != PANTHOR_VM_KERNEL_AUTO_VA) {
+		va_node->start = va;
+		va_node->size = size;
+		ret = drm_mm_reserve_node(&vm->mm, va_node);
+	} else {
+		ret = drm_mm_insert_node_in_range(&vm->mm, va_node, size,
+						  size >= SZ_2M ? SZ_2M : SZ_4K,
+						  0, vm->kernel_auto_va.start,
+						  vm->kernel_auto_va.end,
+						  DRM_MM_INSERT_BEST);
+	}
+	mutex_unlock(&vm->mm_lock);
+
+	return ret;
+}
+
+/**
+ * panthor_vm_free_va() - Free a region allocated with panthor_vm_alloc_va()
+ * @VM: VM to free the region on.
+ * @va_node: Memory node representing the region to free.
+ */
+void panthor_vm_free_va(struct panthor_vm *vm, struct drm_mm_node *va_node)
+{
+	mutex_lock(&vm->mm_lock);
+	drm_mm_remove_node(va_node);
+	mutex_unlock(&vm->mm_lock);
+}
+
+static void panthor_vm_bo_put(struct drm_gpuvm_bo *vm_bo)
+{
+	struct panthor_gem_object *bo = to_panthor_bo(vm_bo->obj);
+	struct drm_gpuvm *vm = vm_bo->vm;
+	bool unpin;
+
+	/* We must retain the GEM before calling drm_gpuvm_bo_put(),
+	 * otherwise the mutex might be destroyed while we hold it.
+	 * Same goes for the VM, since we take the VM resv lock.
+	 */
+	drm_gem_object_get(&bo->base.base);
+	drm_gpuvm_get(vm);
+
+	/* We take the resv lock to protect against concurrent accesses to the
+	 * gpuvm evicted/extobj lists that are modified in
+	 * drm_gpuvm_bo_destroy(), which is called if drm_gpuvm_bo_put()
+	 * releases sthe last vm_bo reference.
+	 * We take the BO GPUVA list lock to protect the vm_bo removal from the
+	 * GEM vm_bo list.
+	 */
+	dma_resv_lock(drm_gpuvm_resv(vm), NULL);
+	mutex_lock(&bo->gpuva_list_lock);
+	unpin = drm_gpuvm_bo_put(vm_bo);
+	mutex_unlock(&bo->gpuva_list_lock);
+	dma_resv_unlock(drm_gpuvm_resv(vm));
+
+	/* If the vm_bo object was destroyed, release the pin reference that
+	 * was hold by this object.
+	 */
+	if (unpin && !bo->base.base.import_attach)
+		drm_gem_shmem_unpin(&bo->base);
+
+	drm_gpuvm_put(vm);
+	drm_gem_object_put(&bo->base.base);
+}
+
+static void panthor_vm_cleanup_op_ctx(struct panthor_vm_op_ctx *op_ctx,
+				      struct panthor_vm *vm)
+{
+	struct panthor_vma *vma, *tmp_vma;
+
+	u32 remaining_pt_count = op_ctx->rsvd_page_tables.count -
+				 op_ctx->rsvd_page_tables.ptr;
+
+	if (remaining_pt_count) {
+		kmem_cache_free_bulk(pt_cache, remaining_pt_count,
+				     op_ctx->rsvd_page_tables.pages +
+				     op_ctx->rsvd_page_tables.ptr);
+	}
+
+	kfree(op_ctx->rsvd_page_tables.pages);
+
+	if (op_ctx->map.vm_bo)
+		panthor_vm_bo_put(op_ctx->map.vm_bo);
+
+	for (u32 i = 0; i < ARRAY_SIZE(op_ctx->preallocated_vmas); i++)
+		kfree(op_ctx->preallocated_vmas[i]);
+
+	list_for_each_entry_safe(vma, tmp_vma, &op_ctx->returned_vmas, node) {
+		list_del(&vma->node);
+		panthor_vm_bo_put(vma->base.vm_bo);
+		kfree(vma);
+	}
+}
+
+static struct panthor_vma *
+panthor_vm_op_ctx_get_vma(struct panthor_vm_op_ctx *op_ctx)
+{
+	for (u32 i = 0; i < ARRAY_SIZE(op_ctx->preallocated_vmas); i++) {
+		struct panthor_vma *vma = op_ctx->preallocated_vmas[i];
+
+		if (vma) {
+			op_ctx->preallocated_vmas[i] = NULL;
+			return vma;
+		}
+	}
+
+	return NULL;
+}
+
+static int
+panthor_vm_op_ctx_prealloc_vmas(struct panthor_vm_op_ctx *op_ctx)
+{
+	u32 vma_count;
+
+	switch (op_ctx->flags & DRM_PANTHOR_VM_BIND_OP_TYPE_MASK) {
+	case DRM_PANTHOR_VM_BIND_OP_TYPE_MAP:
+		/* One VMA for the new mapping, and two more VMAs for the remap case
+		 * which might contain both a prev and next VA.
+		 */
+		vma_count = 3;
+		break;
+
+	case DRM_PANTHOR_VM_BIND_OP_TYPE_UNMAP:
+		/* Partial unmaps might trigger a remap with either a prev or a next VA,
+		 * but not both.
+		 */
+		vma_count = 1;
+		break;
+
+	default:
+		return 0;
+	}
+
+	for (u32 i = 0; i < vma_count; i++) {
+		struct panthor_vma *vma = kzalloc(sizeof(*vma), GFP_KERNEL);
+
+		if (!vma)
+			return -ENOMEM;
+
+		op_ctx->preallocated_vmas[i] = vma;
+	}
+
+	return 0;
+}
+
+#define PANTHOR_VM_BIND_OP_MAP_FLAGS \
+	(DRM_PANTHOR_VM_BIND_OP_MAP_READONLY | \
+	 DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC | \
+	 DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED | \
+	 DRM_PANTHOR_VM_BIND_OP_TYPE_MASK)
+
+static int panthor_vm_prepare_map_op_ctx(struct panthor_vm_op_ctx *op_ctx,
+					 struct panthor_vm *vm,
+					 struct panthor_gem_object *bo,
+					 u64 offset,
+					 u64 size, u64 va,
+					 u32 flags)
+{
+	struct drm_gpuvm_bo *preallocated_vm_bo;
+	struct sg_table *sgt = NULL;
+	u64 pt_count;
+	int ret;
+
+	if (!bo)
+		return -EINVAL;
+
+	if ((flags & ~PANTHOR_VM_BIND_OP_MAP_FLAGS) ||
+	    (flags & DRM_PANTHOR_VM_BIND_OP_TYPE_MASK) != DRM_PANTHOR_VM_BIND_OP_TYPE_MAP)
+		return -EINVAL;
+
+	/* Make sure the VA and size are aligned and in-bounds. */
+	if (size > bo->base.base.size || offset > bo->base.base.size - size)
+		return -EINVAL;
+
+	/* If the BO has an exclusive VM attached, it can't be mapped to other VMs. */
+	if (bo->exclusive_vm_root_gem &&
+	    bo->exclusive_vm_root_gem != panthor_vm_root_gem(vm))
+		return -EINVAL;
+
+	memset(op_ctx, 0, sizeof(*op_ctx));
+	INIT_LIST_HEAD(&op_ctx->returned_vmas);
+	op_ctx->flags = flags;
+	op_ctx->va.range = size;
+	op_ctx->va.addr = va;
+
+	ret = panthor_vm_op_ctx_prealloc_vmas(op_ctx);
+	if (ret)
+		goto err_cleanup;
+
+	if (!bo->base.base.import_attach) {
+		/* Pre-reserve the BO pages, so the map operation doesn't have to
+		 * allocate.
+		 */
+		ret = drm_gem_shmem_pin(&bo->base);
+		if (ret)
+			goto err_cleanup;
+	}
+
+	sgt = drm_gem_shmem_get_pages_sgt(&bo->base);
+	if (IS_ERR(sgt)) {
+		if (!bo->base.base.import_attach)
+			drm_gem_shmem_unpin(&bo->base);
+
+		ret = PTR_ERR(sgt);
+		goto err_cleanup;
+	}
+
+	op_ctx->map.sgt = sgt;
+
+	preallocated_vm_bo = drm_gpuvm_bo_create(&vm->base, &bo->base.base);
+	if (!preallocated_vm_bo) {
+		if (!bo->base.base.import_attach)
+			drm_gem_shmem_unpin(&bo->base);
+
+		ret = -ENOMEM;
+		goto err_cleanup;
+	}
+
+	mutex_lock(&bo->gpuva_list_lock);
+	op_ctx->map.vm_bo = drm_gpuvm_bo_obtain_prealloc(preallocated_vm_bo);
+	mutex_unlock(&bo->gpuva_list_lock);
+
+	/* If the a vm_bo for this <VM,BO> combination exists, it already
+	 * retains a pin ref, and we can release the one we took earlier.
+	 *
+	 * If our pre-allocated vm_bo is picked, it now retains the pin ref,
+	 * which will be released in panthor_vm_bo_put().
+	 */
+	if (preallocated_vm_bo != op_ctx->map.vm_bo &&
+	    !bo->base.base.import_attach)
+		drm_gem_shmem_unpin(&bo->base);
+
+	op_ctx->map.bo_offset = offset;
+
+	/* L1, L2 and L3 page tables.
+	 * We could optimize L3 allocation by iterating over the sgt and merging
+	 * 2M contiguous blocks, but it's simpler to over-provision and return
+	 * the pages if they're not used.
+	 */
+	pt_count = ((ALIGN(va + size, 1ull << 39) - ALIGN_DOWN(va, 1ull << 39)) >> 39) +
+		   ((ALIGN(va + size, 1ull << 30) - ALIGN_DOWN(va, 1ull << 30)) >> 30) +
+		   ((ALIGN(va + size, 1ull << 21) - ALIGN_DOWN(va, 1ull << 21)) >> 21);
+
+	op_ctx->rsvd_page_tables.pages = kcalloc(pt_count,
+						 sizeof(*op_ctx->rsvd_page_tables.pages),
+						 GFP_KERNEL);
+	if (!op_ctx->rsvd_page_tables.pages)
+		goto err_cleanup;
+
+	ret = kmem_cache_alloc_bulk(pt_cache, GFP_KERNEL, pt_count,
+				    op_ctx->rsvd_page_tables.pages);
+	op_ctx->rsvd_page_tables.count = ret;
+	if (ret != pt_count) {
+		ret = -ENOMEM;
+		goto err_cleanup;
+	}
+
+	/* Insert BO into the extobj list last, when we know nothing can fail. */
+	dma_resv_lock(panthor_vm_resv(vm), NULL);
+	drm_gpuvm_bo_extobj_add(op_ctx->map.vm_bo);
+	dma_resv_unlock(panthor_vm_resv(vm));
+
+	return 0;
+
+err_cleanup:
+	panthor_vm_cleanup_op_ctx(op_ctx, vm);
+	return ret;
+}
+
+static int panthor_vm_prepare_unmap_op_ctx(struct panthor_vm_op_ctx *op_ctx,
+					   struct panthor_vm *vm,
+					   u64 va, u64 size)
+{
+	u32 pt_count = 0;
+	int ret;
+
+	memset(op_ctx, 0, sizeof(*op_ctx));
+	INIT_LIST_HEAD(&op_ctx->returned_vmas);
+	op_ctx->va.range = size;
+	op_ctx->va.addr = va;
+	op_ctx->flags = DRM_PANTHOR_VM_BIND_OP_TYPE_UNMAP;
+
+	/* Pre-allocate L3 page tables to account for the split-2M-block
+	 * situation on unmap.
+	 */
+	if (va != ALIGN(va, SZ_2M))
+		pt_count++;
+
+	if (va + size != ALIGN(va + size, SZ_2M) &&
+	    ALIGN(va + size, SZ_2M) != ALIGN(va, SZ_2M))
+		pt_count++;
+
+	ret = panthor_vm_op_ctx_prealloc_vmas(op_ctx);
+	if (ret)
+		goto err_cleanup;
+
+	if (pt_count) {
+		op_ctx->rsvd_page_tables.pages = kcalloc(pt_count,
+							 sizeof(*op_ctx->rsvd_page_tables.pages),
+							 GFP_KERNEL);
+		if (!op_ctx->rsvd_page_tables.pages)
+			goto err_cleanup;
+
+		ret = kmem_cache_alloc_bulk(pt_cache, GFP_KERNEL, pt_count,
+					    op_ctx->rsvd_page_tables.pages);
+		if (ret != pt_count) {
+			ret = -ENOMEM;
+			goto err_cleanup;
+		}
+		op_ctx->rsvd_page_tables.count = pt_count;
+	}
+
+	return 0;
+
+err_cleanup:
+	panthor_vm_cleanup_op_ctx(op_ctx, vm);
+	return ret;
+}
+
+static void panthor_vm_prepare_sync_only_op_ctx(struct panthor_vm_op_ctx *op_ctx,
+						struct panthor_vm *vm)
+{
+	memset(op_ctx, 0, sizeof(*op_ctx));
+	INIT_LIST_HEAD(&op_ctx->returned_vmas);
+	op_ctx->flags = DRM_PANTHOR_VM_BIND_OP_TYPE_SYNC_ONLY;
+}
+
+/**
+ * panthor_vm_get_bo_for_va() - Get the GEM object mapped at a virtual address
+ * @vm: VM to look into.
+ * @va: Virtual address to search for.
+ * @bo_offset: Offset of the GEM object mapped at this virtual address.
+ * Only valid on success.
+ *
+ * The object returned by this function might no longer be mapped when the
+ * function returns. It's the caller responsibility to ensure there's no
+ * concurrent map/unmap operations making the returned value invalid, or
+ * make sure it doesn't matter if the object is no longer mapped.
+ *
+ * Return: A valid pointer on success, an ERR_PTR() otherwise.
+ */
+struct panthor_gem_object *
+panthor_vm_get_bo_for_va(struct panthor_vm *vm, u64 va, u64 *bo_offset)
+{
+	struct panthor_gem_object *bo = ERR_PTR(-ENOENT);
+	struct drm_gpuva *gpuva;
+	struct panthor_vma *vma;
+
+	/* Take the VM lock to prevent concurrent map/unmap operations. */
+	mutex_lock(&vm->op_lock);
+	gpuva = drm_gpuva_find_first(&vm->base, va, 1);
+	vma = gpuva ? container_of(gpuva, struct panthor_vma, base) : NULL;
+	if (vma && vma->base.gem.obj) {
+		drm_gem_object_get(vma->base.gem.obj);
+		bo = to_panthor_bo(vma->base.gem.obj);
+		*bo_offset = vma->base.gem.offset + (va - vma->base.va.addr);
+	}
+	mutex_unlock(&vm->op_lock);
+
+	return bo;
+}
+
+#define PANTHOR_VM_MIN_KERNEL_VA_SIZE	SZ_256M
+
+static u64
+panthor_vm_create_get_user_va_range(const struct drm_panthor_vm_create *args,
+				    u64 full_va_range)
+{
+	u64 user_va_range;
+
+	/* Make sure we have a minimum amount of VA space for kernel objects. */
+	if (full_va_range < PANTHOR_VM_MIN_KERNEL_VA_SIZE)
+		return 0;
+
+	if (args->user_va_range) {
+		/* Use the user provided value if != 0. */
+		user_va_range = args->user_va_range;
+	} else if (TASK_SIZE_OF(current) < full_va_range) {
+		/* If the task VM size is smaller than the GPU VA range, pick this
+		 * as our default user VA range, so userspace can CPU/GPU map buffers
+		 * at the same address.
+		 */
+		user_va_range = TASK_SIZE_OF(current);
+	} else {
+		/* If the GPU VA range is smaller than the task VM size, we
+		 * just have to live with the fact we won't be able to map
+		 * all buffers at the same GPU/CPU address.
+		 *
+		 * If the GPU VA range is bigger than 4G (more than 32-bit of
+		 * VA), we split the range in two, and assign half of it to
+		 * the user and the other half to the kernel, if it's not, we
+		 * keep the kernel VA space as small as possible.
+		 */
+		user_va_range = full_va_range > SZ_4G ?
+				full_va_range / 2 :
+				full_va_range - PANTHOR_VM_MIN_KERNEL_VA_SIZE;
+	}
+
+	if (full_va_range - PANTHOR_VM_MIN_KERNEL_VA_SIZE < user_va_range)
+		user_va_range = full_va_range - PANTHOR_VM_MIN_KERNEL_VA_SIZE;
+
+	return user_va_range;
+}
+
+#define PANTHOR_VM_CREATE_FLAGS		0
+
+static int
+panthor_vm_create_check_args(const struct panthor_device *ptdev,
+			     const struct drm_panthor_vm_create *args,
+			     u64 *kernel_va_start, u64 *kernel_va_range)
+{
+	u32 va_bits = GPU_MMU_FEATURES_VA_BITS(ptdev->gpu_info.mmu_features);
+	u64 full_va_range = 1ull << va_bits;
+	u64 user_va_range;
+
+	if (args->flags & ~PANTHOR_VM_CREATE_FLAGS)
+		return -EINVAL;
+
+	user_va_range = panthor_vm_create_get_user_va_range(args, full_va_range);
+	if (!user_va_range || (args->user_va_range && args->user_va_range > user_va_range))
+		return -EINVAL;
+
+	/* Pick a kernel VA range that's a power of two, to have a clear split. */
+	*kernel_va_range = rounddown_pow_of_two(full_va_range - user_va_range);
+	*kernel_va_start = full_va_range - *kernel_va_range;
+	return 0;
+}
+
+/*
+ * Only 32 VMs per open file. If that becomes a limiting factor, we can
+ * increase this number.
+ */
+#define PANTHOR_MAX_VMS_PER_FILE	32
+
+/**
+ * panthor_vm_pool_create_vm() - Create a VM
+ * @pool: The VM to create this VM on.
+ * @kernel_va_start: Start of the region reserved for kernel objects.
+ * @kernel_va_range: Size of the region reserved for kernel objects.
+ *
+ * Return: a positive VM ID on success, a negative error code otherwise.
+ */
+int panthor_vm_pool_create_vm(struct panthor_device *ptdev,
+			      struct panthor_vm_pool *pool,
+			      struct drm_panthor_vm_create *args)
+{
+	u64 kernel_va_start, kernel_va_range;
+	struct panthor_vm *vm;
+	int ret;
+	u32 id;
+
+	ret = panthor_vm_create_check_args(ptdev, args, &kernel_va_start, &kernel_va_range);
+	if (ret)
+		return ret;
+
+	vm = panthor_vm_create(ptdev, false, kernel_va_start, kernel_va_range,
+			       kernel_va_start, kernel_va_range);
+	if (IS_ERR(vm))
+		return PTR_ERR(vm);
+
+	ret = xa_alloc(&pool->xa, &id, vm,
+		       XA_LIMIT(1, PANTHOR_MAX_VMS_PER_FILE), GFP_KERNEL);
+
+	if (ret) {
+		panthor_vm_put(vm);
+		return ret;
+	}
+
+	args->user_va_range = kernel_va_start;
+	return id;
+}
+
+static void panthor_vm_destroy(struct panthor_vm *vm)
+{
+	if (!vm)
+		return;
+
+	vm->destroyed = true;
+
+	mutex_lock(&vm->heaps.lock);
+	panthor_heap_pool_destroy(vm->heaps.pool);
+	vm->heaps.pool = NULL;
+	mutex_unlock(&vm->heaps.lock);
+
+	drm_WARN_ON(&vm->ptdev->base,
+		    panthor_vm_unmap_range(vm, vm->base.mm_start, vm->base.mm_range));
+	panthor_vm_put(vm);
+}
+
+/**
+ * panthor_vm_pool_destroy_vm() - Destroy a VM.
+ * @pool: VM pool.
+ * @handle: VM handle.
+ *
+ * This function doesn't free the VM object or its resources, it just kills
+ * all mappings, and makes sure nothing can be mapped after that point.
+ *
+ * If there was any active jobs at the time this function is called, these
+ * jobs should experience page faults and be killed as a result.
+ *
+ * The VM resources are freed when the last reference on the VM object is
+ * dropped.
+ */
+int panthor_vm_pool_destroy_vm(struct panthor_vm_pool *pool, u32 handle)
+{
+	struct panthor_vm *vm;
+
+	vm = xa_erase(&pool->xa, handle);
+
+	panthor_vm_destroy(vm);
+
+	return vm ? 0 : -EINVAL;
+}
+
+/**
+ * panthor_vm_pool_get_vm() - Retrieve VM object bound to a VM handle
+ * @pool: VM pool to check.
+ * @handle: Handle of the VM to retrieve.
+ *
+ * Return: A valid pointer if the VM exists, NULL otherwise.
+ */
+struct panthor_vm *
+panthor_vm_pool_get_vm(struct panthor_vm_pool *pool, u32 handle)
+{
+	struct panthor_vm *vm;
+
+	vm = panthor_vm_get(xa_load(&pool->xa, handle));
+
+	return vm;
+}
+
+/**
+ * panthor_vm_pool_destroy() - Destroy a VM pool.
+ * @pfile: File.
+ *
+ * Destroy all VMs in the pool, and release the pool resources.
+ *
+ * Note that VMs can outlive the pool they were created from if other
+ * objects hold a reference to there VMs.
+ */
+void panthor_vm_pool_destroy(struct panthor_file *pfile)
+{
+	struct panthor_vm *vm;
+	unsigned long i;
+
+	if (!pfile->vms)
+		return;
+
+	xa_for_each(&pfile->vms->xa, i, vm)
+		panthor_vm_destroy(vm);
+
+	xa_destroy(&pfile->vms->xa);
+	kfree(pfile->vms);
+}
+
+/**
+ * panthor_vm_pool_create() - Create a VM pool
+ * @pfile: File.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int panthor_vm_pool_create(struct panthor_file *pfile)
+{
+	pfile->vms = kzalloc(sizeof(*pfile->vms), GFP_KERNEL);
+	if (!pfile->vms)
+		return -ENOMEM;
+
+	xa_init_flags(&pfile->vms->xa, XA_FLAGS_ALLOC1);
+	return 0;
+}
+
+/* dummy TLB ops, the real TLB flush happens in panthor_vm_flush_range() */
+static void mmu_tlb_flush_all(void *cookie)
+{
+}
+
+static void mmu_tlb_flush_walk(unsigned long iova, size_t size, size_t granule, void *cookie)
+{
+}
+
+static const struct iommu_flush_ops mmu_tlb_ops = {
+	.tlb_flush_all = mmu_tlb_flush_all,
+	.tlb_flush_walk = mmu_tlb_flush_walk,
+};
+
+static const char *access_type_name(struct panthor_device *ptdev,
+				    u32 fault_status)
+{
+	switch (fault_status & AS_FAULTSTATUS_ACCESS_TYPE_MASK) {
+	case AS_FAULTSTATUS_ACCESS_TYPE_ATOMIC:
+		return "ATOMIC";
+	case AS_FAULTSTATUS_ACCESS_TYPE_READ:
+		return "READ";
+	case AS_FAULTSTATUS_ACCESS_TYPE_WRITE:
+		return "WRITE";
+	case AS_FAULTSTATUS_ACCESS_TYPE_EX:
+		return "EXECUTE";
+	default:
+		drm_WARN_ON(&ptdev->base, 1);
+		return NULL;
+	}
+}
+
+static void panthor_mmu_irq_handler(struct panthor_device *ptdev, u32 status)
+{
+	bool has_unhandled_faults = false;
+
+	status = panthor_mmu_fault_mask(ptdev, status);
+	while (status) {
+		u32 as = ffs(status | (status >> 16)) - 1;
+		u32 mask = panthor_mmu_as_fault_mask(ptdev, as);
+		u32 new_int_mask;
+		u64 addr;
+		u32 fault_status;
+		u32 exception_type;
+		u32 access_type;
+		u32 source_id;
+
+		fault_status = gpu_read(ptdev, AS_FAULTSTATUS(as));
+		addr = gpu_read(ptdev, AS_FAULTADDRESS_LO(as));
+		addr |= (u64)gpu_read(ptdev, AS_FAULTADDRESS_HI(as)) << 32;
+
+		/* decode the fault status */
+		exception_type = fault_status & 0xFF;
+		access_type = (fault_status >> 8) & 0x3;
+		source_id = (fault_status >> 16);
+
+		mutex_lock(&ptdev->mmu->as.slots_lock);
+
+		ptdev->mmu->as.faulty_mask |= mask;
+		new_int_mask =
+			panthor_mmu_fault_mask(ptdev, ~ptdev->mmu->as.faulty_mask);
+
+		/* terminal fault, print info about the fault */
+		drm_err(&ptdev->base,
+			"Unhandled Page fault in AS%d at VA 0x%016llX\n"
+			"raw fault status: 0x%X\n"
+			"decoded fault status: %s\n"
+			"exception type 0x%X: %s\n"
+			"access type 0x%X: %s\n"
+			"source id 0x%X\n",
+			as, addr,
+			fault_status,
+			(fault_status & (1 << 10) ? "DECODER FAULT" : "SLAVE FAULT"),
+			exception_type, panthor_exception_name(ptdev, exception_type),
+			access_type, access_type_name(ptdev, fault_status),
+			source_id);
+
+		/* Ignore MMU interrupts on this AS until it's been
+		 * re-enabled.
+		 */
+		ptdev->mmu->irq.mask = new_int_mask;
+		gpu_write(ptdev, MMU_INT_MASK, new_int_mask);
+
+		if (ptdev->mmu->as.slots[as].vm)
+			ptdev->mmu->as.slots[as].vm->unhandled_fault = true;
+
+		/* Disable the MMU to kill jobs on this AS. */
+		panthor_mmu_as_disable(ptdev, as);
+		mutex_unlock(&ptdev->mmu->as.slots_lock);
+
+		status &= ~mask;
+		has_unhandled_faults = true;
+	}
+
+	if (has_unhandled_faults)
+		panthor_sched_report_mmu_fault(ptdev);
+}
+PANTHOR_IRQ_HANDLER(mmu, MMU, panthor_mmu_irq_handler);
+
+/**
+ * panthor_mmu_suspend() - Suspend the MMU logic
+ * @ptdev: Device.
+ *
+ * All we do here is de-assign the AS slots on all active VMs, so things
+ * get flushed to the main memory, and no further access to these VMs are
+ * possible.
+ *
+ * We also suspend the MMU IRQ.
+ */
+void panthor_mmu_suspend(struct panthor_device *ptdev)
+{
+	mutex_lock(&ptdev->mmu->as.slots_lock);
+	for (u32 i = 0; i < ARRAY_SIZE(ptdev->mmu->as.slots); i++) {
+		struct panthor_vm *vm = ptdev->mmu->as.slots[i].vm;
+
+		if (vm) {
+			drm_WARN_ON(&ptdev->base, panthor_mmu_as_disable(ptdev, i));
+			panthor_vm_release_as_locked(vm);
+		}
+	}
+	mutex_unlock(&ptdev->mmu->as.slots_lock);
+
+	panthor_mmu_irq_suspend(&ptdev->mmu->irq);
+}
+
+/**
+ * panthor_mmu_resume() - Resume the MMU logic
+ * @ptdev: Device.
+ *
+ * Resume the IRQ.
+ *
+ * We don't re-enable previously active VMs. We assume other parts of the
+ * driver will call panthor_vm_active() on the VMs they intend to use.
+ */
+void panthor_mmu_resume(struct panthor_device *ptdev)
+{
+	mutex_lock(&ptdev->mmu->as.slots_lock);
+	ptdev->mmu->as.alloc_mask = 0;
+	ptdev->mmu->as.faulty_mask = 0;
+	mutex_unlock(&ptdev->mmu->as.slots_lock);
+
+	panthor_mmu_irq_resume(&ptdev->mmu->irq, panthor_mmu_fault_mask(ptdev, ~0));
+}
+
+/**
+ * panthor_mmu_pre_reset() - Prepare for a reset
+ * @ptdev: Device.
+ *
+ * Suspend the IRQ, and make sure all VM_BIND queues are stopped, so we
+ * don't get asked to do a VM operation while the GPU is down.
+ *
+ * We don't cleanly shutdown the AS slots here, because the reset might
+ * come from an AS_ACTIVE_BIT stuck situation.
+ */
+void panthor_mmu_pre_reset(struct panthor_device *ptdev)
+{
+	struct panthor_vm *vm;
+
+	panthor_mmu_irq_suspend(&ptdev->mmu->irq);
+
+	mutex_lock(&ptdev->mmu->vm.lock);
+	ptdev->mmu->vm.reset_in_progress = true;
+	list_for_each_entry(vm, &ptdev->mmu->vm.list, node)
+		panthor_vm_stop(vm);
+	mutex_unlock(&ptdev->mmu->vm.lock);
+}
+
+/**
+ * panthor_mmu_post_reset() - Restore things after a reset
+ * @ptdev: Device.
+ *
+ * Put the MMU logic back in action after a reset. That implies resuming the
+ * IRQ and re-enabling the VM_BIND queues.
+ */
+void panthor_mmu_post_reset(struct panthor_device *ptdev)
+{
+	struct panthor_vm *vm;
+
+	mutex_lock(&ptdev->mmu->as.slots_lock);
+
+	/* Now that the reset is effective, we can assume that none of the
+	 * AS slots are setup, and clear the faulty flags too.
+	 */
+	ptdev->mmu->as.alloc_mask = 0;
+	ptdev->mmu->as.faulty_mask = 0;
+
+	for (u32 i = 0; i < ARRAY_SIZE(ptdev->mmu->as.slots); i++) {
+		struct panthor_vm *vm = ptdev->mmu->as.slots[i].vm;
+
+		if (vm)
+			panthor_vm_release_as_locked(vm);
+	}
+
+	mutex_unlock(&ptdev->mmu->as.slots_lock);
+
+	panthor_mmu_irq_resume(&ptdev->mmu->irq, panthor_mmu_fault_mask(ptdev, ~0));
+
+	/* Restart the VM_BIND queues. */
+	mutex_lock(&ptdev->mmu->vm.lock);
+	list_for_each_entry(vm, &ptdev->mmu->vm.list, node) {
+		panthor_vm_start(vm);
+	}
+	ptdev->mmu->vm.reset_in_progress = false;
+	mutex_unlock(&ptdev->mmu->vm.lock);
+}
+
+static void panthor_vm_free(struct drm_gpuvm *gpuvm)
+{
+	struct panthor_vm *vm = container_of(gpuvm, struct panthor_vm, base);
+	struct panthor_device *ptdev = vm->ptdev;
+
+	mutex_lock(&vm->heaps.lock);
+	if (drm_WARN_ON(&ptdev->base, vm->heaps.pool))
+		panthor_heap_pool_destroy(vm->heaps.pool);
+	mutex_unlock(&vm->heaps.lock);
+	mutex_destroy(&vm->heaps.lock);
+
+	mutex_lock(&ptdev->mmu->vm.lock);
+	list_del(&vm->node);
+	/* Restore the scheduler state so we can call drm_sched_entity_destroy()
+	 * and drm_sched_fini(). If get there, that means we have no job left
+	 * and no new jobs can be queued, so we can start the scheduler without
+	 * risking interfering with the reset.
+	 */
+	if (ptdev->mmu->vm.reset_in_progress)
+		panthor_vm_start(vm);
+	mutex_unlock(&ptdev->mmu->vm.lock);
+
+	drm_sched_entity_destroy(&vm->entity);
+	drm_sched_fini(&vm->sched);
+
+	mutex_lock(&ptdev->mmu->as.slots_lock);
+	if (vm->as.id >= 0) {
+		int cookie;
+
+		if (drm_dev_enter(&ptdev->base, &cookie)) {
+			panthor_mmu_as_disable(ptdev, vm->as.id);
+			drm_dev_exit(cookie);
+		}
+
+		ptdev->mmu->as.slots[vm->as.id].vm = NULL;
+		clear_bit(vm->as.id, &ptdev->mmu->as.alloc_mask);
+		list_del(&vm->as.lru_node);
+	}
+	mutex_unlock(&ptdev->mmu->as.slots_lock);
+
+	free_io_pgtable_ops(vm->pgtbl_ops);
+
+	drm_mm_takedown(&vm->mm);
+	kfree(vm);
+}
+
+/**
+ * panthor_vm_put() - Release a reference on a VM
+ * @vm: VM to release the reference on. Can be NULL.
+ */
+void panthor_vm_put(struct panthor_vm *vm)
+{
+	drm_gpuvm_put(vm ? &vm->base : NULL);
+}
+
+/**
+ * panthor_vm_get() - Get a VM reference
+ * @vm: VM to get the reference on. Can be NULL.
+ *
+ * Return: @vm value.
+ */
+struct panthor_vm *panthor_vm_get(struct panthor_vm *vm)
+{
+	if (vm)
+		drm_gpuvm_get(&vm->base);
+
+	return vm;
+}
+
+/**
+ * panthor_vm_get_heap_pool() - Get the heap pool attached to a VM
+ * @vm: VM to query the heap pool on.
+ * @create: True if the heap pool should be created when it doesn't exist.
+ *
+ * Heap pools are per-VM. This function allows one to retrieve the heap pool
+ * attached to a VM.
+ *
+ * If no heap pool exists yet, and @create is true, we create one.
+ *
+ * The returned panthor_heap_pool should be released with panthor_heap_pool_put().
+ *
+ * Return: A valid pointer on success, an ERR_PTR() otherwise.
+ */
+struct panthor_heap_pool *panthor_vm_get_heap_pool(struct panthor_vm *vm, bool create)
+{
+	struct panthor_heap_pool *pool;
+
+	mutex_lock(&vm->heaps.lock);
+	if (!vm->heaps.pool && create) {
+		if (vm->destroyed)
+			pool = ERR_PTR(-EINVAL);
+		else
+			pool = panthor_heap_pool_create(vm->ptdev, vm);
+
+		if (!IS_ERR(pool))
+			vm->heaps.pool = panthor_heap_pool_get(pool);
+	} else {
+		pool = panthor_heap_pool_get(vm->heaps.pool);
+	}
+	mutex_unlock(&vm->heaps.lock);
+
+	return pool;
+}
+
+static u64 mair_to_memattr(u64 mair)
+{
+	u64 memattr = 0;
+	u32 i;
+
+	for (i = 0; i < 8; i++) {
+		u8 in_attr = mair >> (8 * i), out_attr;
+		u8 outer = in_attr >> 4, inner = in_attr & 0xf;
+
+		/* For caching to be enabled, inner and outer caching policy
+		 * have to be both write-back, if one of them is write-through
+		 * or non-cacheable, we just choose non-cacheable. Device
+		 * memory is also translated to non-cacheable.
+		 */
+		if (!(outer & 3) || !(outer & 4) || !(inner & 4)) {
+			out_attr = AS_MEMATTR_AARCH64_INNER_OUTER_NC |
+				   AS_MEMATTR_AARCH64_SH_MIDGARD_INNER |
+				   AS_MEMATTR_AARCH64_INNER_ALLOC_EXPL(false, false);
+		} else {
+			/* Use SH_CPU_INNER mode so SH_IS, which is used when
+			 * IOMMU_CACHE is set, actually maps to the standard
+			 * definition of inner-shareable and not Mali's
+			 * internal-shareable mode.
+			 */
+			out_attr = AS_MEMATTR_AARCH64_INNER_OUTER_WB |
+				   AS_MEMATTR_AARCH64_SH_CPU_INNER |
+				   AS_MEMATTR_AARCH64_INNER_ALLOC_EXPL(inner & 1, inner & 2);
+		}
+
+		memattr |= (u64)out_attr << (8 * i);
+	}
+
+	return memattr;
+}
+
+static void panthor_vma_link(struct panthor_vm *vm,
+			     struct panthor_vma *vma,
+			     struct drm_gpuvm_bo *vm_bo)
+{
+	struct panthor_gem_object *bo = to_panthor_bo(vma->base.gem.obj);
+
+	mutex_lock(&bo->gpuva_list_lock);
+	drm_gpuva_link(&vma->base, vm_bo);
+	drm_WARN_ON(&vm->ptdev->base, drm_gpuvm_bo_put(vm_bo));
+	mutex_unlock(&bo->gpuva_list_lock);
+}
+
+static void panthor_vma_unlink(struct panthor_vm *vm,
+			       struct panthor_vma *vma)
+{
+	struct panthor_gem_object *bo = to_panthor_bo(vma->base.gem.obj);
+	struct drm_gpuvm_bo *vm_bo = drm_gpuvm_bo_get(vma->base.vm_bo);
+
+	mutex_lock(&bo->gpuva_list_lock);
+	drm_gpuva_unlink(&vma->base);
+	mutex_unlock(&bo->gpuva_list_lock);
+
+	/* drm_gpuva_unlink() release the vm_bo, but we manually retained it
+	 * when entering this function, so we can implement deferred VMA
+	 * destruction. Re-assign it here.
+	 */
+	vma->base.vm_bo = vm_bo;
+	list_add_tail(&vma->node, &vm->op_ctx->returned_vmas);
+}
+
+static void panthor_vma_init(struct panthor_vma *vma, u32 flags)
+{
+	INIT_LIST_HEAD(&vma->node);
+	vma->flags = flags;
+}
+
+#define PANTHOR_VM_MAP_FLAGS \
+	(DRM_PANTHOR_VM_BIND_OP_MAP_READONLY | \
+	 DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC | \
+	 DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED)
+
+static int panthor_gpuva_sm_step_map(struct drm_gpuva_op *op, void *priv)
+{
+	struct panthor_vm *vm = priv;
+	struct panthor_vm_op_ctx *op_ctx = vm->op_ctx;
+	struct panthor_vma *vma = panthor_vm_op_ctx_get_vma(op_ctx);
+	int ret;
+
+	if (!vma)
+		return -EINVAL;
+
+	panthor_vma_init(vma, op_ctx->flags & PANTHOR_VM_MAP_FLAGS);
+
+	ret = panthor_vm_map_pages(vm, op->map.va.addr, flags_to_prot(vma->flags),
+				   op_ctx->map.sgt, op->map.gem.offset,
+				   op->map.va.range);
+	if (ret)
+		return ret;
+
+	/* Ref owned by the mapping now, clear the obj field so we don't release the
+	 * pinning/obj ref behind GPUVA's back.
+	 */
+	drm_gpuva_map(&vm->base, &vma->base, &op->map);
+	panthor_vma_link(vm, vma, op_ctx->map.vm_bo);
+	op_ctx->map.vm_bo = NULL;
+	return 0;
+}
+
+static int panthor_gpuva_sm_step_remap(struct drm_gpuva_op *op,
+				       void *priv)
+{
+	struct panthor_vma *unmap_vma = container_of(op->remap.unmap->va, struct panthor_vma, base);
+	struct panthor_vm *vm = priv;
+	struct panthor_vm_op_ctx *op_ctx = vm->op_ctx;
+	struct panthor_vma *prev_vma = NULL, *next_vma = NULL;
+	u64 unmap_start, unmap_range;
+	int ret;
+
+	drm_gpuva_op_remap_to_unmap_range(&op->remap, &unmap_start, &unmap_range);
+	ret = panthor_vm_unmap_pages(vm, unmap_start, unmap_range);
+	if (ret)
+		return ret;
+
+	if (op->remap.prev) {
+		prev_vma = panthor_vm_op_ctx_get_vma(op_ctx);
+		panthor_vma_init(prev_vma, unmap_vma->flags);
+	}
+
+	if (op->remap.next) {
+		next_vma = panthor_vm_op_ctx_get_vma(op_ctx);
+		panthor_vma_init(next_vma, unmap_vma->flags);
+	}
+
+	drm_gpuva_remap(prev_vma ? &prev_vma->base : NULL,
+			next_vma ? &next_vma->base : NULL,
+			&op->remap);
+
+	if (prev_vma) {
+		/* panthor_vma_link() transfers the vm_bo ownership to
+		 * the VMA object. Since the vm_bo we're passing is still
+		 * owned by the old mapping which will be released when this
+		 * mapping is destroyed, we need to grab a ref here.
+		 */
+		panthor_vma_link(vm, prev_vma,
+				 drm_gpuvm_bo_get(op->remap.unmap->va->vm_bo));
+	}
+
+	if (next_vma) {
+		panthor_vma_link(vm, next_vma,
+				 drm_gpuvm_bo_get(op->remap.unmap->va->vm_bo));
+	}
+
+	panthor_vma_unlink(vm, unmap_vma);
+	return 0;
+}
+
+static int panthor_gpuva_sm_step_unmap(struct drm_gpuva_op *op,
+				       void *priv)
+{
+	struct panthor_vma *unmap_vma = container_of(op->unmap.va, struct panthor_vma, base);
+	struct panthor_vm *vm = priv;
+	int ret;
+
+	ret = panthor_vm_unmap_pages(vm, unmap_vma->base.va.addr,
+				     unmap_vma->base.va.range);
+	if (drm_WARN_ON(&vm->ptdev->base, ret))
+		return ret;
+
+	drm_gpuva_unmap(&op->unmap);
+	panthor_vma_unlink(vm, unmap_vma);
+	return 0;
+}
+
+static const struct drm_gpuvm_ops panthor_gpuvm_ops = {
+	.vm_free = panthor_vm_free,
+	.sm_step_map = panthor_gpuva_sm_step_map,
+	.sm_step_remap = panthor_gpuva_sm_step_remap,
+	.sm_step_unmap = panthor_gpuva_sm_step_unmap,
+};
+
+/**
+ * panthor_vm_resv() - Get the dma_resv object attached to a VM.
+ * @vm: VM to get the dma_resv of.
+ *
+ * Return: A dma_resv object.
+ */
+struct dma_resv *panthor_vm_resv(struct panthor_vm *vm)
+{
+	return drm_gpuvm_resv(&vm->base);
+}
+
+struct drm_gem_object *panthor_vm_root_gem(struct panthor_vm *vm)
+{
+	if (!vm)
+		return NULL;
+
+	return vm->base.r_obj;
+}
+
+static int
+panthor_vm_exec_op(struct panthor_vm *vm, struct panthor_vm_op_ctx *op,
+		   bool flag_vm_unusable_on_failure)
+{
+	u32 op_type = op->flags & DRM_PANTHOR_VM_BIND_OP_TYPE_MASK;
+	int ret;
+
+	if (op_type == DRM_PANTHOR_VM_BIND_OP_TYPE_SYNC_ONLY)
+		return 0;
+
+	mutex_lock(&vm->op_lock);
+	vm->op_ctx = op;
+	switch (op_type) {
+	case DRM_PANTHOR_VM_BIND_OP_TYPE_MAP:
+		if (vm->unusable) {
+			ret = -EINVAL;
+			break;
+		}
+
+		ret = drm_gpuvm_sm_map(&vm->base, vm, op->va.addr, op->va.range,
+				       op->map.vm_bo->obj, op->map.bo_offset);
+		break;
+
+	case DRM_PANTHOR_VM_BIND_OP_TYPE_UNMAP:
+		ret = drm_gpuvm_sm_unmap(&vm->base, vm, op->va.addr, op->va.range);
+		break;
+
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	if (ret && flag_vm_unusable_on_failure)
+		vm->unusable = true;
+
+	vm->op_ctx = NULL;
+	mutex_unlock(&vm->op_lock);
+
+	return ret;
+}
+
+static struct dma_fence *
+panthor_vm_bind_run_job(struct drm_sched_job *sched_job)
+{
+	struct panthor_vm_bind_job *job = container_of(sched_job, struct panthor_vm_bind_job, base);
+	bool cookie;
+	int ret;
+
+	/* Not only we report an error whose result is propagated to the
+	 * drm_sched finished fence, but we also flag the VM as unusable, because
+	 * a failure in the async VM_BIND results in an inconsistent state. VM needs
+	 * to be destroyed and recreated.
+	 */
+	cookie = dma_fence_begin_signalling();
+	ret = panthor_vm_exec_op(job->vm, &job->ctx, true);
+	dma_fence_end_signalling(cookie);
+
+	return ret ? ERR_PTR(ret) : NULL;
+}
+
+static void panthor_vm_bind_job_release(struct kref *kref)
+{
+	struct panthor_vm_bind_job *job = container_of(kref, struct panthor_vm_bind_job, refcount);
+
+	if (job->base.s_fence)
+		drm_sched_job_cleanup(&job->base);
+
+	panthor_vm_cleanup_op_ctx(&job->ctx, job->vm);
+	panthor_vm_put(job->vm);
+	kfree(job);
+}
+
+/**
+ * panthor_vm_bind_job_put() - Release a VM_BIND job reference
+ * @sched_job: Job to release the reference on.
+ */
+void panthor_vm_bind_job_put(struct drm_sched_job *sched_job)
+{
+	struct panthor_vm_bind_job *job =
+		container_of(sched_job, struct panthor_vm_bind_job, base);
+
+	if (sched_job)
+		kref_put(&job->refcount, panthor_vm_bind_job_release);
+}
+
+static void
+panthor_vm_bind_free_job(struct drm_sched_job *sched_job)
+{
+	struct panthor_vm_bind_job *job =
+		container_of(sched_job, struct panthor_vm_bind_job, base);
+
+	drm_sched_job_cleanup(sched_job);
+
+	/* Do the heavy cleanups asynchronously, so we're out of the
+	 * dma-signaling path and can acquire dma-resv locks safely.
+	 */
+	queue_work(panthor_cleanup_wq, &job->cleanup_op_ctx_work);
+}
+
+static enum drm_gpu_sched_stat
+panthor_vm_bind_timedout_job(struct drm_sched_job *sched_job)
+{
+	WARN(1, "VM_BIND ops are synchronous for now, there should be no timeout!");
+	return DRM_GPU_SCHED_STAT_NOMINAL;
+}
+
+static const struct drm_sched_backend_ops panthor_vm_bind_ops = {
+	.run_job = panthor_vm_bind_run_job,
+	.free_job = panthor_vm_bind_free_job,
+	.timedout_job = panthor_vm_bind_timedout_job,
+};
+
+/**
+ * panthor_vm_create() - Create a VM
+ * @ptdev: Device.
+ * @for_mcu: True if this is the FW MCU VM.
+ * @kernel_va_start: Start of the range reserved for kernel BO mapping.
+ * @kernel_va_size: Size of the range reserved for kernel BO mapping.
+ * @auto_kernel_va_start: Start of the auto-VA kernel range.
+ * @auto_kernel_va_size: Size of the auto-VA kernel range.
+ *
+ * Return: A valid pointer on success, an ERR_PTR() otherwise.
+ */
+struct panthor_vm *
+panthor_vm_create(struct panthor_device *ptdev, bool for_mcu,
+		  u64 kernel_va_start, u64 kernel_va_size,
+		  u64 auto_kernel_va_start, u64 auto_kernel_va_size)
+{
+	u32 va_bits = GPU_MMU_FEATURES_VA_BITS(ptdev->gpu_info.mmu_features);
+	u32 pa_bits = GPU_MMU_FEATURES_PA_BITS(ptdev->gpu_info.mmu_features);
+	u64 full_va_range = 1ull << va_bits;
+	struct drm_gem_object *dummy_gem;
+	struct drm_gpu_scheduler *sched;
+	struct io_pgtable_cfg pgtbl_cfg;
+	u64 mair, min_va, va_range;
+	struct panthor_vm *vm;
+	int ret;
+
+	vm = kzalloc(sizeof(*vm), GFP_KERNEL);
+	if (!vm)
+		return ERR_PTR(-ENOMEM);
+
+	/* We allocate a dummy GEM for the VM. */
+	dummy_gem = drm_gpuvm_resv_object_alloc(&ptdev->base);
+	if (!dummy_gem) {
+		ret = -ENOMEM;
+		goto err_free_vm;
+	}
+
+	mutex_init(&vm->heaps.lock);
+	vm->for_mcu = for_mcu;
+	vm->ptdev = ptdev;
+	mutex_init(&vm->op_lock);
+
+	if (for_mcu) {
+		/* CSF MCU is a cortex M7, and can only address 4G */
+		min_va = 0;
+		va_range = SZ_4G;
+	} else {
+		min_va = 0;
+		va_range = full_va_range;
+	}
+
+	mutex_init(&vm->mm_lock);
+	drm_mm_init(&vm->mm, kernel_va_start, kernel_va_size);
+	vm->kernel_auto_va.start = auto_kernel_va_start;
+	vm->kernel_auto_va.end = vm->kernel_auto_va.start + auto_kernel_va_size - 1;
+
+	INIT_LIST_HEAD(&vm->node);
+	INIT_LIST_HEAD(&vm->as.lru_node);
+	vm->as.id = -1;
+	refcount_set(&vm->as.active_cnt, 0);
+
+	pgtbl_cfg = (struct io_pgtable_cfg) {
+		.pgsize_bitmap	= SZ_4K | SZ_2M,
+		.ias		= va_bits,
+		.oas		= pa_bits,
+		.coherent_walk	= ptdev->coherent,
+		.tlb		= &mmu_tlb_ops,
+		.iommu_dev	= ptdev->base.dev,
+		.alloc		= alloc_pt,
+		.free		= free_pt,
+	};
+
+	vm->pgtbl_ops = alloc_io_pgtable_ops(ARM_64_LPAE_S1, &pgtbl_cfg, vm);
+	if (!vm->pgtbl_ops) {
+		ret = -EINVAL;
+		goto err_mm_takedown;
+	}
+
+	/* Bind operations are synchronous for now, no timeout needed. */
+	ret = drm_sched_init(&vm->sched, &panthor_vm_bind_ops, ptdev->mmu->vm.wq,
+			     1, 1, 0,
+			     MAX_SCHEDULE_TIMEOUT, NULL, NULL,
+			     "panthor-vm-bind", ptdev->base.dev);
+	if (ret)
+		goto err_free_io_pgtable;
+
+	sched = &vm->sched;
+	ret = drm_sched_entity_init(&vm->entity, 0, &sched, 1, NULL);
+	if (ret)
+		goto err_sched_fini;
+
+	mair = io_pgtable_ops_to_pgtable(vm->pgtbl_ops)->cfg.arm_lpae_s1_cfg.mair;
+	vm->memattr = mair_to_memattr(mair);
+
+	mutex_lock(&ptdev->mmu->vm.lock);
+	list_add_tail(&vm->node, &ptdev->mmu->vm.list);
+
+	/* If a reset is in progress, stop the scheduler. */
+	if (ptdev->mmu->vm.reset_in_progress)
+		panthor_vm_stop(vm);
+	mutex_unlock(&ptdev->mmu->vm.lock);
+
+	/* We intentionally leave the reserved range to zero, because we want kernel VMAs
+	 * to be handled the same way user VMAs are.
+	 */
+	drm_gpuvm_init(&vm->base, for_mcu ? "panthor-MCU-VM" : "panthor-GPU-VM",
+		       DRM_GPUVM_RESV_PROTECTED, &ptdev->base, dummy_gem,
+		       min_va, va_range, 0, 0, &panthor_gpuvm_ops);
+	drm_gem_object_put(dummy_gem);
+	return vm;
+
+err_sched_fini:
+	drm_sched_fini(&vm->sched);
+
+err_free_io_pgtable:
+	free_io_pgtable_ops(vm->pgtbl_ops);
+
+err_mm_takedown:
+	drm_mm_takedown(&vm->mm);
+	drm_gem_object_put(dummy_gem);
+
+err_free_vm:
+	kfree(vm);
+	return ERR_PTR(ret);
+}
+
+static int
+panthor_vm_bind_prepare_op_ctx(struct drm_file *file,
+			       struct panthor_vm *vm,
+			       const struct drm_panthor_vm_bind_op *op,
+			       struct panthor_vm_op_ctx *op_ctx)
+{
+	struct drm_gem_object *gem;
+	int ret;
+
+	/* Aligned on page size. */
+	if ((op->va | op->size) & ~PAGE_MASK)
+		return -EINVAL;
+
+	switch (op->flags & DRM_PANTHOR_VM_BIND_OP_TYPE_MASK) {
+	case DRM_PANTHOR_VM_BIND_OP_TYPE_MAP:
+		gem = drm_gem_object_lookup(file, op->bo_handle);
+		ret = panthor_vm_prepare_map_op_ctx(op_ctx, vm,
+						    gem ? to_panthor_bo(gem) : NULL,
+						    op->bo_offset,
+						    op->size,
+						    op->va,
+						    op->flags);
+		drm_gem_object_put(gem);
+		return ret;
+
+	case DRM_PANTHOR_VM_BIND_OP_TYPE_UNMAP:
+		if (op->flags & ~DRM_PANTHOR_VM_BIND_OP_TYPE_MASK)
+			return -EINVAL;
+
+		if (op->bo_handle || op->bo_offset)
+			return -EINVAL;
+
+		return panthor_vm_prepare_unmap_op_ctx(op_ctx, vm, op->va, op->size);
+
+	case DRM_PANTHOR_VM_BIND_OP_TYPE_SYNC_ONLY:
+		if (op->flags & ~DRM_PANTHOR_VM_BIND_OP_TYPE_MASK)
+			return -EINVAL;
+
+		if (op->bo_handle || op->bo_offset)
+			return -EINVAL;
+
+		if (op->va || op->size)
+			return -EINVAL;
+
+		if (!op->syncs.count)
+			return -EINVAL;
+
+		panthor_vm_prepare_sync_only_op_ctx(op_ctx, vm);
+		return 0;
+
+	default:
+		return -EINVAL;
+	}
+}
+
+static void panthor_vm_bind_job_cleanup_op_ctx_work(struct work_struct *work)
+{
+	struct panthor_vm_bind_job *job =
+		container_of(work, struct panthor_vm_bind_job, cleanup_op_ctx_work);
+
+	panthor_vm_bind_job_put(&job->base);
+}
+
+/**
+ * panthor_vm_bind_job_create() - Create a VM_BIND job
+ * @file: File.
+ * @vm: VM targeted by the VM_BIND job.
+ * @op: VM operation data.
+ *
+ * Return: A valid pointer on success, an ERR_PTR() otherwise.
+ */
+struct drm_sched_job *
+panthor_vm_bind_job_create(struct drm_file *file,
+			   struct panthor_vm *vm,
+			   const struct drm_panthor_vm_bind_op *op)
+{
+	struct panthor_vm_bind_job *job;
+	int ret;
+
+	if (!vm)
+		return ERR_PTR(-EINVAL);
+
+	if (vm->destroyed || vm->unusable)
+		return ERR_PTR(-EINVAL);
+
+	job = kzalloc(sizeof(*job), GFP_KERNEL);
+	if (!job)
+		return ERR_PTR(-ENOMEM);
+
+	ret = panthor_vm_bind_prepare_op_ctx(file, vm, op, &job->ctx);
+	if (ret) {
+		kfree(job);
+		return ERR_PTR(ret);
+	}
+
+	INIT_WORK(&job->cleanup_op_ctx_work, panthor_vm_bind_job_cleanup_op_ctx_work);
+	kref_init(&job->refcount);
+	job->vm = panthor_vm_get(vm);
+
+	ret = drm_sched_job_init(&job->base, &vm->entity, 1, vm);
+	if (ret)
+		goto err_put_job;
+
+	return &job->base;
+
+err_put_job:
+	panthor_vm_bind_job_put(&job->base);
+	return ERR_PTR(ret);
+}
+
+/**
+ * panthor_vm_bind_job_prepare_resvs() - Prepare VM_BIND job dma_resvs
+ * @exec: The locking/preparation context.
+ * @sched_job: The job to prepare resvs on.
+ *
+ * Locks and prepare the VM resv.
+ *
+ * If this is a map operation, locks and prepares the GEM resv.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int panthor_vm_bind_job_prepare_resvs(struct drm_exec *exec,
+				      struct drm_sched_job *sched_job)
+{
+	struct panthor_vm_bind_job *job = container_of(sched_job, struct panthor_vm_bind_job, base);
+	int ret;
+
+	/* Acquire the VM lock an reserve a slot for this VM bind job. */
+	ret = drm_gpuvm_prepare_vm(&job->vm->base, exec, 1);
+	if (ret)
+		return ret;
+
+	if (job->ctx.map.vm_bo) {
+		/* Lock/prepare the GEM being mapped. */
+		ret = drm_exec_prepare_obj(exec, job->ctx.map.vm_bo->obj, 1);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * panthor_vm_bind_job_update_resvs() - Update the resv objects touched by a job
+ * @exec: drm_exec context.
+ * @sched_job: Job to update the resvs on.
+ */
+void panthor_vm_bind_job_update_resvs(struct drm_exec *exec,
+				      struct drm_sched_job *sched_job)
+{
+	struct panthor_vm_bind_job *job = container_of(sched_job, struct panthor_vm_bind_job, base);
+
+	/* Explicit sync => we just register our job finished fence as bookkeep. */
+	drm_gpuvm_resv_add_fence(&job->vm->base, exec,
+				 &sched_job->s_fence->finished,
+				 DMA_RESV_USAGE_BOOKKEEP,
+				 DMA_RESV_USAGE_BOOKKEEP);
+}
+
+void panthor_vm_update_resvs(struct panthor_vm *vm, struct drm_exec *exec,
+			     struct dma_fence *fence,
+			     enum dma_resv_usage private_usage,
+			     enum dma_resv_usage extobj_usage)
+{
+	drm_gpuvm_resv_add_fence(&vm->base, exec, fence, private_usage, extobj_usage);
+}
+
+/**
+ * panthor_vm_bind_exec_sync_op() - Execute a VM_BIND operation synchronously.
+ * @file: File.
+ * @vm: VM targeted by the VM operation.
+ * @op: Data describing the VM operation.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int panthor_vm_bind_exec_sync_op(struct drm_file *file,
+				 struct panthor_vm *vm,
+				 struct drm_panthor_vm_bind_op *op)
+{
+	struct panthor_vm_op_ctx op_ctx;
+	int ret;
+
+	/* No sync objects allowed on synchronous operations. */
+	if (op->syncs.count)
+		return -EINVAL;
+
+	if (!op->size)
+		return 0;
+
+	ret = panthor_vm_bind_prepare_op_ctx(file, vm, op, &op_ctx);
+	if (ret)
+		return ret;
+
+	ret = panthor_vm_exec_op(vm, &op_ctx, false);
+	panthor_vm_cleanup_op_ctx(&op_ctx, vm);
+
+	return ret;
+}
+
+/**
+ * panthor_vm_map_bo_range() - Map a GEM object range to a VM
+ * @vm: VM to map the GEM to.
+ * @bo: GEM object to map.
+ * @offset: Offset in the GEM object.
+ * @size: Size to map.
+ * @va: Virtual address to map the object to.
+ * @flags: Combination of drm_panthor_vm_bind_op_flags flags.
+ * Only map-related flags are valid.
+ *
+ * Internal use only. For userspace requests, use
+ * panthor_vm_bind_exec_sync_op() instead.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int panthor_vm_map_bo_range(struct panthor_vm *vm, struct panthor_gem_object *bo,
+			    u64 offset, u64 size, u64 va, u32 flags)
+{
+	struct panthor_vm_op_ctx op_ctx;
+	int ret;
+
+	ret = panthor_vm_prepare_map_op_ctx(&op_ctx, vm, bo, offset, size, va, flags);
+	if (ret)
+		return ret;
+
+	ret = panthor_vm_exec_op(vm, &op_ctx, false);
+	panthor_vm_cleanup_op_ctx(&op_ctx, vm);
+
+	return ret;
+}
+
+/**
+ * panthor_vm_unmap_range() - Unmap a portion of the VA space
+ * @vm: VM to unmap the region from.
+ * @va: Virtual address to unmap. Must be 4k aligned.
+ * @size: Size of the region to unmap. Must be 4k aligned.
+ *
+ * Internal use only. For userspace requests, use
+ * panthor_vm_bind_exec_sync_op() instead.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int panthor_vm_unmap_range(struct panthor_vm *vm, u64 va, u64 size)
+{
+	struct panthor_vm_op_ctx op_ctx;
+	int ret;
+
+	ret = panthor_vm_prepare_unmap_op_ctx(&op_ctx, vm, va, size);
+	if (ret)
+		return ret;
+
+	ret = panthor_vm_exec_op(vm, &op_ctx, false);
+	panthor_vm_cleanup_op_ctx(&op_ctx, vm);
+
+	return ret;
+}
+
+/**
+ * panthor_vm_prepare_mapped_bos_resvs() - Prepare resvs on VM BOs.
+ * @exec: Locking/preparation context.
+ * @vm: VM targeted by the GPU job.
+ * @slot_count: Number of slots to reserve.
+ *
+ * GPU jobs assume all BOs bound to the VM at the time the job is submitted
+ * are available when the job is executed. In order to guarantee that, we
+ * need to reserve a slot on all BOs mapped to a VM and update this slot with
+ * the job fence after its submission.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int panthor_vm_prepare_mapped_bos_resvs(struct drm_exec *exec, struct panthor_vm *vm,
+					u32 slot_count)
+{
+	int ret;
+
+	/* Acquire the VM lock and reserve a slot for this GPU job. */
+	ret = drm_gpuvm_prepare_vm(&vm->base, exec, slot_count);
+	if (ret)
+		return ret;
+
+	return drm_gpuvm_prepare_objects(&vm->base, exec, slot_count);
+}
+
+/**
+ * panthor_mmu_unplug() - Unplug the MMU logic
+ * @ptdev: Device.
+ *
+ * No access to the MMU regs should be done after this function is called.
+ * We suspend the IRQ and disable all VMs to guarantee that.
+ */
+void panthor_mmu_unplug(struct panthor_device *ptdev)
+{
+	panthor_mmu_irq_suspend(&ptdev->mmu->irq);
+
+	mutex_lock(&ptdev->mmu->as.slots_lock);
+	for (u32 i = 0; i < ARRAY_SIZE(ptdev->mmu->as.slots); i++) {
+		struct panthor_vm *vm = ptdev->mmu->as.slots[i].vm;
+
+		if (vm) {
+			drm_WARN_ON(&ptdev->base, panthor_mmu_as_disable(ptdev, i));
+			panthor_vm_release_as_locked(vm);
+		}
+	}
+	mutex_unlock(&ptdev->mmu->as.slots_lock);
+}
+
+static void panthor_mmu_release_wq(struct drm_device *ddev, void *res)
+{
+	destroy_workqueue(res);
+}
+
+/**
+ * panthor_mmu_init() - Initialize the MMU logic.
+ * @ptdev: Device.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int panthor_mmu_init(struct panthor_device *ptdev)
+{
+	u32 va_bits = GPU_MMU_FEATURES_VA_BITS(ptdev->gpu_info.mmu_features);
+	struct panthor_mmu *mmu;
+	int ret, irq;
+
+	mmu = drmm_kzalloc(&ptdev->base, sizeof(*mmu), GFP_KERNEL);
+	if (!mmu)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&mmu->as.lru_list);
+
+	ret = drmm_mutex_init(&ptdev->base, &mmu->as.slots_lock);
+	if (ret)
+		return ret;
+
+	INIT_LIST_HEAD(&mmu->vm.list);
+	ret = drmm_mutex_init(&ptdev->base, &mmu->vm.lock);
+	if (ret)
+		return ret;
+
+	ptdev->mmu = mmu;
+
+	irq = platform_get_irq_byname(to_platform_device(ptdev->base.dev), "mmu");
+	if (irq <= 0)
+		return -ENODEV;
+
+	ret = panthor_request_mmu_irq(ptdev, &mmu->irq, irq,
+				      panthor_mmu_fault_mask(ptdev, ~0));
+	if (ret)
+		return ret;
+
+	mmu->vm.wq = alloc_workqueue("panthor-vm-bind", WQ_UNBOUND, 0);
+	if (!mmu->vm.wq)
+		return -ENOMEM;
+
+	/* On 32-bit kernels, the VA space is limited by the io_pgtable_ops abstraction,
+	 * which passes iova as an unsigned long. Patch the mmu_features to reflect this
+	 * limitation.
+	 */
+	if (sizeof(unsigned long) * 8 < va_bits) {
+		ptdev->gpu_info.mmu_features &= ~GENMASK(7, 0);
+		ptdev->gpu_info.mmu_features |= sizeof(unsigned long) * 8;
+	}
+
+	return drmm_add_action_or_reset(&ptdev->base, panthor_mmu_release_wq, mmu->vm.wq);
+}
+
+/**
+ * panthor_mmu_pt_cache_init() - Initialize the page table cache.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int panthor_mmu_pt_cache_init(void)
+{
+	pt_cache = kmem_cache_create("panthor-mmu-pt", SZ_4K, SZ_4K, 0, NULL);
+	if (!pt_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/**
+ * panthor_mmu_pt_cache_fini() - Destroy the page table cache.
+ */
+void panthor_mmu_pt_cache_fini(void)
+{
+	kmem_cache_destroy(pt_cache);
+}
diff --git a/drivers/gpu/drm/panthor/panthor_mmu.h b/drivers/gpu/drm/panthor/panthor_mmu.h
new file mode 100644
index 0000000000000..e6a2198b009b7
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_mmu.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 or MIT */
+/* Copyright 2019 Linaro, Ltd, Rob Herring <robh@kernel.org> */
+/* Copyright 2023 Collabora ltd. */
+
+#ifndef __PANTHOR_MMU_H__
+#define __PANTHOR_MMU_H__
+
+#include <linux/dma-resv.h>
+
+struct drm_exec;
+struct drm_sched_job;
+struct panthor_gem_object;
+struct panthor_heap_pool;
+struct panthor_vm;
+struct panthor_vma;
+struct panthor_mmu;
+
+int panthor_mmu_init(struct panthor_device *ptdev);
+void panthor_mmu_unplug(struct panthor_device *ptdev);
+void panthor_mmu_pre_reset(struct panthor_device *ptdev);
+void panthor_mmu_post_reset(struct panthor_device *ptdev);
+void panthor_mmu_suspend(struct panthor_device *ptdev);
+void panthor_mmu_resume(struct panthor_device *ptdev);
+
+int panthor_vm_map_bo_range(struct panthor_vm *vm, struct panthor_gem_object *bo,
+			    u64 offset, u64 size, u64 va, u32 flags);
+int panthor_vm_unmap_range(struct panthor_vm *vm, u64 va, u64 size);
+struct panthor_gem_object *
+panthor_vm_get_bo_for_va(struct panthor_vm *vm, u64 va, u64 *bo_offset);
+
+int panthor_vm_active(struct panthor_vm *vm);
+void panthor_vm_idle(struct panthor_vm *vm);
+int panthor_vm_as(struct panthor_vm *vm);
+
+struct panthor_heap_pool *
+panthor_vm_get_heap_pool(struct panthor_vm *vm, bool create);
+
+struct panthor_vm *panthor_vm_get(struct panthor_vm *vm);
+void panthor_vm_put(struct panthor_vm *vm);
+struct panthor_vm *panthor_vm_create(struct panthor_device *ptdev, bool for_mcu,
+				     u64 kernel_va_start, u64 kernel_va_size,
+				     u64 kernel_auto_va_start,
+				     u64 kernel_auto_va_size);
+
+int panthor_vm_prepare_mapped_bos_resvs(struct drm_exec *exec,
+					struct panthor_vm *vm,
+					u32 slot_count);
+int panthor_vm_add_bos_resvs_deps_to_job(struct panthor_vm *vm,
+					 struct drm_sched_job *job);
+void panthor_vm_add_job_fence_to_bos_resvs(struct panthor_vm *vm,
+					   struct drm_sched_job *job);
+
+struct dma_resv *panthor_vm_resv(struct panthor_vm *vm);
+struct drm_gem_object *panthor_vm_root_gem(struct panthor_vm *vm);
+
+void panthor_vm_pool_destroy(struct panthor_file *pfile);
+int panthor_vm_pool_create(struct panthor_file *pfile);
+int panthor_vm_pool_create_vm(struct panthor_device *ptdev,
+			      struct panthor_vm_pool *pool,
+			      struct drm_panthor_vm_create *args);
+int panthor_vm_pool_destroy_vm(struct panthor_vm_pool *pool, u32 handle);
+struct panthor_vm *panthor_vm_pool_get_vm(struct panthor_vm_pool *pool, u32 handle);
+
+bool panthor_vm_has_unhandled_faults(struct panthor_vm *vm);
+bool panthor_vm_is_unusable(struct panthor_vm *vm);
+
+/*
+ * PANTHOR_VM_KERNEL_AUTO_VA: Use this magic address when you want the GEM
+ * logic to auto-allocate the virtual address in the reserved kernel VA range.
+ */
+#define PANTHOR_VM_KERNEL_AUTO_VA		~0ull
+
+int panthor_vm_alloc_va(struct panthor_vm *vm, u64 va, u64 size,
+			struct drm_mm_node *va_node);
+void panthor_vm_free_va(struct panthor_vm *vm, struct drm_mm_node *va_node);
+
+int panthor_vm_bind_exec_sync_op(struct drm_file *file,
+				 struct panthor_vm *vm,
+				 struct drm_panthor_vm_bind_op *op);
+
+struct drm_sched_job *
+panthor_vm_bind_job_create(struct drm_file *file,
+			   struct panthor_vm *vm,
+			   const struct drm_panthor_vm_bind_op *op);
+void panthor_vm_bind_job_put(struct drm_sched_job *job);
+int panthor_vm_bind_job_prepare_resvs(struct drm_exec *exec,
+				      struct drm_sched_job *job);
+void panthor_vm_bind_job_update_resvs(struct drm_exec *exec, struct drm_sched_job *job);
+
+void panthor_vm_update_resvs(struct panthor_vm *vm, struct drm_exec *exec,
+			     struct dma_fence *fence,
+			     enum dma_resv_usage private_usage,
+			     enum dma_resv_usage extobj_usage);
+
+int panthor_mmu_pt_cache_init(void);
+void panthor_mmu_pt_cache_fini(void);
+
+#endif
diff --git a/drivers/gpu/drm/panthor/panthor_regs.h b/drivers/gpu/drm/panthor/panthor_regs.h
new file mode 100644
index 0000000000000..b7b3b3add1662
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_regs.h
@@ -0,0 +1,239 @@
+/* SPDX-License-Identifier: GPL-2.0 or MIT */
+/* Copyright 2018 Marty E. Plummer <hanetzer@startmail.com> */
+/* Copyright 2019 Linaro, Ltd, Rob Herring <robh@kernel.org> */
+/* Copyright 2023 Collabora ltd. */
+/*
+ * Register definitions based on mali_kbase_gpu_regmap.h and
+ * mali_kbase_gpu_regmap_csf.h
+ * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved.
+ */
+#ifndef __PANTHOR_REGS_H__
+#define __PANTHOR_REGS_H__
+
+#define GPU_ID						0x0
+#define   GPU_ARCH_MAJOR(x)				((x) >> 28)
+#define   GPU_ARCH_MINOR(x)				(((x) & GENMASK(27, 24)) >> 24)
+#define   GPU_ARCH_REV(x)				(((x) & GENMASK(23, 20)) >> 20)
+#define   GPU_PROD_MAJOR(x)				(((x) & GENMASK(19, 16)) >> 16)
+#define   GPU_VER_MAJOR(x)				(((x) & GENMASK(15, 12)) >> 12)
+#define   GPU_VER_MINOR(x)				(((x) & GENMASK(11, 4)) >> 4)
+#define   GPU_VER_STATUS(x)				((x) & GENMASK(3, 0))
+
+#define GPU_L2_FEATURES					0x4
+#define  GPU_L2_FEATURES_LINE_SIZE(x)			(1 << ((x) & GENMASK(7, 0)))
+
+#define GPU_CORE_FEATURES				0x8
+
+#define GPU_TILER_FEATURES				0xC
+#define GPU_MEM_FEATURES				0x10
+#define   GROUPS_L2_COHERENT				BIT(0)
+
+#define GPU_MMU_FEATURES				0x14
+#define  GPU_MMU_FEATURES_VA_BITS(x)			((x) & GENMASK(7, 0))
+#define  GPU_MMU_FEATURES_PA_BITS(x)			(((x) >> 8) & GENMASK(7, 0))
+#define GPU_AS_PRESENT					0x18
+#define GPU_CSF_ID					0x1C
+
+#define GPU_INT_RAWSTAT					0x20
+#define GPU_INT_CLEAR					0x24
+#define GPU_INT_MASK					0x28
+#define GPU_INT_STAT					0x2c
+#define   GPU_IRQ_FAULT					BIT(0)
+#define   GPU_IRQ_PROTM_FAULT				BIT(1)
+#define   GPU_IRQ_RESET_COMPLETED			BIT(8)
+#define   GPU_IRQ_POWER_CHANGED				BIT(9)
+#define   GPU_IRQ_POWER_CHANGED_ALL			BIT(10)
+#define   GPU_IRQ_CLEAN_CACHES_COMPLETED		BIT(17)
+#define   GPU_IRQ_DOORBELL_MIRROR			BIT(18)
+#define   GPU_IRQ_MCU_STATUS_CHANGED			BIT(19)
+#define GPU_CMD						0x30
+#define   GPU_CMD_DEF(type, payload)			((type) | ((payload) << 8))
+#define   GPU_SOFT_RESET				GPU_CMD_DEF(1, 1)
+#define   GPU_HARD_RESET				GPU_CMD_DEF(1, 2)
+#define   CACHE_CLEAN					BIT(0)
+#define   CACHE_INV					BIT(1)
+#define   GPU_FLUSH_CACHES(l2, lsc, oth)		\
+	  GPU_CMD_DEF(4, ((l2) << 0) | ((lsc) << 4) | ((oth) << 8))
+
+#define GPU_STATUS					0x34
+#define   GPU_STATUS_ACTIVE				BIT(0)
+#define   GPU_STATUS_PWR_ACTIVE				BIT(1)
+#define   GPU_STATUS_PAGE_FAULT				BIT(4)
+#define   GPU_STATUS_PROTM_ACTIVE			BIT(7)
+#define   GPU_STATUS_DBG_ENABLED			BIT(8)
+
+#define GPU_FAULT_STATUS				0x3C
+#define GPU_FAULT_ADDR_LO				0x40
+#define GPU_FAULT_ADDR_HI				0x44
+
+#define GPU_PWR_KEY					0x50
+#define  GPU_PWR_KEY_UNLOCK				0x2968A819
+#define GPU_PWR_OVERRIDE0				0x54
+#define GPU_PWR_OVERRIDE1				0x58
+
+#define GPU_TIMESTAMP_OFFSET_LO				0x88
+#define GPU_TIMESTAMP_OFFSET_HI				0x8C
+#define GPU_CYCLE_COUNT_LO				0x90
+#define GPU_CYCLE_COUNT_HI				0x94
+#define GPU_TIMESTAMP_LO				0x98
+#define GPU_TIMESTAMP_HI				0x9C
+
+#define GPU_THREAD_MAX_THREADS				0xA0
+#define GPU_THREAD_MAX_WORKGROUP_SIZE			0xA4
+#define GPU_THREAD_MAX_BARRIER_SIZE			0xA8
+#define GPU_THREAD_FEATURES				0xAC
+
+#define GPU_TEXTURE_FEATURES(n)				(0xB0 + ((n) * 4))
+
+#define GPU_SHADER_PRESENT_LO				0x100
+#define GPU_SHADER_PRESENT_HI				0x104
+#define GPU_TILER_PRESENT_LO				0x110
+#define GPU_TILER_PRESENT_HI				0x114
+#define GPU_L2_PRESENT_LO				0x120
+#define GPU_L2_PRESENT_HI				0x124
+
+#define SHADER_READY_LO					0x140
+#define SHADER_READY_HI					0x144
+#define TILER_READY_LO					0x150
+#define TILER_READY_HI					0x154
+#define L2_READY_LO					0x160
+#define L2_READY_HI					0x164
+
+#define SHADER_PWRON_LO					0x180
+#define SHADER_PWRON_HI					0x184
+#define TILER_PWRON_LO					0x190
+#define TILER_PWRON_HI					0x194
+#define L2_PWRON_LO					0x1A0
+#define L2_PWRON_HI					0x1A4
+
+#define SHADER_PWROFF_LO				0x1C0
+#define SHADER_PWROFF_HI				0x1C4
+#define TILER_PWROFF_LO					0x1D0
+#define TILER_PWROFF_HI					0x1D4
+#define L2_PWROFF_LO					0x1E0
+#define L2_PWROFF_HI					0x1E4
+
+#define SHADER_PWRTRANS_LO				0x200
+#define SHADER_PWRTRANS_HI				0x204
+#define TILER_PWRTRANS_LO				0x210
+#define TILER_PWRTRANS_HI				0x214
+#define L2_PWRTRANS_LO					0x220
+#define L2_PWRTRANS_HI					0x224
+
+#define SHADER_PWRACTIVE_LO				0x240
+#define SHADER_PWRACTIVE_HI				0x244
+#define TILER_PWRACTIVE_LO				0x250
+#define TILER_PWRACTIVE_HI				0x254
+#define L2_PWRACTIVE_LO					0x260
+#define L2_PWRACTIVE_HI					0x264
+
+#define GPU_REVID					0x280
+
+#define GPU_COHERENCY_FEATURES				0x300
+#define GPU_COHERENCY_PROT_BIT(name)			BIT(GPU_COHERENCY_  ## name)
+
+#define GPU_COHERENCY_PROTOCOL				0x304
+#define   GPU_COHERENCY_ACE				0
+#define   GPU_COHERENCY_ACE_LITE			1
+#define   GPU_COHERENCY_NONE				31
+
+#define MCU_CONTROL					0x700
+#define MCU_CONTROL_ENABLE				1
+#define MCU_CONTROL_AUTO				2
+#define MCU_CONTROL_DISABLE				0
+
+#define MCU_STATUS					0x704
+#define MCU_STATUS_DISABLED				0
+#define MCU_STATUS_ENABLED				1
+#define MCU_STATUS_HALT					2
+#define MCU_STATUS_FATAL				3
+
+/* Job Control regs */
+#define JOB_INT_RAWSTAT					0x1000
+#define JOB_INT_CLEAR					0x1004
+#define JOB_INT_MASK					0x1008
+#define JOB_INT_STAT					0x100c
+#define   JOB_INT_GLOBAL_IF				BIT(31)
+#define   JOB_INT_CSG_IF(x)				BIT(x)
+
+/* MMU regs */
+#define MMU_INT_RAWSTAT					0x2000
+#define MMU_INT_CLEAR					0x2004
+#define MMU_INT_MASK					0x2008
+#define MMU_INT_STAT					0x200c
+
+/* AS_COMMAND register commands */
+
+#define MMU_BASE					0x2400
+#define MMU_AS_SHIFT					6
+#define MMU_AS(as)					(MMU_BASE + ((as) << MMU_AS_SHIFT))
+
+#define AS_TRANSTAB_LO(as)				(MMU_AS(as) + 0x0)
+#define AS_TRANSTAB_HI(as)				(MMU_AS(as) + 0x4)
+#define AS_MEMATTR_LO(as)				(MMU_AS(as) + 0x8)
+#define AS_MEMATTR_HI(as)				(MMU_AS(as) + 0xC)
+#define   AS_MEMATTR_AARCH64_INNER_ALLOC_IMPL		(2 << 2)
+#define   AS_MEMATTR_AARCH64_INNER_ALLOC_EXPL(w, r)	((3 << 2) | \
+							 ((w) ? BIT(0) : 0) | \
+							 ((r) ? BIT(1) : 0))
+#define   AS_MEMATTR_AARCH64_SH_MIDGARD_INNER		(0 << 4)
+#define   AS_MEMATTR_AARCH64_SH_CPU_INNER		(1 << 4)
+#define   AS_MEMATTR_AARCH64_SH_CPU_INNER_SHADER_COH	(2 << 4)
+#define   AS_MEMATTR_AARCH64_SHARED			(0 << 6)
+#define   AS_MEMATTR_AARCH64_INNER_OUTER_NC		(1 << 6)
+#define   AS_MEMATTR_AARCH64_INNER_OUTER_WB		(2 << 6)
+#define   AS_MEMATTR_AARCH64_FAULT			(3 << 6)
+#define AS_LOCKADDR_LO(as)				(MMU_AS(as) + 0x10)
+#define AS_LOCKADDR_HI(as)				(MMU_AS(as) + 0x14)
+#define AS_COMMAND(as)					(MMU_AS(as) + 0x18)
+#define   AS_COMMAND_NOP				0
+#define   AS_COMMAND_UPDATE				1
+#define   AS_COMMAND_LOCK				2
+#define   AS_COMMAND_UNLOCK				3
+#define   AS_COMMAND_FLUSH_PT				4
+#define   AS_COMMAND_FLUSH_MEM				5
+#define   AS_LOCK_REGION_MIN_SIZE			(1ULL << 15)
+#define AS_FAULTSTATUS(as)				(MMU_AS(as) + 0x1C)
+#define  AS_FAULTSTATUS_ACCESS_TYPE_MASK		(0x3 << 8)
+#define  AS_FAULTSTATUS_ACCESS_TYPE_ATOMIC		(0x0 << 8)
+#define  AS_FAULTSTATUS_ACCESS_TYPE_EX			(0x1 << 8)
+#define  AS_FAULTSTATUS_ACCESS_TYPE_READ		(0x2 << 8)
+#define  AS_FAULTSTATUS_ACCESS_TYPE_WRITE		(0x3 << 8)
+#define AS_FAULTADDRESS_LO(as)				(MMU_AS(as) + 0x20)
+#define AS_FAULTADDRESS_HI(as)				(MMU_AS(as) + 0x24)
+#define AS_STATUS(as)					(MMU_AS(as) + 0x28)
+#define   AS_STATUS_AS_ACTIVE				BIT(0)
+#define AS_TRANSCFG_LO(as)				(MMU_AS(as) + 0x30)
+#define AS_TRANSCFG_HI(as)				(MMU_AS(as) + 0x34)
+#define   AS_TRANSCFG_ADRMODE_UNMAPPED			(1 << 0)
+#define   AS_TRANSCFG_ADRMODE_IDENTITY			(2 << 0)
+#define   AS_TRANSCFG_ADRMODE_AARCH64_4K		(6 << 0)
+#define   AS_TRANSCFG_ADRMODE_AARCH64_64K		(8 << 0)
+#define   AS_TRANSCFG_INA_BITS(x)			((x) << 6)
+#define   AS_TRANSCFG_OUTA_BITS(x)			((x) << 14)
+#define   AS_TRANSCFG_SL_CONCAT				BIT(22)
+#define   AS_TRANSCFG_PTW_MEMATTR_NC			(1 << 24)
+#define   AS_TRANSCFG_PTW_MEMATTR_WB			(2 << 24)
+#define   AS_TRANSCFG_PTW_SH_NS				(0 << 28)
+#define   AS_TRANSCFG_PTW_SH_OS				(2 << 28)
+#define   AS_TRANSCFG_PTW_SH_IS				(3 << 28)
+#define   AS_TRANSCFG_PTW_RA				BIT(30)
+#define   AS_TRANSCFG_DISABLE_HIER_AP			BIT(33)
+#define   AS_TRANSCFG_DISABLE_AF_FAULT			BIT(34)
+#define   AS_TRANSCFG_WXN				BIT(35)
+#define   AS_TRANSCFG_XREADABLE				BIT(36)
+#define AS_FAULTEXTRA_LO(as)				(MMU_AS(as) + 0x38)
+#define AS_FAULTEXTRA_HI(as)				(MMU_AS(as) + 0x3C)
+
+#define CSF_GPU_LATEST_FLUSH_ID				0x10000
+
+#define CSF_DOORBELL(i)					(0x80000 + ((i) * 0x10000))
+#define CSF_GLB_DOORBELL_ID				0
+
+#define gpu_write(dev, reg, data) \
+	writel(data, (dev)->iomem + (reg))
+
+#define gpu_read(dev, reg) \
+	readl((dev)->iomem + (reg))
+
+#endif
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
new file mode 100644
index 0000000000000..5f7803b6fc48c
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -0,0 +1,3502 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+/* Copyright 2023 Collabora ltd. */
+
+#include <drm/drm_drv.h>
+#include <drm/drm_exec.h>
+#include <drm/drm_gem_shmem_helper.h>
+#include <drm/drm_managed.h>
+#include <drm/gpu_scheduler.h>
+#include <drm/panthor_drm.h>
+
+#include <linux/build_bug.h>
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/dma-resv.h>
+#include <linux/firmware.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/iopoll.h>
+#include <linux/iosys-map.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+
+#include "panthor_devfreq.h"
+#include "panthor_device.h"
+#include "panthor_fw.h"
+#include "panthor_gem.h"
+#include "panthor_gpu.h"
+#include "panthor_heap.h"
+#include "panthor_mmu.h"
+#include "panthor_regs.h"
+#include "panthor_sched.h"
+
+/**
+ * DOC: Scheduler
+ *
+ * Mali CSF hardware adopts a firmware-assisted scheduling model, where
+ * the firmware takes care of scheduling aspects, to some extent.
+ *
+ * The scheduling happens at the scheduling group level, each group
+ * contains 1 to N queues (N is FW/hardware dependent, and exposed
+ * through the firmware interface). Each queue is assigned a command
+ * stream ring buffer, which serves as a way to get jobs submitted to
+ * the GPU, among other things.
+ *
+ * The firmware can schedule a maximum of M groups (M is FW/hardware
+ * dependent, and exposed through the firmware interface). Passed
+ * this maximum number of groups, the kernel must take care of
+ * rotating the groups passed to the firmware so every group gets
+ * a chance to have his queues scheduled for execution.
+ *
+ * The current implementation only supports with kernel-mode queues.
+ * In other terms, userspace doesn't have access to the ring-buffer.
+ * Instead, userspace passes indirect command stream buffers that are
+ * called from the queue ring-buffer by the kernel using a pre-defined
+ * sequence of command stream instructions to ensure the userspace driver
+ * always gets consistent results (cache maintenance,
+ * synchronization, ...).
+ *
+ * We rely on the drm_gpu_scheduler framework to deal with job
+ * dependencies and submission. As any other driver dealing with a
+ * FW-scheduler, we use the 1:1 entity:scheduler mode, such that each
+ * entity has its own job scheduler. When a job is ready to be executed
+ * (all its dependencies are met), it is pushed to the appropriate
+ * queue ring-buffer, and the group is scheduled for execution if it
+ * wasn't already active.
+ *
+ * Kernel-side group scheduling is timeslice-based. When we have less
+ * groups than there are slots, the periodic tick is disabled and we
+ * just let the FW schedule the active groups. When there are more
+ * groups than slots, we let each group a chance to execute stuff for
+ * a given amount of time, and then re-evaluate and pick new groups
+ * to schedule. The group selection algorithm is based on
+ * priority+round-robin.
+ *
+ * Even though user-mode queues is out of the scope right now, the
+ * current design takes them into account by avoiding any guess on the
+ * group/queue state that would be based on information we wouldn't have
+ * if userspace was in charge of the ring-buffer. That's also one of the
+ * reason we don't do 'cooperative' scheduling (encoding FW group slot
+ * reservation as dma_fence that would be returned from the
+ * drm_gpu_scheduler::prepare_job() hook, and treating group rotation as
+ * a queue of waiters, ordered by job submission order). This approach
+ * would work for kernel-mode queues, but would make user-mode queues a
+ * lot more complicated to retrofit.
+ */
+
+#define JOB_TIMEOUT_MS				5000
+
+#define MIN_CS_PER_CSG				8
+
+#define MIN_CSGS				3
+#define MAX_CSG_PRIO				0xf
+
+struct panthor_group;
+
+/**
+ * struct panthor_csg_slot - Command stream group slot
+ *
+ * This represents a FW slot for a scheduling group.
+ */
+struct panthor_csg_slot {
+	/** @group: Scheduling group bound to this slot. */
+	struct panthor_group *group;
+
+	/** @priority: Group priority. */
+	u8 priority;
+
+	/**
+	 * @idle: True if the group bound to this slot is idle.
+	 *
+	 * A group is idle when it has nothing waiting for execution on
+	 * all its queues, or when queues are blocked waiting for something
+	 * to happen (synchronization object).
+	 */
+	bool idle;
+};
+
+/**
+ * enum panthor_csg_priority - Group priority
+ */
+enum panthor_csg_priority {
+	/** @PANTHOR_CSG_PRIORITY_LOW: Low priority group. */
+	PANTHOR_CSG_PRIORITY_LOW = 0,
+
+	/** @PANTHOR_CSG_PRIORITY_MEDIUM: Medium priority group. */
+	PANTHOR_CSG_PRIORITY_MEDIUM,
+
+	/** @PANTHOR_CSG_PRIORITY_HIGH: High priority group. */
+	PANTHOR_CSG_PRIORITY_HIGH,
+
+	/**
+	 * @PANTHOR_CSG_PRIORITY_RT: Real-time priority group.
+	 *
+	 * Real-time priority allows one to preempt scheduling of other
+	 * non-real-time groups. When such a group becomes executable,
+	 * it will evict the group with the lowest non-rt priority if
+	 * there's no free group slot available.
+	 *
+	 * Currently not exposed to userspace.
+	 */
+	PANTHOR_CSG_PRIORITY_RT,
+
+	/** @PANTHOR_CSG_PRIORITY_COUNT: Number of priority levels. */
+	PANTHOR_CSG_PRIORITY_COUNT,
+};
+
+/**
+ * struct panthor_scheduler - Object used to manage the scheduler
+ */
+struct panthor_scheduler {
+	/** @ptdev: Device. */
+	struct panthor_device *ptdev;
+
+	/**
+	 * @wq: Workqueue used by our internal scheduler logic and
+	 * drm_gpu_scheduler.
+	 *
+	 * Used for the scheduler tick, group update or other kind of FW
+	 * event processing that can't be handled in the threaded interrupt
+	 * path. Also passed to the drm_gpu_scheduler instances embedded
+	 * in panthor_queue.
+	 */
+	struct workqueue_struct *wq;
+
+	/**
+	 * @heap_alloc_wq: Workqueue used to schedule tiler_oom works.
+	 *
+	 * We have a queue dedicated to heap chunk allocation works to avoid
+	 * blocking the rest of the scheduler if the allocation tries to
+	 * reclaim memory.
+	 */
+	struct workqueue_struct *heap_alloc_wq;
+
+	/** @tick_work: Work executed on a scheduling tick. */
+	struct delayed_work tick_work;
+
+	/**
+	 * @sync_upd_work: Work used to process synchronization object updates.
+	 *
+	 * We use this work to unblock queues/groups that were waiting on a
+	 * synchronization object.
+	 */
+	struct work_struct sync_upd_work;
+
+	/**
+	 * @fw_events_work: Work used to process FW events outside the interrupt path.
+	 *
+	 * Even if the interrupt is threaded, we need any event processing
+	 * that require taking the panthor_scheduler::lock to be processed
+	 * outside the interrupt path so we don't block the tick logic when
+	 * it calls panthor_fw_{csg,wait}_wait_acks(). Since most of the
+	 * event processing requires taking this lock, we just delegate all
+	 * FW event processing to the scheduler workqueue.
+	 */
+	struct work_struct fw_events_work;
+
+	/**
+	 * @fw_events: Bitmask encoding pending FW events.
+	 */
+	atomic_t fw_events;
+
+	/**
+	 * @resched_target: When the next tick should occur.
+	 *
+	 * Expressed in jiffies.
+	 */
+	u64 resched_target;
+
+	/**
+	 * @last_tick: When the last tick occurred.
+	 *
+	 * Expressed in jiffies.
+	 */
+	u64 last_tick;
+
+	/** @tick_period: Tick period in jiffies. */
+	u64 tick_period;
+
+	/**
+	 * @lock: Lock protecting access to all the scheduler fields.
+	 *
+	 * Should be taken in the tick work, the irq handler, and anywhere the @groups
+	 * fields are touched.
+	 */
+	struct mutex lock;
+
+	/** @groups: Various lists used to classify groups. */
+	struct {
+		/**
+		 * @runnable: Runnable group lists.
+		 *
+		 * When a group has queues that want to execute something,
+		 * its panthor_group::run_node should be inserted here.
+		 *
+		 * One list per-priority.
+		 */
+		struct list_head runnable[PANTHOR_CSG_PRIORITY_COUNT];
+
+		/**
+		 * @idle: Idle group lists.
+		 *
+		 * When all queues of a group are idle (either because they
+		 * have nothing to execute, or because they are blocked), the
+		 * panthor_group::run_node field should be inserted here.
+		 *
+		 * One list per-priority.
+		 */
+		struct list_head idle[PANTHOR_CSG_PRIORITY_COUNT];
+
+		/**
+		 * @waiting: List of groups whose queues are blocked on a
+		 * synchronization object.
+		 *
+		 * Insert panthor_group::wait_node here when a group is waiting
+		 * for synchronization objects to be signaled.
+		 *
+		 * This list is evaluated in the @sync_upd_work work.
+		 */
+		struct list_head waiting;
+	} groups;
+
+	/**
+	 * @csg_slots: FW command stream group slots.
+	 */
+	struct panthor_csg_slot csg_slots[MAX_CSGS];
+
+	/** @csg_slot_count: Number of command stream group slots exposed by the FW. */
+	u32 csg_slot_count;
+
+	/** @cs_slot_count: Number of command stream slot per group slot exposed by the FW. */
+	u32 cs_slot_count;
+
+	/** @as_slot_count: Number of address space slots supported by the MMU. */
+	u32 as_slot_count;
+
+	/** @used_csg_slot_count: Number of command stream group slot currently used. */
+	u32 used_csg_slot_count;
+
+	/** @sb_slot_count: Number of scoreboard slots. */
+	u32 sb_slot_count;
+
+	/**
+	 * @might_have_idle_groups: True if an active group might have become idle.
+	 *
+	 * This will force a tick, so other runnable groups can be scheduled if one
+	 * or more active groups became idle.
+	 */
+	bool might_have_idle_groups;
+
+	/** @pm: Power management related fields. */
+	struct {
+		/** @has_ref: True if the scheduler owns a runtime PM reference. */
+		bool has_ref;
+	} pm;
+
+	/** @reset: Reset related fields. */
+	struct {
+		/** @lock: Lock protecting the other reset fields. */
+		struct mutex lock;
+
+		/**
+		 * @in_progress: True if a reset is in progress.
+		 *
+		 * Set to true in panthor_sched_pre_reset() and back to false in
+		 * panthor_sched_post_reset().
+		 */
+		atomic_t in_progress;
+
+		/**
+		 * @stopped_groups: List containing all groups that were stopped
+		 * before a reset.
+		 *
+		 * Insert panthor_group::run_node in the pre_reset path.
+		 */
+		struct list_head stopped_groups;
+	} reset;
+};
+
+/**
+ * struct panthor_syncobj_32b - 32-bit FW synchronization object
+ */
+struct panthor_syncobj_32b {
+	/** @seqno: Sequence number. */
+	u32 seqno;
+
+	/**
+	 * @status: Status.
+	 *
+	 * Not zero on failure.
+	 */
+	u32 status;
+};
+
+/**
+ * struct panthor_syncobj_64b - 64-bit FW synchronization object
+ */
+struct panthor_syncobj_64b {
+	/** @seqno: Sequence number. */
+	u64 seqno;
+
+	/**
+	 * @status: Status.
+	 *
+	 * Not zero on failure.
+	 */
+	u32 status;
+
+	/** @pad: MBZ. */
+	u32 pad;
+};
+
+/**
+ * struct panthor_queue - Execution queue
+ */
+struct panthor_queue {
+	/** @scheduler: DRM scheduler used for this queue. */
+	struct drm_gpu_scheduler scheduler;
+
+	/** @entity: DRM scheduling entity used for this queue. */
+	struct drm_sched_entity entity;
+
+	/**
+	 * @remaining_time: Time remaining before the job timeout expires.
+	 *
+	 * The job timeout is suspended when the queue is not scheduled by the
+	 * FW. Every time we suspend the timer, we need to save the remaining
+	 * time so we can restore it later on.
+	 */
+	unsigned long remaining_time;
+
+	/** @timeout_suspended: True if the job timeout was suspended. */
+	bool timeout_suspended;
+
+	/**
+	 * @doorbell_id: Doorbell assigned to this queue.
+	 *
+	 * Right now, all groups share the same doorbell, and the doorbell ID
+	 * is assigned to group_slot + 1 when the group is assigned a slot. But
+	 * we might decide to provide fine grained doorbell assignment at some
+	 * point, so don't have to wake up all queues in a group every time one
+	 * of them is updated.
+	 */
+	u8 doorbell_id;
+
+	/**
+	 * @priority: Priority of the queue inside the group.
+	 *
+	 * Must be less than 16 (Only 4 bits available).
+	 */
+	u8 priority;
+#define CSF_MAX_QUEUE_PRIO	GENMASK(3, 0)
+
+	/** @ringbuf: Command stream ring-buffer. */
+	struct panthor_kernel_bo *ringbuf;
+
+	/** @iface: Firmware interface. */
+	struct {
+		/** @mem: FW memory allocated for this interface. */
+		struct panthor_kernel_bo *mem;
+
+		/** @input: Input interface. */
+		struct panthor_fw_ringbuf_input_iface *input;
+
+		/** @output: Output interface. */
+		const struct panthor_fw_ringbuf_output_iface *output;
+
+		/** @input_fw_va: FW virtual address of the input interface buffer. */
+		u32 input_fw_va;
+
+		/** @output_fw_va: FW virtual address of the output interface buffer. */
+		u32 output_fw_va;
+	} iface;
+
+	/**
+	 * @syncwait: Stores information about the synchronization object this
+	 * queue is waiting on.
+	 */
+	struct {
+		/** @gpu_va: GPU address of the synchronization object. */
+		u64 gpu_va;
+
+		/** @ref: Reference value to compare against. */
+		u64 ref;
+
+		/** @gt: True if this is a greater-than test. */
+		bool gt;
+
+		/** @sync64: True if this is a 64-bit sync object. */
+		bool sync64;
+
+		/** @bo: Buffer object holding the synchronization object. */
+		struct drm_gem_object *obj;
+
+		/** @offset: Offset of the synchronization object inside @bo. */
+		u64 offset;
+
+		/**
+		 * @kmap: Kernel mapping of the buffer object holding the
+		 * synchronization object.
+		 */
+		void *kmap;
+	} syncwait;
+
+	/** @fence_ctx: Fence context fields. */
+	struct {
+		/** @lock: Used to protect access to all fences allocated by this context. */
+		spinlock_t lock;
+
+		/**
+		 * @id: Fence context ID.
+		 *
+		 * Allocated with dma_fence_context_alloc().
+		 */
+		u64 id;
+
+		/** @seqno: Sequence number of the last initialized fence. */
+		atomic64_t seqno;
+
+		/**
+		 * @in_flight_jobs: List containing all in-flight jobs.
+		 *
+		 * Used to keep track and signal panthor_job::done_fence when the
+		 * synchronization object attached to the queue is signaled.
+		 */
+		struct list_head in_flight_jobs;
+	} fence_ctx;
+};
+
+/**
+ * enum panthor_group_state - Scheduling group state.
+ */
+enum panthor_group_state {
+	/** @PANTHOR_CS_GROUP_CREATED: Group was created, but not scheduled yet. */
+	PANTHOR_CS_GROUP_CREATED,
+
+	/** @PANTHOR_CS_GROUP_ACTIVE: Group is currently scheduled. */
+	PANTHOR_CS_GROUP_ACTIVE,
+
+	/**
+	 * @PANTHOR_CS_GROUP_SUSPENDED: Group was scheduled at least once, but is
+	 * inactive/suspended right now.
+	 */
+	PANTHOR_CS_GROUP_SUSPENDED,
+
+	/**
+	 * @PANTHOR_CS_GROUP_TERMINATED: Group was terminated.
+	 *
+	 * Can no longer be scheduled. The only allowed action is a destruction.
+	 */
+	PANTHOR_CS_GROUP_TERMINATED,
+};
+
+/**
+ * struct panthor_group - Scheduling group object
+ */
+struct panthor_group {
+	/** @refcount: Reference count */
+	struct kref refcount;
+
+	/** @ptdev: Device. */
+	struct panthor_device *ptdev;
+
+	/** @vm: VM bound to the group. */
+	struct panthor_vm *vm;
+
+	/** @compute_core_mask: Mask of shader cores that can be used for compute jobs. */
+	u64 compute_core_mask;
+
+	/** @fragment_core_mask: Mask of shader cores that can be used for fragment jobs. */
+	u64 fragment_core_mask;
+
+	/** @tiler_core_mask: Mask of tiler cores that can be used for tiler jobs. */
+	u64 tiler_core_mask;
+
+	/** @max_compute_cores: Maximum number of shader cores used for compute jobs. */
+	u8 max_compute_cores;
+
+	/** @max_compute_cores: Maximum number of shader cores used for fragment jobs. */
+	u8 max_fragment_cores;
+
+	/** @max_tiler_cores: Maximum number of tiler cores used for tiler jobs. */
+	u8 max_tiler_cores;
+
+	/** @priority: Group priority (check panthor_csg_priority). */
+	u8 priority;
+
+	/** @blocked_queues: Bitmask reflecting the blocked queues. */
+	u32 blocked_queues;
+
+	/** @idle_queues: Bitmask reflecting the idle queues. */
+	u32 idle_queues;
+
+	/** @fatal_lock: Lock used to protect access to fatal fields. */
+	spinlock_t fatal_lock;
+
+	/** @fatal_queues: Bitmask reflecting the queues that hit a fatal exception. */
+	u32 fatal_queues;
+
+	/** @tiler_oom: Mask of queues that have a tiler OOM event to process. */
+	atomic_t tiler_oom;
+
+	/** @queue_count: Number of queues in this group. */
+	u32 queue_count;
+
+	/** @queues: Queues owned by this group. */
+	struct panthor_queue *queues[MAX_CS_PER_CSG];
+
+	/**
+	 * @csg_id: ID of the FW group slot.
+	 *
+	 * -1 when the group is not scheduled/active.
+	 */
+	int csg_id;
+
+	/**
+	 * @destroyed: True when the group has been destroyed.
+	 *
+	 * If a group is destroyed it becomes useless: no further jobs can be submitted
+	 * to its queues. We simply wait for all references to be dropped so we can
+	 * release the group object.
+	 */
+	bool destroyed;
+
+	/**
+	 * @timedout: True when a timeout occurred on any of the queues owned by
+	 * this group.
+	 *
+	 * Timeouts can be reported by drm_sched or by the FW. In any case, any
+	 * timeout situation is unrecoverable, and the group becomes useless.
+	 * We simply wait for all references to be dropped so we can release the
+	 * group object.
+	 */
+	bool timedout;
+
+	/**
+	 * @syncobjs: Pool of per-queue synchronization objects.
+	 *
+	 * One sync object per queue. The position of the sync object is
+	 * determined by the queue index.
+	 */
+	struct panthor_kernel_bo *syncobjs;
+
+	/** @state: Group state. */
+	enum panthor_group_state state;
+
+	/**
+	 * @suspend_buf: Suspend buffer.
+	 *
+	 * Stores the state of the group and its queues when a group is suspended.
+	 * Used at resume time to restore the group in its previous state.
+	 *
+	 * The size of the suspend buffer is exposed through the FW interface.
+	 */
+	struct panthor_kernel_bo *suspend_buf;
+
+	/**
+	 * @protm_suspend_buf: Protection mode suspend buffer.
+	 *
+	 * Stores the state of the group and its queues when a group that's in
+	 * protection mode is suspended.
+	 *
+	 * Used at resume time to restore the group in its previous state.
+	 *
+	 * The size of the protection mode suspend buffer is exposed through the
+	 * FW interface.
+	 */
+	struct panthor_kernel_bo *protm_suspend_buf;
+
+	/** @sync_upd_work: Work used to check/signal job fences. */
+	struct work_struct sync_upd_work;
+
+	/** @tiler_oom_work: Work used to process tiler OOM events happening on this group. */
+	struct work_struct tiler_oom_work;
+
+	/** @term_work: Work used to finish the group termination procedure. */
+	struct work_struct term_work;
+
+	/**
+	 * @release_work: Work used to release group resources.
+	 *
+	 * We need to postpone the group release to avoid a deadlock when
+	 * the last ref is released in the tick work.
+	 */
+	struct work_struct release_work;
+
+	/**
+	 * @run_node: Node used to insert the group in the
+	 * panthor_group::groups::{runnable,idle} and
+	 * panthor_group::reset.stopped_groups lists.
+	 */
+	struct list_head run_node;
+
+	/**
+	 * @wait_node: Node used to insert the group in the
+	 * panthor_group::groups::waiting list.
+	 */
+	struct list_head wait_node;
+};
+
+/**
+ * group_queue_work() - Queue a group work
+ * @group: Group to queue the work for.
+ * @wname: Work name.
+ *
+ * Grabs a ref and queue a work item to the scheduler workqueue. If
+ * the work was already queued, we release the reference we grabbed.
+ *
+ * Work callbacks must release the reference we grabbed here.
+ */
+#define group_queue_work(group, wname) \
+	do { \
+		group_get(group); \
+		if (!queue_work((group)->ptdev->scheduler->wq, &(group)->wname ## _work)) \
+			group_put(group); \
+	} while (0)
+
+/**
+ * sched_queue_work() - Queue a scheduler work.
+ * @sched: Scheduler object.
+ * @wname: Work name.
+ *
+ * Conditionally queues a scheduler work if no reset is pending/in-progress.
+ */
+#define sched_queue_work(sched, wname) \
+	do { \
+		if (!atomic_read(&(sched)->reset.in_progress) && \
+		    !panthor_device_reset_is_pending((sched)->ptdev)) \
+			queue_work((sched)->wq, &(sched)->wname ## _work); \
+	} while (0)
+
+/**
+ * sched_queue_delayed_work() - Queue a scheduler delayed work.
+ * @sched: Scheduler object.
+ * @wname: Work name.
+ * @delay: Work delay in jiffies.
+ *
+ * Conditionally queues a scheduler delayed work if no reset is
+ * pending/in-progress.
+ */
+#define sched_queue_delayed_work(sched, wname, delay) \
+	do { \
+		if (!atomic_read(&sched->reset.in_progress) && \
+		    !panthor_device_reset_is_pending((sched)->ptdev)) \
+			mod_delayed_work((sched)->wq, &(sched)->wname ## _work, delay); \
+	} while (0)
+
+/*
+ * We currently set the maximum of groups per file to an arbitrary low value.
+ * But this can be updated if we need more.
+ */
+#define MAX_GROUPS_PER_POOL 128
+
+/**
+ * struct panthor_group_pool - Group pool
+ *
+ * Each file get assigned a group pool.
+ */
+struct panthor_group_pool {
+	/** @xa: Xarray used to manage group handles. */
+	struct xarray xa;
+};
+
+/**
+ * struct panthor_job - Used to manage GPU job
+ */
+struct panthor_job {
+	/** @base: Inherit from drm_sched_job. */
+	struct drm_sched_job base;
+
+	/** @refcount: Reference count. */
+	struct kref refcount;
+
+	/** @group: Group of the queue this job will be pushed to. */
+	struct panthor_group *group;
+
+	/** @queue_idx: Index of the queue inside @group. */
+	u32 queue_idx;
+
+	/** @call_info: Information about the userspace command stream call. */
+	struct {
+		/** @start: GPU address of the userspace command stream. */
+		u64 start;
+
+		/** @size: Size of the userspace command stream. */
+		u32 size;
+
+		/**
+		 * @latest_flush: Flush ID at the time the userspace command
+		 * stream was built.
+		 *
+		 * Needed for the flush reduction mechanism.
+		 */
+		u32 latest_flush;
+	} call_info;
+
+	/** @ringbuf: Position of this job is in the ring buffer. */
+	struct {
+		/** @start: Start offset. */
+		u64 start;
+
+		/** @end: End offset. */
+		u64 end;
+	} ringbuf;
+
+	/**
+	 * @node: Used to insert the job in the panthor_queue::fence_ctx::in_flight_jobs
+	 * list.
+	 */
+	struct list_head node;
+
+	/** @done_fence: Fence signaled when the job is finished or cancelled. */
+	struct dma_fence *done_fence;
+};
+
+static void
+panthor_queue_put_syncwait_obj(struct panthor_queue *queue)
+{
+	if (queue->syncwait.kmap) {
+		struct iosys_map map = IOSYS_MAP_INIT_VADDR(queue->syncwait.kmap);
+
+		drm_gem_vunmap_unlocked(queue->syncwait.obj, &map);
+		queue->syncwait.kmap = NULL;
+	}
+
+	drm_gem_object_put(queue->syncwait.obj);
+	queue->syncwait.obj = NULL;
+}
+
+static void *
+panthor_queue_get_syncwait_obj(struct panthor_group *group, struct panthor_queue *queue)
+{
+	struct panthor_device *ptdev = group->ptdev;
+	struct panthor_gem_object *bo;
+	struct iosys_map map;
+	int ret;
+
+	if (queue->syncwait.kmap)
+		return queue->syncwait.kmap + queue->syncwait.offset;
+
+	bo = panthor_vm_get_bo_for_va(group->vm,
+				      queue->syncwait.gpu_va,
+				      &queue->syncwait.offset);
+	if (drm_WARN_ON(&ptdev->base, IS_ERR_OR_NULL(bo)))
+		goto err_put_syncwait_obj;
+
+	queue->syncwait.obj = &bo->base.base;
+	ret = drm_gem_vmap_unlocked(queue->syncwait.obj, &map);
+	if (drm_WARN_ON(&ptdev->base, ret))
+		goto err_put_syncwait_obj;
+
+	queue->syncwait.kmap = map.vaddr;
+	if (drm_WARN_ON(&ptdev->base, !queue->syncwait.kmap))
+		goto err_put_syncwait_obj;
+
+	return queue->syncwait.kmap + queue->syncwait.offset;
+
+err_put_syncwait_obj:
+	panthor_queue_put_syncwait_obj(queue);
+	return NULL;
+}
+
+static void group_free_queue(struct panthor_group *group, struct panthor_queue *queue)
+{
+	if (IS_ERR_OR_NULL(queue))
+		return;
+
+	if (queue->entity.fence_context)
+		drm_sched_entity_destroy(&queue->entity);
+
+	if (queue->scheduler.ops)
+		drm_sched_fini(&queue->scheduler);
+
+	panthor_queue_put_syncwait_obj(queue);
+
+	panthor_kernel_bo_destroy(group->vm, queue->ringbuf);
+	panthor_kernel_bo_destroy(panthor_fw_vm(group->ptdev), queue->iface.mem);
+
+	kfree(queue);
+}
+
+static void group_release_work(struct work_struct *work)
+{
+	struct panthor_group *group = container_of(work,
+						   struct panthor_group,
+						   release_work);
+	struct panthor_device *ptdev = group->ptdev;
+	u32 i;
+
+	for (i = 0; i < group->queue_count; i++)
+		group_free_queue(group, group->queues[i]);
+
+	panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), group->suspend_buf);
+	panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), group->protm_suspend_buf);
+	panthor_kernel_bo_destroy(group->vm, group->syncobjs);
+
+	panthor_vm_put(group->vm);
+	kfree(group);
+}
+
+static void group_release(struct kref *kref)
+{
+	struct panthor_group *group = container_of(kref,
+						   struct panthor_group,
+						   refcount);
+	struct panthor_device *ptdev = group->ptdev;
+
+	drm_WARN_ON(&ptdev->base, group->csg_id >= 0);
+	drm_WARN_ON(&ptdev->base, !list_empty(&group->run_node));
+	drm_WARN_ON(&ptdev->base, !list_empty(&group->wait_node));
+
+	queue_work(panthor_cleanup_wq, &group->release_work);
+}
+
+static void group_put(struct panthor_group *group)
+{
+	if (group)
+		kref_put(&group->refcount, group_release);
+}
+
+static struct panthor_group *
+group_get(struct panthor_group *group)
+{
+	if (group)
+		kref_get(&group->refcount);
+
+	return group;
+}
+
+/**
+ * group_bind_locked() - Bind a group to a group slot
+ * @group: Group.
+ * @csg_id: Slot.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int
+group_bind_locked(struct panthor_group *group, u32 csg_id)
+{
+	struct panthor_device *ptdev = group->ptdev;
+	struct panthor_csg_slot *csg_slot;
+	int ret;
+
+	lockdep_assert_held(&ptdev->scheduler->lock);
+
+	if (drm_WARN_ON(&ptdev->base, group->csg_id != -1 || csg_id >= MAX_CSGS ||
+			ptdev->scheduler->csg_slots[csg_id].group))
+		return -EINVAL;
+
+	ret = panthor_vm_active(group->vm);
+	if (ret)
+		return ret;
+
+	csg_slot = &ptdev->scheduler->csg_slots[csg_id];
+	group_get(group);
+	group->csg_id = csg_id;
+
+	/* Dummy doorbell allocation: doorbell is assigned to the group and
+	 * all queues use the same doorbell.
+	 *
+	 * TODO: Implement LRU-based doorbell assignment, so the most often
+	 * updated queues get their own doorbell, thus avoiding useless checks
+	 * on queues belonging to the same group that are rarely updated.
+	 */
+	for (u32 i = 0; i < group->queue_count; i++)
+		group->queues[i]->doorbell_id = csg_id + 1;
+
+	csg_slot->group = group;
+
+	return 0;
+}
+
+/**
+ * group_unbind_locked() - Unbind a group from a slot.
+ * @group: Group to unbind.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int
+group_unbind_locked(struct panthor_group *group)
+{
+	struct panthor_device *ptdev = group->ptdev;
+	struct panthor_csg_slot *slot;
+
+	lockdep_assert_held(&ptdev->scheduler->lock);
+
+	if (drm_WARN_ON(&ptdev->base, group->csg_id < 0 || group->csg_id >= MAX_CSGS))
+		return -EINVAL;
+
+	if (drm_WARN_ON(&ptdev->base, group->state == PANTHOR_CS_GROUP_ACTIVE))
+		return -EINVAL;
+
+	slot = &ptdev->scheduler->csg_slots[group->csg_id];
+	panthor_vm_idle(group->vm);
+	group->csg_id = -1;
+
+	/* Tiler OOM events will be re-issued next time the group is scheduled. */
+	atomic_set(&group->tiler_oom, 0);
+	cancel_work(&group->tiler_oom_work);
+
+	for (u32 i = 0; i < group->queue_count; i++)
+		group->queues[i]->doorbell_id = -1;
+
+	slot->group = NULL;
+
+	group_put(group);
+	return 0;
+}
+
+/**
+ * cs_slot_prog_locked() - Program a queue slot
+ * @ptdev: Device.
+ * @csg_id: Group slot ID.
+ * @cs_id: Queue slot ID.
+ *
+ * Program a queue slot with the queue information so things can start being
+ * executed on this queue.
+ *
+ * The group slot must have a group bound to it already (group_bind_locked()).
+ */
+static void
+cs_slot_prog_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs_id)
+{
+	struct panthor_queue *queue = ptdev->scheduler->csg_slots[csg_id].group->queues[cs_id];
+	struct panthor_fw_cs_iface *cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
+
+	lockdep_assert_held(&ptdev->scheduler->lock);
+
+	queue->iface.input->extract = queue->iface.output->extract;
+	drm_WARN_ON(&ptdev->base, queue->iface.input->insert < queue->iface.input->extract);
+
+	cs_iface->input->ringbuf_base = panthor_kernel_bo_gpuva(queue->ringbuf);
+	cs_iface->input->ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
+	cs_iface->input->ringbuf_input = queue->iface.input_fw_va;
+	cs_iface->input->ringbuf_output = queue->iface.output_fw_va;
+	cs_iface->input->config = CS_CONFIG_PRIORITY(queue->priority) |
+				  CS_CONFIG_DOORBELL(queue->doorbell_id);
+	cs_iface->input->ack_irq_mask = ~0;
+	panthor_fw_update_reqs(cs_iface, req,
+			       CS_IDLE_SYNC_WAIT |
+			       CS_IDLE_EMPTY |
+			       CS_STATE_START |
+			       CS_EXTRACT_EVENT,
+			       CS_IDLE_SYNC_WAIT |
+			       CS_IDLE_EMPTY |
+			       CS_STATE_MASK |
+			       CS_EXTRACT_EVENT);
+	if (queue->iface.input->insert != queue->iface.input->extract && queue->timeout_suspended) {
+		drm_sched_resume_timeout(&queue->scheduler, queue->remaining_time);
+		queue->timeout_suspended = false;
+	}
+}
+
+/**
+ * @cs_slot_reset_locked() - Reset a queue slot
+ * @ptdev: Device.
+ * @csg_id: Group slot.
+ * @cs_id: Queue slot.
+ *
+ * Change the queue slot state to STOP and suspend the queue timeout if
+ * the queue is not blocked.
+ *
+ * The group slot must have a group bound to it (group_bind_locked()).
+ */
+static int
+cs_slot_reset_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs_id)
+{
+	struct panthor_fw_cs_iface *cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
+	struct panthor_group *group = ptdev->scheduler->csg_slots[csg_id].group;
+	struct panthor_queue *queue = group->queues[cs_id];
+
+	lockdep_assert_held(&ptdev->scheduler->lock);
+
+	panthor_fw_update_reqs(cs_iface, req,
+			       CS_STATE_STOP,
+			       CS_STATE_MASK);
+
+	/* If the queue is blocked, we want to keep the timeout running, so
+	 * we can detect unbounded waits and kill the group when that happens.
+	 */
+	if (!(group->blocked_queues & BIT(cs_id)) && !queue->timeout_suspended) {
+		queue->remaining_time = drm_sched_suspend_timeout(&queue->scheduler);
+		queue->timeout_suspended = true;
+		WARN_ON(queue->remaining_time > msecs_to_jiffies(JOB_TIMEOUT_MS));
+	}
+
+	return 0;
+}
+
+/**
+ * csg_slot_sync_priority_locked() - Synchronize the group slot priority
+ * @ptdev: Device.
+ * @csg_id: Group slot ID.
+ *
+ * Group slot priority update happens asynchronously. When we receive a
+ * %CSG_ENDPOINT_CONFIG, we know the update is effective, and can
+ * reflect it to our panthor_csg_slot object.
+ */
+static void
+csg_slot_sync_priority_locked(struct panthor_device *ptdev, u32 csg_id)
+{
+	struct panthor_csg_slot *csg_slot = &ptdev->scheduler->csg_slots[csg_id];
+	struct panthor_fw_csg_iface *csg_iface;
+
+	lockdep_assert_held(&ptdev->scheduler->lock);
+
+	csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
+	csg_slot->priority = (csg_iface->input->endpoint_req & CSG_EP_REQ_PRIORITY_MASK) >> 28;
+}
+
+/**
+ * cs_slot_sync_queue_state_locked() - Synchronize the queue slot priority
+ * @ptdev: Device.
+ * @csg_id: Group slot.
+ * @cs_id: Queue slot.
+ *
+ * Queue state is updated on group suspend or STATUS_UPDATE event.
+ */
+static void
+cs_slot_sync_queue_state_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs_id)
+{
+	struct panthor_group *group = ptdev->scheduler->csg_slots[csg_id].group;
+	struct panthor_queue *queue = group->queues[cs_id];
+	struct panthor_fw_cs_iface *cs_iface =
+		panthor_fw_get_cs_iface(group->ptdev, csg_id, cs_id);
+
+	u32 status_wait_cond;
+
+	switch (cs_iface->output->status_blocked_reason) {
+	case CS_STATUS_BLOCKED_REASON_UNBLOCKED:
+		if (queue->iface.input->insert == queue->iface.output->extract &&
+		    cs_iface->output->status_scoreboards == 0)
+			group->idle_queues |= BIT(cs_id);
+		break;
+
+	case CS_STATUS_BLOCKED_REASON_SYNC_WAIT:
+		if (list_empty(&group->wait_node)) {
+			list_move_tail(&group->wait_node,
+				       &group->ptdev->scheduler->groups.waiting);
+		}
+		group->blocked_queues |= BIT(cs_id);
+		queue->syncwait.gpu_va = cs_iface->output->status_wait_sync_ptr;
+		queue->syncwait.ref = cs_iface->output->status_wait_sync_value;
+		status_wait_cond = cs_iface->output->status_wait & CS_STATUS_WAIT_SYNC_COND_MASK;
+		queue->syncwait.gt = status_wait_cond == CS_STATUS_WAIT_SYNC_COND_GT;
+		if (cs_iface->output->status_wait & CS_STATUS_WAIT_SYNC_64B) {
+			u64 sync_val_hi = cs_iface->output->status_wait_sync_value_hi;
+
+			queue->syncwait.sync64 = true;
+			queue->syncwait.ref |= sync_val_hi << 32;
+		} else {
+			queue->syncwait.sync64 = false;
+		}
+		break;
+
+	default:
+		/* Other reasons are not blocking. Consider the queue as runnable
+		 * in those cases.
+		 */
+		break;
+	}
+}
+
+static void
+csg_slot_sync_queues_state_locked(struct panthor_device *ptdev, u32 csg_id)
+{
+	struct panthor_csg_slot *csg_slot = &ptdev->scheduler->csg_slots[csg_id];
+	struct panthor_group *group = csg_slot->group;
+	u32 i;
+
+	lockdep_assert_held(&ptdev->scheduler->lock);
+
+	group->idle_queues = 0;
+	group->blocked_queues = 0;
+
+	for (i = 0; i < group->queue_count; i++) {
+		if (group->queues[i])
+			cs_slot_sync_queue_state_locked(ptdev, csg_id, i);
+	}
+}
+
+static void
+csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id)
+{
+	struct panthor_csg_slot *csg_slot = &ptdev->scheduler->csg_slots[csg_id];
+	struct panthor_fw_csg_iface *csg_iface;
+	struct panthor_group *group;
+	enum panthor_group_state new_state, old_state;
+
+	lockdep_assert_held(&ptdev->scheduler->lock);
+
+	csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
+	group = csg_slot->group;
+
+	if (!group)
+		return;
+
+	old_state = group->state;
+	switch (csg_iface->output->ack & CSG_STATE_MASK) {
+	case CSG_STATE_START:
+	case CSG_STATE_RESUME:
+		new_state = PANTHOR_CS_GROUP_ACTIVE;
+		break;
+	case CSG_STATE_TERMINATE:
+		new_state = PANTHOR_CS_GROUP_TERMINATED;
+		break;
+	case CSG_STATE_SUSPEND:
+		new_state = PANTHOR_CS_GROUP_SUSPENDED;
+		break;
+	}
+
+	if (old_state == new_state)
+		return;
+
+	if (new_state == PANTHOR_CS_GROUP_SUSPENDED)
+		csg_slot_sync_queues_state_locked(ptdev, csg_id);
+
+	if (old_state == PANTHOR_CS_GROUP_ACTIVE) {
+		u32 i;
+
+		/* Reset the queue slots so we start from a clean
+		 * state when starting/resuming a new group on this
+		 * CSG slot. No wait needed here, and no ringbell
+		 * either, since the CS slot will only be re-used
+		 * on the next CSG start operation.
+		 */
+		for (i = 0; i < group->queue_count; i++) {
+			if (group->queues[i])
+				cs_slot_reset_locked(ptdev, csg_id, i);
+		}
+	}
+
+	group->state = new_state;
+}
+
+static int
+csg_slot_prog_locked(struct panthor_device *ptdev, u32 csg_id, u32 priority)
+{
+	struct panthor_fw_csg_iface *csg_iface;
+	struct panthor_csg_slot *csg_slot;
+	struct panthor_group *group;
+	u32 queue_mask = 0, i;
+
+	lockdep_assert_held(&ptdev->scheduler->lock);
+
+	if (priority > MAX_CSG_PRIO)
+		return -EINVAL;
+
+	if (drm_WARN_ON(&ptdev->base, csg_id >= MAX_CSGS))
+		return -EINVAL;
+
+	csg_slot = &ptdev->scheduler->csg_slots[csg_id];
+	group = csg_slot->group;
+	if (!group || group->state == PANTHOR_CS_GROUP_ACTIVE)
+		return 0;
+
+	csg_iface = panthor_fw_get_csg_iface(group->ptdev, csg_id);
+
+	for (i = 0; i < group->queue_count; i++) {
+		if (group->queues[i]) {
+			cs_slot_prog_locked(ptdev, csg_id, i);
+			queue_mask |= BIT(i);
+		}
+	}
+
+	csg_iface->input->allow_compute = group->compute_core_mask;
+	csg_iface->input->allow_fragment = group->fragment_core_mask;
+	csg_iface->input->allow_other = group->tiler_core_mask;
+	csg_iface->input->endpoint_req = CSG_EP_REQ_COMPUTE(group->max_compute_cores) |
+					 CSG_EP_REQ_FRAGMENT(group->max_fragment_cores) |
+					 CSG_EP_REQ_TILER(group->max_tiler_cores) |
+					 CSG_EP_REQ_PRIORITY(priority);
+	csg_iface->input->config = panthor_vm_as(group->vm);
+
+	if (group->suspend_buf)
+		csg_iface->input->suspend_buf = panthor_kernel_bo_gpuva(group->suspend_buf);
+	else
+		csg_iface->input->suspend_buf = 0;
+
+	if (group->protm_suspend_buf) {
+		csg_iface->input->protm_suspend_buf =
+			panthor_kernel_bo_gpuva(group->protm_suspend_buf);
+	} else {
+		csg_iface->input->protm_suspend_buf = 0;
+	}
+
+	csg_iface->input->ack_irq_mask = ~0;
+	panthor_fw_toggle_reqs(csg_iface, doorbell_req, doorbell_ack, queue_mask);
+	return 0;
+}
+
+static void
+cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
+				   u32 csg_id, u32 cs_id)
+{
+	struct panthor_scheduler *sched = ptdev->scheduler;
+	struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
+	struct panthor_group *group = csg_slot->group;
+	struct panthor_fw_cs_iface *cs_iface;
+	u32 fatal;
+	u64 info;
+
+	lockdep_assert_held(&sched->lock);
+
+	cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
+	fatal = cs_iface->output->fatal;
+	info = cs_iface->output->fatal_info;
+
+	if (group)
+		group->fatal_queues |= BIT(cs_id);
+
+	sched_queue_delayed_work(sched, tick, 0);
+	drm_warn(&ptdev->base,
+		 "CSG slot %d CS slot: %d\n"
+		 "CS_FATAL.EXCEPTION_TYPE: 0x%x (%s)\n"
+		 "CS_FATAL.EXCEPTION_DATA: 0x%x\n"
+		 "CS_FATAL_INFO.EXCEPTION_DATA: 0x%llx\n",
+		 csg_id, cs_id,
+		 (unsigned int)CS_EXCEPTION_TYPE(fatal),
+		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fatal)),
+		 (unsigned int)CS_EXCEPTION_DATA(fatal),
+		 info);
+}
+
+static void
+cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
+				   u32 csg_id, u32 cs_id)
+{
+	struct panthor_scheduler *sched = ptdev->scheduler;
+	struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
+	struct panthor_group *group = csg_slot->group;
+	struct panthor_queue *queue = group && cs_id < group->queue_count ?
+				      group->queues[cs_id] : NULL;
+	struct panthor_fw_cs_iface *cs_iface;
+	u32 fault;
+	u64 info;
+
+	lockdep_assert_held(&sched->lock);
+
+	cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
+	fault = cs_iface->output->fault;
+	info = cs_iface->output->fault_info;
+
+	if (queue && CS_EXCEPTION_TYPE(fault) == DRM_PANTHOR_EXCEPTION_CS_INHERIT_FAULT) {
+		u64 cs_extract = queue->iface.output->extract;
+		struct panthor_job *job;
+
+		spin_lock(&queue->fence_ctx.lock);
+		list_for_each_entry(job, &queue->fence_ctx.in_flight_jobs, node) {
+			if (cs_extract >= job->ringbuf.end)
+				continue;
+
+			if (cs_extract < job->ringbuf.start)
+				break;
+
+			dma_fence_set_error(job->done_fence, -EINVAL);
+		}
+		spin_unlock(&queue->fence_ctx.lock);
+	}
+
+	drm_warn(&ptdev->base,
+		 "CSG slot %d CS slot: %d\n"
+		 "CS_FAULT.EXCEPTION_TYPE: 0x%x (%s)\n"
+		 "CS_FAULT.EXCEPTION_DATA: 0x%x\n"
+		 "CS_FAULT_INFO.EXCEPTION_DATA: 0x%llx\n",
+		 csg_id, cs_id,
+		 (unsigned int)CS_EXCEPTION_TYPE(fault),
+		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fault)),
+		 (unsigned int)CS_EXCEPTION_DATA(fault),
+		 info);
+}
+
+static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
+{
+	struct panthor_device *ptdev = group->ptdev;
+	struct panthor_scheduler *sched = ptdev->scheduler;
+	u32 renderpasses_in_flight, pending_frag_count;
+	struct panthor_heap_pool *heaps = NULL;
+	u64 heap_address, new_chunk_va = 0;
+	u32 vt_start, vt_end, frag_end;
+	int ret, csg_id;
+
+	mutex_lock(&sched->lock);
+	csg_id = group->csg_id;
+	if (csg_id >= 0) {
+		struct panthor_fw_cs_iface *cs_iface;
+
+		cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
+		heaps = panthor_vm_get_heap_pool(group->vm, false);
+		heap_address = cs_iface->output->heap_address;
+		vt_start = cs_iface->output->heap_vt_start;
+		vt_end = cs_iface->output->heap_vt_end;
+		frag_end = cs_iface->output->heap_frag_end;
+		renderpasses_in_flight = vt_start - frag_end;
+		pending_frag_count = vt_end - frag_end;
+	}
+	mutex_unlock(&sched->lock);
+
+	/* The group got scheduled out, we stop here. We will get a new tiler OOM event
+	 * when it's scheduled again.
+	 */
+	if (unlikely(csg_id < 0))
+		return 0;
+
+	if (!heaps || frag_end > vt_end || vt_end >= vt_start) {
+		ret = -EINVAL;
+	} else {
+		/* We do the allocation without holding the scheduler lock to avoid
+		 * blocking the scheduling.
+		 */
+		ret = panthor_heap_grow(heaps, heap_address,
+					renderpasses_in_flight,
+					pending_frag_count, &new_chunk_va);
+	}
+
+	if (ret && ret != -EBUSY) {
+		drm_warn(&ptdev->base, "Failed to extend the tiler heap\n");
+		group->fatal_queues |= BIT(cs_id);
+		sched_queue_delayed_work(sched, tick, 0);
+		goto out_put_heap_pool;
+	}
+
+	mutex_lock(&sched->lock);
+	csg_id = group->csg_id;
+	if (csg_id >= 0) {
+		struct panthor_fw_csg_iface *csg_iface;
+		struct panthor_fw_cs_iface *cs_iface;
+
+		csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
+		cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
+
+		cs_iface->input->heap_start = new_chunk_va;
+		cs_iface->input->heap_end = new_chunk_va;
+		panthor_fw_update_reqs(cs_iface, req, cs_iface->output->ack, CS_TILER_OOM);
+		panthor_fw_toggle_reqs(csg_iface, doorbell_req, doorbell_ack, BIT(cs_id));
+		panthor_fw_ring_csg_doorbells(ptdev, BIT(csg_id));
+	}
+	mutex_unlock(&sched->lock);
+
+	/* We allocated a chunck, but couldn't link it to the heap
+	 * context because the group was scheduled out while we were
+	 * allocating memory. We need to return this chunk to the heap.
+	 */
+	if (unlikely(csg_id < 0 && new_chunk_va))
+		panthor_heap_return_chunk(heaps, heap_address, new_chunk_va);
+
+	ret = 0;
+
+out_put_heap_pool:
+	panthor_heap_pool_put(heaps);
+	return ret;
+}
+
+static void group_tiler_oom_work(struct work_struct *work)
+{
+	struct panthor_group *group =
+		container_of(work, struct panthor_group, tiler_oom_work);
+	u32 tiler_oom = atomic_xchg(&group->tiler_oom, 0);
+
+	while (tiler_oom) {
+		u32 cs_id = ffs(tiler_oom) - 1;
+
+		group_process_tiler_oom(group, cs_id);
+		tiler_oom &= ~BIT(cs_id);
+	}
+
+	group_put(group);
+}
+
+static void
+cs_slot_process_tiler_oom_event_locked(struct panthor_device *ptdev,
+				       u32 csg_id, u32 cs_id)
+{
+	struct panthor_scheduler *sched = ptdev->scheduler;
+	struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
+	struct panthor_group *group = csg_slot->group;
+
+	lockdep_assert_held(&sched->lock);
+
+	if (drm_WARN_ON(&ptdev->base, !group))
+		return;
+
+	atomic_or(BIT(cs_id), &group->tiler_oom);
+
+	/* We don't use group_queue_work() here because we want to queue the
+	 * work item to the heap_alloc_wq.
+	 */
+	group_get(group);
+	if (!queue_work(sched->heap_alloc_wq, &group->tiler_oom_work))
+		group_put(group);
+}
+
+static bool cs_slot_process_irq_locked(struct panthor_device *ptdev,
+				       u32 csg_id, u32 cs_id)
+{
+	struct panthor_fw_cs_iface *cs_iface;
+	u32 req, ack, events;
+
+	lockdep_assert_held(&ptdev->scheduler->lock);
+
+	cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
+	req = cs_iface->input->req;
+	ack = cs_iface->output->ack;
+	events = (req ^ ack) & CS_EVT_MASK;
+
+	if (events & CS_FATAL)
+		cs_slot_process_fatal_event_locked(ptdev, csg_id, cs_id);
+
+	if (events & CS_FAULT)
+		cs_slot_process_fault_event_locked(ptdev, csg_id, cs_id);
+
+	if (events & CS_TILER_OOM)
+		cs_slot_process_tiler_oom_event_locked(ptdev, csg_id, cs_id);
+
+	/* We don't acknowledge the TILER_OOM event since its handling is
+	 * deferred to a separate work.
+	 */
+	panthor_fw_update_reqs(cs_iface, req, ack, CS_FATAL | CS_FAULT);
+
+	return (events & (CS_FAULT | CS_TILER_OOM)) != 0;
+}
+
+static void csg_slot_sync_idle_state_locked(struct panthor_device *ptdev, u32 csg_id)
+{
+	struct panthor_csg_slot *csg_slot = &ptdev->scheduler->csg_slots[csg_id];
+	struct panthor_fw_csg_iface *csg_iface;
+
+	lockdep_assert_held(&ptdev->scheduler->lock);
+
+	csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
+	csg_slot->idle = csg_iface->output->status_state & CSG_STATUS_STATE_IS_IDLE;
+}
+
+static void csg_slot_process_idle_event_locked(struct panthor_device *ptdev, u32 csg_id)
+{
+	struct panthor_scheduler *sched = ptdev->scheduler;
+
+	lockdep_assert_held(&sched->lock);
+
+	sched->might_have_idle_groups = true;
+
+	/* Schedule a tick so we can evict idle groups and schedule non-idle
+	 * ones. This will also update runtime PM and devfreq busy/idle states,
+	 * so the device can lower its frequency or get suspended.
+	 */
+	sched_queue_delayed_work(sched, tick, 0);
+}
+
+static void csg_slot_sync_update_locked(struct panthor_device *ptdev,
+					u32 csg_id)
+{
+	struct panthor_csg_slot *csg_slot = &ptdev->scheduler->csg_slots[csg_id];
+	struct panthor_group *group = csg_slot->group;
+
+	lockdep_assert_held(&ptdev->scheduler->lock);
+
+	if (group)
+		group_queue_work(group, sync_upd);
+
+	sched_queue_work(ptdev->scheduler, sync_upd);
+}
+
+static void
+csg_slot_process_progress_timer_event_locked(struct panthor_device *ptdev, u32 csg_id)
+{
+	struct panthor_scheduler *sched = ptdev->scheduler;
+	struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
+	struct panthor_group *group = csg_slot->group;
+
+	lockdep_assert_held(&sched->lock);
+
+	drm_warn(&ptdev->base, "CSG slot %d progress timeout\n", csg_id);
+
+	group = csg_slot->group;
+	if (!drm_WARN_ON(&ptdev->base, !group))
+		group->timedout = true;
+
+	sched_queue_delayed_work(sched, tick, 0);
+}
+
+static void sched_process_csg_irq_locked(struct panthor_device *ptdev, u32 csg_id)
+{
+	u32 req, ack, cs_irq_req, cs_irq_ack, cs_irqs, csg_events;
+	struct panthor_fw_csg_iface *csg_iface;
+	u32 ring_cs_db_mask = 0;
+
+	lockdep_assert_held(&ptdev->scheduler->lock);
+
+	if (drm_WARN_ON(&ptdev->base, csg_id >= ptdev->scheduler->csg_slot_count))
+		return;
+
+	csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
+	req = READ_ONCE(csg_iface->input->req);
+	ack = READ_ONCE(csg_iface->output->ack);
+	cs_irq_req = READ_ONCE(csg_iface->output->cs_irq_req);
+	cs_irq_ack = READ_ONCE(csg_iface->input->cs_irq_ack);
+	csg_events = (req ^ ack) & CSG_EVT_MASK;
+
+	/* There may not be any pending CSG/CS interrupts to process */
+	if (req == ack && cs_irq_req == cs_irq_ack)
+		return;
+
+	/* Immediately set IRQ_ACK bits to be same as the IRQ_REQ bits before
+	 * examining the CS_ACK & CS_REQ bits. This would ensure that Host
+	 * doesn't miss an interrupt for the CS in the race scenario where
+	 * whilst Host is servicing an interrupt for the CS, firmware sends
+	 * another interrupt for that CS.
+	 */
+	csg_iface->input->cs_irq_ack = cs_irq_req;
+
+	panthor_fw_update_reqs(csg_iface, req, ack,
+			       CSG_SYNC_UPDATE |
+			       CSG_IDLE |
+			       CSG_PROGRESS_TIMER_EVENT);
+
+	if (csg_events & CSG_IDLE)
+		csg_slot_process_idle_event_locked(ptdev, csg_id);
+
+	if (csg_events & CSG_PROGRESS_TIMER_EVENT)
+		csg_slot_process_progress_timer_event_locked(ptdev, csg_id);
+
+	cs_irqs = cs_irq_req ^ cs_irq_ack;
+	while (cs_irqs) {
+		u32 cs_id = ffs(cs_irqs) - 1;
+
+		if (cs_slot_process_irq_locked(ptdev, csg_id, cs_id))
+			ring_cs_db_mask |= BIT(cs_id);
+
+		cs_irqs &= ~BIT(cs_id);
+	}
+
+	if (csg_events & CSG_SYNC_UPDATE)
+		csg_slot_sync_update_locked(ptdev, csg_id);
+
+	if (ring_cs_db_mask)
+		panthor_fw_toggle_reqs(csg_iface, doorbell_req, doorbell_ack, ring_cs_db_mask);
+
+	panthor_fw_ring_csg_doorbells(ptdev, BIT(csg_id));
+}
+
+static void sched_process_idle_event_locked(struct panthor_device *ptdev)
+{
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+
+	lockdep_assert_held(&ptdev->scheduler->lock);
+
+	/* Acknowledge the idle event and schedule a tick. */
+	panthor_fw_update_reqs(glb_iface, req, glb_iface->output->ack, GLB_IDLE);
+	sched_queue_delayed_work(ptdev->scheduler, tick, 0);
+}
+
+/**
+ * panthor_sched_process_global_irq() - Process the scheduling part of a global IRQ
+ * @ptdev: Device.
+ */
+static void sched_process_global_irq_locked(struct panthor_device *ptdev)
+{
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+	u32 req, ack, evts;
+
+	lockdep_assert_held(&ptdev->scheduler->lock);
+
+	req = READ_ONCE(glb_iface->input->req);
+	ack = READ_ONCE(glb_iface->output->ack);
+	evts = (req ^ ack) & GLB_EVT_MASK;
+
+	if (evts & GLB_IDLE)
+		sched_process_idle_event_locked(ptdev);
+}
+
+static void process_fw_events_work(struct work_struct *work)
+{
+	struct panthor_scheduler *sched = container_of(work, struct panthor_scheduler,
+						      fw_events_work);
+	u32 events = atomic_xchg(&sched->fw_events, 0);
+	struct panthor_device *ptdev = sched->ptdev;
+
+	mutex_lock(&sched->lock);
+
+	if (events & JOB_INT_GLOBAL_IF) {
+		sched_process_global_irq_locked(ptdev);
+		events &= ~JOB_INT_GLOBAL_IF;
+	}
+
+	while (events) {
+		u32 csg_id = ffs(events) - 1;
+
+		sched_process_csg_irq_locked(ptdev, csg_id);
+		events &= ~BIT(csg_id);
+	}
+
+	mutex_unlock(&sched->lock);
+}
+
+/**
+ * panthor_sched_report_fw_events() - Report FW events to the scheduler.
+ */
+void panthor_sched_report_fw_events(struct panthor_device *ptdev, u32 events)
+{
+	if (!ptdev->scheduler)
+		return;
+
+	atomic_or(events, &ptdev->scheduler->fw_events);
+	sched_queue_work(ptdev->scheduler, fw_events);
+}
+
+static const char *fence_get_driver_name(struct dma_fence *fence)
+{
+	return "panthor";
+}
+
+static const char *queue_fence_get_timeline_name(struct dma_fence *fence)
+{
+	return "queue-fence";
+}
+
+static const struct dma_fence_ops panthor_queue_fence_ops = {
+	.get_driver_name = fence_get_driver_name,
+	.get_timeline_name = queue_fence_get_timeline_name,
+};
+
+/**
+ */
+struct panthor_csg_slots_upd_ctx {
+	u32 update_mask;
+	u32 timedout_mask;
+	struct {
+		u32 value;
+		u32 mask;
+	} requests[MAX_CSGS];
+};
+
+static void csgs_upd_ctx_init(struct panthor_csg_slots_upd_ctx *ctx)
+{
+	memset(ctx, 0, sizeof(*ctx));
+}
+
+static void csgs_upd_ctx_queue_reqs(struct panthor_device *ptdev,
+				    struct panthor_csg_slots_upd_ctx *ctx,
+				    u32 csg_id, u32 value, u32 mask)
+{
+	if (drm_WARN_ON(&ptdev->base, !mask) ||
+	    drm_WARN_ON(&ptdev->base, csg_id >= ptdev->scheduler->csg_slot_count))
+		return;
+
+	ctx->requests[csg_id].value = (ctx->requests[csg_id].value & ~mask) | (value & mask);
+	ctx->requests[csg_id].mask |= mask;
+	ctx->update_mask |= BIT(csg_id);
+}
+
+static int csgs_upd_ctx_apply_locked(struct panthor_device *ptdev,
+				     struct panthor_csg_slots_upd_ctx *ctx)
+{
+	struct panthor_scheduler *sched = ptdev->scheduler;
+	u32 update_slots = ctx->update_mask;
+
+	lockdep_assert_held(&sched->lock);
+
+	if (!ctx->update_mask)
+		return 0;
+
+	while (update_slots) {
+		struct panthor_fw_csg_iface *csg_iface;
+		u32 csg_id = ffs(update_slots) - 1;
+
+		update_slots &= ~BIT(csg_id);
+		csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
+		panthor_fw_update_reqs(csg_iface, req,
+				       ctx->requests[csg_id].value,
+				       ctx->requests[csg_id].mask);
+	}
+
+	panthor_fw_ring_csg_doorbells(ptdev, ctx->update_mask);
+
+	update_slots = ctx->update_mask;
+	while (update_slots) {
+		struct panthor_fw_csg_iface *csg_iface;
+		u32 csg_id = ffs(update_slots) - 1;
+		u32 req_mask = ctx->requests[csg_id].mask, acked;
+		int ret;
+
+		update_slots &= ~BIT(csg_id);
+		csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
+
+		ret = panthor_fw_csg_wait_acks(ptdev, csg_id, req_mask, &acked, 100);
+
+		if (acked & CSG_ENDPOINT_CONFIG)
+			csg_slot_sync_priority_locked(ptdev, csg_id);
+
+		if (acked & CSG_STATE_MASK)
+			csg_slot_sync_state_locked(ptdev, csg_id);
+
+		if (acked & CSG_STATUS_UPDATE) {
+			csg_slot_sync_queues_state_locked(ptdev, csg_id);
+			csg_slot_sync_idle_state_locked(ptdev, csg_id);
+		}
+
+		if (ret && acked != req_mask &&
+		    ((csg_iface->input->req ^ csg_iface->output->ack) & req_mask) != 0) {
+			drm_err(&ptdev->base, "CSG %d update request timedout", csg_id);
+			ctx->timedout_mask |= BIT(csg_id);
+		}
+	}
+
+	if (ctx->timedout_mask)
+		return -ETIMEDOUT;
+
+	return 0;
+}
+
+struct panthor_sched_tick_ctx {
+	struct list_head old_groups[PANTHOR_CSG_PRIORITY_COUNT];
+	struct list_head groups[PANTHOR_CSG_PRIORITY_COUNT];
+	u32 idle_group_count;
+	u32 group_count;
+	enum panthor_csg_priority min_priority;
+	struct panthor_vm *vms[MAX_CS_PER_CSG];
+	u32 as_count;
+	bool immediate_tick;
+	u32 csg_upd_failed_mask;
+};
+
+static bool
+tick_ctx_is_full(const struct panthor_scheduler *sched,
+		 const struct panthor_sched_tick_ctx *ctx)
+{
+	return ctx->group_count == sched->csg_slot_count;
+}
+
+static bool
+group_is_idle(struct panthor_group *group)
+{
+	struct panthor_device *ptdev = group->ptdev;
+	u32 inactive_queues;
+
+	if (group->csg_id >= 0)
+		return ptdev->scheduler->csg_slots[group->csg_id].idle;
+
+	inactive_queues = group->idle_queues | group->blocked_queues;
+	return hweight32(inactive_queues) == group->queue_count;
+}
+
+static bool
+group_can_run(struct panthor_group *group)
+{
+	return group->state != PANTHOR_CS_GROUP_TERMINATED &&
+	       !group->destroyed && group->fatal_queues == 0 &&
+	       !group->timedout;
+}
+
+static void
+tick_ctx_pick_groups_from_list(const struct panthor_scheduler *sched,
+			       struct panthor_sched_tick_ctx *ctx,
+			       struct list_head *queue,
+			       bool skip_idle_groups,
+			       bool owned_by_tick_ctx)
+{
+	struct panthor_group *group, *tmp;
+
+	if (tick_ctx_is_full(sched, ctx))
+		return;
+
+	list_for_each_entry_safe(group, tmp, queue, run_node) {
+		u32 i;
+
+		if (!group_can_run(group))
+			continue;
+
+		if (skip_idle_groups && group_is_idle(group))
+			continue;
+
+		for (i = 0; i < ctx->as_count; i++) {
+			if (ctx->vms[i] == group->vm)
+				break;
+		}
+
+		if (i == ctx->as_count && ctx->as_count == sched->as_slot_count)
+			continue;
+
+		if (!owned_by_tick_ctx)
+			group_get(group);
+
+		list_move_tail(&group->run_node, &ctx->groups[group->priority]);
+		ctx->group_count++;
+		if (group_is_idle(group))
+			ctx->idle_group_count++;
+
+		if (i == ctx->as_count)
+			ctx->vms[ctx->as_count++] = group->vm;
+
+		if (ctx->min_priority > group->priority)
+			ctx->min_priority = group->priority;
+
+		if (tick_ctx_is_full(sched, ctx))
+			return;
+	}
+}
+
+static void
+tick_ctx_insert_old_group(struct panthor_scheduler *sched,
+			  struct panthor_sched_tick_ctx *ctx,
+			  struct panthor_group *group,
+			  bool full_tick)
+{
+	struct panthor_csg_slot *csg_slot = &sched->csg_slots[group->csg_id];
+	struct panthor_group *other_group;
+
+	if (!full_tick) {
+		list_add_tail(&group->run_node, &ctx->old_groups[group->priority]);
+		return;
+	}
+
+	/* Rotate to make sure groups with lower CSG slot
+	 * priorities have a chance to get a higher CSG slot
+	 * priority next time they get picked. This priority
+	 * has an impact on resource request ordering, so it's
+	 * important to make sure we don't let one group starve
+	 * all other groups with the same group priority.
+	 */
+	list_for_each_entry(other_group,
+			    &ctx->old_groups[csg_slot->group->priority],
+			    run_node) {
+		struct panthor_csg_slot *other_csg_slot = &sched->csg_slots[other_group->csg_id];
+
+		if (other_csg_slot->priority > csg_slot->priority) {
+			list_add_tail(&csg_slot->group->run_node, &other_group->run_node);
+			return;
+		}
+	}
+
+	list_add_tail(&group->run_node, &ctx->old_groups[group->priority]);
+}
+
+static void
+tick_ctx_init(struct panthor_scheduler *sched,
+	      struct panthor_sched_tick_ctx *ctx,
+	      bool full_tick)
+{
+	struct panthor_device *ptdev = sched->ptdev;
+	struct panthor_csg_slots_upd_ctx upd_ctx;
+	int ret;
+	u32 i;
+
+	memset(ctx, 0, sizeof(*ctx));
+	csgs_upd_ctx_init(&upd_ctx);
+
+	ctx->min_priority = PANTHOR_CSG_PRIORITY_COUNT;
+	for (i = 0; i < ARRAY_SIZE(ctx->groups); i++) {
+		INIT_LIST_HEAD(&ctx->groups[i]);
+		INIT_LIST_HEAD(&ctx->old_groups[i]);
+	}
+
+	for (i = 0; i < sched->csg_slot_count; i++) {
+		struct panthor_csg_slot *csg_slot = &sched->csg_slots[i];
+		struct panthor_group *group = csg_slot->group;
+		struct panthor_fw_csg_iface *csg_iface;
+
+		if (!group)
+			continue;
+
+		csg_iface = panthor_fw_get_csg_iface(ptdev, i);
+		group_get(group);
+
+		/* If there was unhandled faults on the VM, force processing of
+		 * CSG IRQs, so we can flag the faulty queue.
+		 */
+		if (panthor_vm_has_unhandled_faults(group->vm)) {
+			sched_process_csg_irq_locked(ptdev, i);
+
+			/* No fatal fault reported, flag all queues as faulty. */
+			if (!group->fatal_queues)
+				group->fatal_queues |= GENMASK(group->queue_count - 1, 0);
+		}
+
+		tick_ctx_insert_old_group(sched, ctx, group, full_tick);
+		csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, i,
+					csg_iface->output->ack ^ CSG_STATUS_UPDATE,
+					CSG_STATUS_UPDATE);
+	}
+
+	ret = csgs_upd_ctx_apply_locked(ptdev, &upd_ctx);
+	if (ret) {
+		panthor_device_schedule_reset(ptdev);
+		ctx->csg_upd_failed_mask |= upd_ctx.timedout_mask;
+	}
+}
+
+#define NUM_INSTRS_PER_SLOT		16
+
+static void
+group_term_post_processing(struct panthor_group *group)
+{
+	struct panthor_job *job, *tmp;
+	LIST_HEAD(faulty_jobs);
+	bool cookie;
+	u32 i = 0;
+
+	if (drm_WARN_ON(&group->ptdev->base, group_can_run(group)))
+		return;
+
+	cookie = dma_fence_begin_signalling();
+	for (i = 0; i < group->queue_count; i++) {
+		struct panthor_queue *queue = group->queues[i];
+		struct panthor_syncobj_64b *syncobj;
+		int err;
+
+		if (group->fatal_queues & BIT(i))
+			err = -EINVAL;
+		else if (group->timedout)
+			err = -ETIMEDOUT;
+		else
+			err = -ECANCELED;
+
+		if (!queue)
+			continue;
+
+		spin_lock(&queue->fence_ctx.lock);
+		list_for_each_entry_safe(job, tmp, &queue->fence_ctx.in_flight_jobs, node) {
+			list_move_tail(&job->node, &faulty_jobs);
+			dma_fence_set_error(job->done_fence, err);
+			dma_fence_signal_locked(job->done_fence);
+		}
+		spin_unlock(&queue->fence_ctx.lock);
+
+		/* Manually update the syncobj seqno to unblock waiters. */
+		syncobj = group->syncobjs->kmap + (i * sizeof(*syncobj));
+		syncobj->status = ~0;
+		syncobj->seqno = atomic64_read(&queue->fence_ctx.seqno);
+		sched_queue_work(group->ptdev->scheduler, sync_upd);
+	}
+	dma_fence_end_signalling(cookie);
+
+	list_for_each_entry_safe(job, tmp, &faulty_jobs, node) {
+		list_del_init(&job->node);
+		panthor_job_put(&job->base);
+	}
+}
+
+static void group_term_work(struct work_struct *work)
+{
+	struct panthor_group *group =
+		container_of(work, struct panthor_group, term_work);
+
+	group_term_post_processing(group);
+	group_put(group);
+}
+
+static void
+tick_ctx_cleanup(struct panthor_scheduler *sched,
+		 struct panthor_sched_tick_ctx *ctx)
+{
+	struct panthor_group *group, *tmp;
+	u32 i;
+
+	for (i = 0; i < ARRAY_SIZE(ctx->old_groups); i++) {
+		list_for_each_entry_safe(group, tmp, &ctx->old_groups[i], run_node) {
+			/* If everything went fine, we should only have groups
+			 * to be terminated in the old_groups lists.
+			 */
+			drm_WARN_ON(&group->ptdev->base, !ctx->csg_upd_failed_mask &&
+				    group_can_run(group));
+
+			if (!group_can_run(group)) {
+				list_del_init(&group->run_node);
+				list_del_init(&group->wait_node);
+				group_queue_work(group, term);
+			} else if (group->csg_id >= 0) {
+				list_del_init(&group->run_node);
+			} else {
+				list_move(&group->run_node,
+					  group_is_idle(group) ?
+					  &sched->groups.idle[group->priority] :
+					  &sched->groups.runnable[group->priority]);
+			}
+			group_put(group);
+		}
+	}
+
+	for (i = 0; i < ARRAY_SIZE(ctx->groups); i++) {
+		/* If everything went fine, the groups to schedule lists should
+		 * be empty.
+		 */
+		drm_WARN_ON(&group->ptdev->base,
+			    !ctx->csg_upd_failed_mask && !list_empty(&ctx->groups[i]));
+
+		list_for_each_entry_safe(group, tmp, &ctx->groups[i], run_node) {
+			if (group->csg_id >= 0) {
+				list_del_init(&group->run_node);
+			} else {
+				list_move(&group->run_node,
+					  group_is_idle(group) ?
+					  &sched->groups.idle[group->priority] :
+					  &sched->groups.runnable[group->priority]);
+			}
+			group_put(group);
+		}
+	}
+}
+
+static void
+tick_ctx_apply(struct panthor_scheduler *sched, struct panthor_sched_tick_ctx *ctx)
+{
+	struct panthor_group *group, *tmp;
+	struct panthor_device *ptdev = sched->ptdev;
+	struct panthor_csg_slot *csg_slot;
+	int prio, new_csg_prio = MAX_CSG_PRIO, i;
+	u32 csg_mod_mask = 0, free_csg_slots = 0;
+	struct panthor_csg_slots_upd_ctx upd_ctx;
+	int ret;
+
+	csgs_upd_ctx_init(&upd_ctx);
+
+	for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) {
+		/* Suspend or terminate evicted groups. */
+		list_for_each_entry(group, &ctx->old_groups[prio], run_node) {
+			bool term = !group_can_run(group);
+			int csg_id = group->csg_id;
+
+			if (drm_WARN_ON(&ptdev->base, csg_id < 0))
+				continue;
+
+			csg_slot = &sched->csg_slots[csg_id];
+			csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, csg_id,
+						term ? CSG_STATE_TERMINATE : CSG_STATE_SUSPEND,
+						CSG_STATE_MASK);
+		}
+
+		/* Update priorities on already running groups. */
+		list_for_each_entry(group, &ctx->groups[prio], run_node) {
+			struct panthor_fw_csg_iface *csg_iface;
+			int csg_id = group->csg_id;
+
+			if (csg_id < 0) {
+				new_csg_prio--;
+				continue;
+			}
+
+			csg_slot = &sched->csg_slots[csg_id];
+			csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
+			if (csg_slot->priority == new_csg_prio) {
+				new_csg_prio--;
+				continue;
+			}
+
+			panthor_fw_update_reqs(csg_iface, endpoint_req,
+					       CSG_EP_REQ_PRIORITY(new_csg_prio),
+					       CSG_EP_REQ_PRIORITY_MASK);
+			csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, csg_id,
+						csg_iface->output->ack ^ CSG_ENDPOINT_CONFIG,
+						CSG_ENDPOINT_CONFIG);
+			new_csg_prio--;
+		}
+	}
+
+	ret = csgs_upd_ctx_apply_locked(ptdev, &upd_ctx);
+	if (ret) {
+		panthor_device_schedule_reset(ptdev);
+		ctx->csg_upd_failed_mask |= upd_ctx.timedout_mask;
+		return;
+	}
+
+	/* Unbind evicted groups. */
+	for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) {
+		list_for_each_entry(group, &ctx->old_groups[prio], run_node) {
+			/* This group is gone. Process interrupts to clear
+			 * any pending interrupts before we start the new
+			 * group.
+			 */
+			if (group->csg_id >= 0)
+				sched_process_csg_irq_locked(ptdev, group->csg_id);
+
+			group_unbind_locked(group);
+		}
+	}
+
+	for (i = 0; i < sched->csg_slot_count; i++) {
+		if (!sched->csg_slots[i].group)
+			free_csg_slots |= BIT(i);
+	}
+
+	csgs_upd_ctx_init(&upd_ctx);
+	new_csg_prio = MAX_CSG_PRIO;
+
+	/* Start new groups. */
+	for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) {
+		list_for_each_entry(group, &ctx->groups[prio], run_node) {
+			int csg_id = group->csg_id;
+			struct panthor_fw_csg_iface *csg_iface;
+
+			if (csg_id >= 0) {
+				new_csg_prio--;
+				continue;
+			}
+
+			csg_id = ffs(free_csg_slots) - 1;
+			if (drm_WARN_ON(&ptdev->base, csg_id < 0))
+				break;
+
+			csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
+			csg_slot = &sched->csg_slots[csg_id];
+			csg_mod_mask |= BIT(csg_id);
+			group_bind_locked(group, csg_id);
+			csg_slot_prog_locked(ptdev, csg_id, new_csg_prio--);
+			csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, csg_id,
+						group->state == PANTHOR_CS_GROUP_SUSPENDED ?
+						CSG_STATE_RESUME : CSG_STATE_START,
+						CSG_STATE_MASK);
+			csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, csg_id,
+						csg_iface->output->ack ^ CSG_ENDPOINT_CONFIG,
+						CSG_ENDPOINT_CONFIG);
+			free_csg_slots &= ~BIT(csg_id);
+		}
+	}
+
+	ret = csgs_upd_ctx_apply_locked(ptdev, &upd_ctx);
+	if (ret) {
+		panthor_device_schedule_reset(ptdev);
+		ctx->csg_upd_failed_mask |= upd_ctx.timedout_mask;
+		return;
+	}
+
+	for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) {
+		list_for_each_entry_safe(group, tmp, &ctx->groups[prio], run_node) {
+			list_del_init(&group->run_node);
+
+			/* If the group has been destroyed while we were
+			 * scheduling, ask for an immediate tick to
+			 * re-evaluate as soon as possible and get rid of
+			 * this dangling group.
+			 */
+			if (group->destroyed)
+				ctx->immediate_tick = true;
+			group_put(group);
+		}
+
+		/* Return evicted groups to the idle or run queues. Groups
+		 * that can no longer be run (because they've been destroyed
+		 * or experienced an unrecoverable error) will be scheduled
+		 * for destruction in tick_ctx_cleanup().
+		 */
+		list_for_each_entry_safe(group, tmp, &ctx->old_groups[prio], run_node) {
+			if (!group_can_run(group))
+				continue;
+
+			if (group_is_idle(group))
+				list_move_tail(&group->run_node, &sched->groups.idle[prio]);
+			else
+				list_move_tail(&group->run_node, &sched->groups.runnable[prio]);
+			group_put(group);
+		}
+	}
+
+	sched->used_csg_slot_count = ctx->group_count;
+	sched->might_have_idle_groups = ctx->idle_group_count > 0;
+}
+
+static u64
+tick_ctx_update_resched_target(struct panthor_scheduler *sched,
+			       const struct panthor_sched_tick_ctx *ctx)
+{
+	/* We had space left, no need to reschedule until some external event happens. */
+	if (!tick_ctx_is_full(sched, ctx))
+		goto no_tick;
+
+	/* If idle groups were scheduled, no need to wake up until some external
+	 * event happens (group unblocked, new job submitted, ...).
+	 */
+	if (ctx->idle_group_count)
+		goto no_tick;
+
+	if (drm_WARN_ON(&sched->ptdev->base, ctx->min_priority >= PANTHOR_CSG_PRIORITY_COUNT))
+		goto no_tick;
+
+	/* If there are groups of the same priority waiting, we need to
+	 * keep the scheduler ticking, otherwise, we'll just wait for
+	 * new groups with higher priority to be queued.
+	 */
+	if (!list_empty(&sched->groups.runnable[ctx->min_priority])) {
+		u64 resched_target = sched->last_tick + sched->tick_period;
+
+		if (time_before64(sched->resched_target, sched->last_tick) ||
+		    time_before64(resched_target, sched->resched_target))
+			sched->resched_target = resched_target;
+
+		return sched->resched_target - sched->last_tick;
+	}
+
+no_tick:
+	sched->resched_target = U64_MAX;
+	return U64_MAX;
+}
+
+static void tick_work(struct work_struct *work)
+{
+	struct panthor_scheduler *sched = container_of(work, struct panthor_scheduler,
+						      tick_work.work);
+	struct panthor_device *ptdev = sched->ptdev;
+	struct panthor_sched_tick_ctx ctx;
+	u64 remaining_jiffies = 0, resched_delay;
+	u64 now = get_jiffies_64();
+	int prio, ret, cookie;
+
+	if (!drm_dev_enter(&ptdev->base, &cookie))
+		return;
+
+	ret = pm_runtime_resume_and_get(ptdev->base.dev);
+	if (drm_WARN_ON(&ptdev->base, ret))
+		goto out_dev_exit;
+
+	if (time_before64(now, sched->resched_target))
+		remaining_jiffies = sched->resched_target - now;
+
+	mutex_lock(&sched->lock);
+	if (panthor_device_reset_is_pending(sched->ptdev))
+		goto out_unlock;
+
+	tick_ctx_init(sched, &ctx, remaining_jiffies != 0);
+	if (ctx.csg_upd_failed_mask)
+		goto out_cleanup_ctx;
+
+	if (remaining_jiffies) {
+		/* Scheduling forced in the middle of a tick. Only RT groups
+		 * can preempt non-RT ones. Currently running RT groups can't be
+		 * preempted.
+		 */
+		for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1;
+		     prio >= 0 && !tick_ctx_is_full(sched, &ctx);
+		     prio--) {
+			tick_ctx_pick_groups_from_list(sched, &ctx, &ctx.old_groups[prio],
+						       true, true);
+			if (prio == PANTHOR_CSG_PRIORITY_RT) {
+				tick_ctx_pick_groups_from_list(sched, &ctx,
+							       &sched->groups.runnable[prio],
+							       true, false);
+			}
+		}
+	}
+
+	/* First pick non-idle groups */
+	for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1;
+	     prio >= 0 && !tick_ctx_is_full(sched, &ctx);
+	     prio--) {
+		tick_ctx_pick_groups_from_list(sched, &ctx, &sched->groups.runnable[prio],
+					       true, false);
+		tick_ctx_pick_groups_from_list(sched, &ctx, &ctx.old_groups[prio], true, true);
+	}
+
+	/* If we have free CSG slots left, pick idle groups */
+	for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1;
+	     prio >= 0 && !tick_ctx_is_full(sched, &ctx);
+	     prio--) {
+		/* Check the old_group queue first to avoid reprogramming the slots */
+		tick_ctx_pick_groups_from_list(sched, &ctx, &ctx.old_groups[prio], false, true);
+		tick_ctx_pick_groups_from_list(sched, &ctx, &sched->groups.idle[prio],
+					       false, false);
+	}
+
+	tick_ctx_apply(sched, &ctx);
+	if (ctx.csg_upd_failed_mask)
+		goto out_cleanup_ctx;
+
+	if (ctx.idle_group_count == ctx.group_count) {
+		panthor_devfreq_record_idle(sched->ptdev);
+		if (sched->pm.has_ref) {
+			pm_runtime_put_autosuspend(ptdev->base.dev);
+			sched->pm.has_ref = false;
+		}
+	} else {
+		panthor_devfreq_record_busy(sched->ptdev);
+		if (!sched->pm.has_ref) {
+			pm_runtime_get(ptdev->base.dev);
+			sched->pm.has_ref = true;
+		}
+	}
+
+	sched->last_tick = now;
+	resched_delay = tick_ctx_update_resched_target(sched, &ctx);
+	if (ctx.immediate_tick)
+		resched_delay = 0;
+
+	if (resched_delay != U64_MAX)
+		sched_queue_delayed_work(sched, tick, resched_delay);
+
+out_cleanup_ctx:
+	tick_ctx_cleanup(sched, &ctx);
+
+out_unlock:
+	mutex_unlock(&sched->lock);
+	pm_runtime_mark_last_busy(ptdev->base.dev);
+	pm_runtime_put_autosuspend(ptdev->base.dev);
+
+out_dev_exit:
+	drm_dev_exit(cookie);
+}
+
+static int panthor_queue_eval_syncwait(struct panthor_group *group, u8 queue_idx)
+{
+	struct panthor_queue *queue = group->queues[queue_idx];
+	union {
+		struct panthor_syncobj_64b sync64;
+		struct panthor_syncobj_32b sync32;
+	} *syncobj;
+	bool result;
+	u64 value;
+
+	syncobj = panthor_queue_get_syncwait_obj(group, queue);
+	if (!syncobj)
+		return -EINVAL;
+
+	value = queue->syncwait.sync64 ?
+		syncobj->sync64.seqno :
+		syncobj->sync32.seqno;
+
+	if (queue->syncwait.gt)
+		result = value > queue->syncwait.ref;
+	else
+		result = value <= queue->syncwait.ref;
+
+	if (result)
+		panthor_queue_put_syncwait_obj(queue);
+
+	return result;
+}
+
+static void sync_upd_work(struct work_struct *work)
+{
+	struct panthor_scheduler *sched = container_of(work,
+						      struct panthor_scheduler,
+						      sync_upd_work);
+	struct panthor_group *group, *tmp;
+	bool immediate_tick = false;
+
+	mutex_lock(&sched->lock);
+	list_for_each_entry_safe(group, tmp, &sched->groups.waiting, wait_node) {
+		u32 tested_queues = group->blocked_queues;
+		u32 unblocked_queues = 0;
+
+		while (tested_queues) {
+			u32 cs_id = ffs(tested_queues) - 1;
+			int ret;
+
+			ret = panthor_queue_eval_syncwait(group, cs_id);
+			drm_WARN_ON(&group->ptdev->base, ret < 0);
+			if (ret)
+				unblocked_queues |= BIT(cs_id);
+
+			tested_queues &= ~BIT(cs_id);
+		}
+
+		if (unblocked_queues) {
+			group->blocked_queues &= ~unblocked_queues;
+
+			if (group->csg_id < 0) {
+				list_move(&group->run_node,
+					  &sched->groups.runnable[group->priority]);
+				if (group->priority == PANTHOR_CSG_PRIORITY_RT)
+					immediate_tick = true;
+			}
+		}
+
+		if (!group->blocked_queues)
+			list_del_init(&group->wait_node);
+	}
+	mutex_unlock(&sched->lock);
+
+	if (immediate_tick)
+		sched_queue_delayed_work(sched, tick, 0);
+}
+
+static void group_schedule_locked(struct panthor_group *group, u32 queue_mask)
+{
+	struct panthor_device *ptdev = group->ptdev;
+	struct panthor_scheduler *sched = ptdev->scheduler;
+	struct list_head *queue = &sched->groups.runnable[group->priority];
+	u64 delay_jiffies = 0;
+	bool was_idle;
+	u64 now;
+
+	if (!group_can_run(group))
+		return;
+
+	/* All updated queues are blocked, no need to wake up the scheduler. */
+	if ((queue_mask & group->blocked_queues) == queue_mask)
+		return;
+
+	was_idle = group_is_idle(group);
+	group->idle_queues &= ~queue_mask;
+
+	/* Don't mess up with the lists if we're in a middle of a reset. */
+	if (atomic_read(&sched->reset.in_progress))
+		return;
+
+	if (was_idle && !group_is_idle(group))
+		list_move_tail(&group->run_node, queue);
+
+	/* RT groups are preemptive. */
+	if (group->priority == PANTHOR_CSG_PRIORITY_RT) {
+		sched_queue_delayed_work(sched, tick, 0);
+		return;
+	}
+
+	/* Some groups might be idle, force an immediate tick to
+	 * re-evaluate.
+	 */
+	if (sched->might_have_idle_groups) {
+		sched_queue_delayed_work(sched, tick, 0);
+		return;
+	}
+
+	/* Scheduler is ticking, nothing to do. */
+	if (sched->resched_target != U64_MAX) {
+		/* If there are free slots, force immediating ticking. */
+		if (sched->used_csg_slot_count < sched->csg_slot_count)
+			sched_queue_delayed_work(sched, tick, 0);
+
+		return;
+	}
+
+	/* Scheduler tick was off, recalculate the resched_target based on the
+	 * last tick event, and queue the scheduler work.
+	 */
+	now = get_jiffies_64();
+	sched->resched_target = sched->last_tick + sched->tick_period;
+	if (sched->used_csg_slot_count == sched->csg_slot_count &&
+	    time_before64(now, sched->resched_target))
+		delay_jiffies = min_t(unsigned long, sched->resched_target - now, ULONG_MAX);
+
+	sched_queue_delayed_work(sched, tick, delay_jiffies);
+}
+
+static void queue_stop(struct panthor_queue *queue,
+		       struct panthor_job *bad_job)
+{
+	drm_sched_stop(&queue->scheduler, bad_job ? &bad_job->base : NULL);
+}
+
+static void queue_start(struct panthor_queue *queue)
+{
+	struct panthor_job *job;
+
+	/* Re-assign the parent fences. */
+	list_for_each_entry(job, &queue->scheduler.pending_list, base.list)
+		job->base.s_fence->parent = dma_fence_get(job->done_fence);
+
+	drm_sched_start(&queue->scheduler, true);
+}
+
+static void panthor_group_stop(struct panthor_group *group)
+{
+	struct panthor_scheduler *sched = group->ptdev->scheduler;
+
+	lockdep_assert_held(&sched->reset.lock);
+
+	for (u32 i = 0; i < group->queue_count; i++)
+		queue_stop(group->queues[i], NULL);
+
+	group_get(group);
+	list_move_tail(&group->run_node, &sched->reset.stopped_groups);
+}
+
+static void panthor_group_start(struct panthor_group *group)
+{
+	struct panthor_scheduler *sched = group->ptdev->scheduler;
+
+	lockdep_assert_held(&group->ptdev->scheduler->reset.lock);
+
+	for (u32 i = 0; i < group->queue_count; i++)
+		queue_start(group->queues[i]);
+
+	if (group_can_run(group)) {
+		list_move_tail(&group->run_node,
+			       group_is_idle(group) ?
+			       &sched->groups.idle[group->priority] :
+			       &sched->groups.runnable[group->priority]);
+	} else {
+		list_del_init(&group->run_node);
+		list_del_init(&group->wait_node);
+		group_queue_work(group, term);
+	}
+
+	group_put(group);
+}
+
+static void panthor_sched_immediate_tick(struct panthor_device *ptdev)
+{
+	struct panthor_scheduler *sched = ptdev->scheduler;
+
+	sched_queue_delayed_work(sched, tick, 0);
+}
+
+/**
+ * panthor_sched_report_mmu_fault() - Report MMU faults to the scheduler.
+ */
+void panthor_sched_report_mmu_fault(struct panthor_device *ptdev)
+{
+	/* Force a tick to immediately kill faulty groups. */
+	if (ptdev->scheduler)
+		panthor_sched_immediate_tick(ptdev);
+}
+
+void panthor_sched_resume(struct panthor_device *ptdev)
+{
+	/* Force a tick to re-evaluate after a resume. */
+	panthor_sched_immediate_tick(ptdev);
+}
+
+void panthor_sched_suspend(struct panthor_device *ptdev)
+{
+	struct panthor_scheduler *sched = ptdev->scheduler;
+	struct panthor_csg_slots_upd_ctx upd_ctx;
+	u64 suspended_slots, faulty_slots;
+	struct panthor_group *group;
+	u32 i;
+
+	mutex_lock(&sched->lock);
+	csgs_upd_ctx_init(&upd_ctx);
+	for (i = 0; i < sched->csg_slot_count; i++) {
+		struct panthor_csg_slot *csg_slot = &sched->csg_slots[i];
+
+		if (csg_slot->group) {
+			csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, i,
+						CSG_STATE_SUSPEND,
+						CSG_STATE_MASK);
+		}
+	}
+
+	suspended_slots = upd_ctx.update_mask;
+
+	csgs_upd_ctx_apply_locked(ptdev, &upd_ctx);
+	suspended_slots &= ~upd_ctx.timedout_mask;
+	faulty_slots = upd_ctx.timedout_mask;
+
+	if (faulty_slots) {
+		u32 slot_mask = faulty_slots;
+
+		drm_err(&ptdev->base, "CSG suspend failed, escalating to termination");
+		csgs_upd_ctx_init(&upd_ctx);
+		while (slot_mask) {
+			u32 csg_id = ffs(slot_mask) - 1;
+
+			csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, csg_id,
+						CSG_STATE_TERMINATE,
+						CSG_STATE_MASK);
+			slot_mask &= ~BIT(csg_id);
+		}
+
+		csgs_upd_ctx_apply_locked(ptdev, &upd_ctx);
+
+		slot_mask = upd_ctx.timedout_mask;
+		while (slot_mask) {
+			u32 csg_id = ffs(slot_mask) - 1;
+			struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
+
+			/* Terminate command timedout, but the soft-reset will
+			 * automatically terminate all active groups, so let's
+			 * force the state to halted here.
+			 */
+			if (csg_slot->group->state != PANTHOR_CS_GROUP_TERMINATED)
+				csg_slot->group->state = PANTHOR_CS_GROUP_TERMINATED;
+			slot_mask &= ~BIT(csg_id);
+		}
+	}
+
+	/* Flush L2 and LSC caches to make sure suspend state is up-to-date.
+	 * If the flush fails, flag all queues for termination.
+	 */
+	if (suspended_slots) {
+		bool flush_caches_failed = false;
+		u32 slot_mask = suspended_slots;
+
+		if (panthor_gpu_flush_caches(ptdev, CACHE_CLEAN, CACHE_CLEAN, 0))
+			flush_caches_failed = true;
+
+		while (slot_mask) {
+			u32 csg_id = ffs(slot_mask) - 1;
+			struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
+
+			if (flush_caches_failed)
+				csg_slot->group->state = PANTHOR_CS_GROUP_TERMINATED;
+			else
+				csg_slot_sync_update_locked(ptdev, csg_id);
+
+			slot_mask &= ~BIT(csg_id);
+		}
+
+		if (flush_caches_failed)
+			faulty_slots |= suspended_slots;
+	}
+
+	for (i = 0; i < sched->csg_slot_count; i++) {
+		struct panthor_csg_slot *csg_slot = &sched->csg_slots[i];
+
+		group = csg_slot->group;
+		if (!group)
+			continue;
+
+		group_get(group);
+
+		if (group->csg_id >= 0)
+			sched_process_csg_irq_locked(ptdev, group->csg_id);
+
+		group_unbind_locked(group);
+
+		drm_WARN_ON(&group->ptdev->base, !list_empty(&group->run_node));
+
+		if (group_can_run(group)) {
+			list_add(&group->run_node,
+				 &sched->groups.idle[group->priority]);
+		} else {
+			/* We don't bother stopping the scheduler if the group is
+			 * faulty, the group termination work will finish the job.
+			 */
+			list_del_init(&group->wait_node);
+			group_queue_work(group, term);
+		}
+		group_put(group);
+	}
+	mutex_unlock(&sched->lock);
+}
+
+void panthor_sched_pre_reset(struct panthor_device *ptdev)
+{
+	struct panthor_scheduler *sched = ptdev->scheduler;
+	struct panthor_group *group, *group_tmp;
+	u32 i;
+
+	mutex_lock(&sched->reset.lock);
+	atomic_set(&sched->reset.in_progress, true);
+
+	/* Cancel all scheduler works. Once this is done, these works can't be
+	 * scheduled again until the reset operation is complete.
+	 */
+	cancel_work_sync(&sched->sync_upd_work);
+	cancel_delayed_work_sync(&sched->tick_work);
+
+	panthor_sched_suspend(ptdev);
+
+	/* Stop all groups that might still accept jobs, so we don't get passed
+	 * new jobs while we're resetting.
+	 */
+	for (i = 0; i < ARRAY_SIZE(sched->groups.runnable); i++) {
+		/* All groups should be in the idle lists. */
+		drm_WARN_ON(&ptdev->base, !list_empty(&sched->groups.runnable[i]));
+		list_for_each_entry_safe(group, group_tmp, &sched->groups.runnable[i], run_node)
+			panthor_group_stop(group);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(sched->groups.idle); i++) {
+		list_for_each_entry_safe(group, group_tmp, &sched->groups.idle[i], run_node)
+			panthor_group_stop(group);
+	}
+
+	mutex_unlock(&sched->reset.lock);
+}
+
+void panthor_sched_post_reset(struct panthor_device *ptdev)
+{
+	struct panthor_scheduler *sched = ptdev->scheduler;
+	struct panthor_group *group, *group_tmp;
+
+	mutex_lock(&sched->reset.lock);
+
+	list_for_each_entry_safe(group, group_tmp, &sched->reset.stopped_groups, run_node)
+		panthor_group_start(group);
+
+	/* We're done resetting the GPU, clear the reset.in_progress bit so we can
+	 * kick the scheduler.
+	 */
+	atomic_set(&sched->reset.in_progress, false);
+	mutex_unlock(&sched->reset.lock);
+
+	sched_queue_delayed_work(sched, tick, 0);
+
+	sched_queue_work(sched, sync_upd);
+}
+
+static void group_sync_upd_work(struct work_struct *work)
+{
+	struct panthor_group *group =
+		container_of(work, struct panthor_group, sync_upd_work);
+	struct panthor_job *job, *job_tmp;
+	LIST_HEAD(done_jobs);
+	u32 queue_idx;
+	bool cookie;
+
+	cookie = dma_fence_begin_signalling();
+	for (queue_idx = 0; queue_idx < group->queue_count; queue_idx++) {
+		struct panthor_queue *queue = group->queues[queue_idx];
+		struct panthor_syncobj_64b *syncobj;
+
+		if (!queue)
+			continue;
+
+		syncobj = group->syncobjs->kmap + (queue_idx * sizeof(*syncobj));
+
+		spin_lock(&queue->fence_ctx.lock);
+		list_for_each_entry_safe(job, job_tmp, &queue->fence_ctx.in_flight_jobs, node) {
+			if (!job->call_info.size)
+				continue;
+
+			if (syncobj->seqno < job->done_fence->seqno)
+				break;
+
+			list_move_tail(&job->node, &done_jobs);
+			dma_fence_signal_locked(job->done_fence);
+		}
+		spin_unlock(&queue->fence_ctx.lock);
+	}
+	dma_fence_end_signalling(cookie);
+
+	list_for_each_entry_safe(job, job_tmp, &done_jobs, node) {
+		list_del_init(&job->node);
+		panthor_job_put(&job->base);
+	}
+
+	group_put(group);
+}
+
+static struct dma_fence *
+queue_run_job(struct drm_sched_job *sched_job)
+{
+	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
+	struct panthor_group *group = job->group;
+	struct panthor_queue *queue = group->queues[job->queue_idx];
+	struct panthor_device *ptdev = group->ptdev;
+	struct panthor_scheduler *sched = ptdev->scheduler;
+	u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
+	u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1);
+	u64 addr_reg = ptdev->csif_info.cs_reg_count -
+		       ptdev->csif_info.unpreserved_cs_reg_count;
+	u64 val_reg = addr_reg + 2;
+	u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
+			job->queue_idx * sizeof(struct panthor_syncobj_64b);
+	u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
+	struct dma_fence *done_fence;
+	int ret;
+
+	u64 call_instrs[NUM_INSTRS_PER_SLOT] = {
+		/* MOV32 rX+2, cs.latest_flush */
+		(2ull << 56) | (val_reg << 48) | job->call_info.latest_flush,
+
+		/* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
+		(36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233,
+
+		/* MOV48 rX:rX+1, cs.start */
+		(1ull << 56) | (addr_reg << 48) | job->call_info.start,
+
+		/* MOV32 rX+2, cs.size */
+		(2ull << 56) | (val_reg << 48) | job->call_info.size,
+
+		/* WAIT(0) => waits for FLUSH_CACHE2 instruction */
+		(3ull << 56) | (1 << 16),
+
+		/* CALL rX:rX+1, rX+2 */
+		(32ull << 56) | (addr_reg << 40) | (val_reg << 32),
+
+		/* MOV48 rX:rX+1, sync_addr */
+		(1ull << 56) | (addr_reg << 48) | sync_addr,
+
+		/* MOV48 rX+2, #1 */
+		(1ull << 56) | (val_reg << 48) | 1,
+
+		/* WAIT(all) */
+		(3ull << 56) | (waitall_mask << 16),
+
+		/* SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/
+		(51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1,
+
+		/* ERROR_BARRIER, so we can recover from faults at job
+		 * boundaries.
+		 */
+		(47ull << 56),
+	};
+
+	/* Need to be cacheline aligned to please the prefetcher. */
+	static_assert(sizeof(call_instrs) % 64 == 0,
+		      "call_instrs is not aligned on a cacheline");
+
+	/* Stream size is zero, nothing to do => return a NULL fence and let
+	 * drm_sched signal the parent.
+	 */
+	if (!job->call_info.size)
+		return NULL;
+
+	ret = pm_runtime_resume_and_get(ptdev->base.dev);
+	if (drm_WARN_ON(&ptdev->base, ret))
+		return ERR_PTR(ret);
+
+	mutex_lock(&sched->lock);
+	if (!group_can_run(group)) {
+		done_fence = ERR_PTR(-ECANCELED);
+		goto out_unlock;
+	}
+
+	dma_fence_init(job->done_fence,
+		       &panthor_queue_fence_ops,
+		       &queue->fence_ctx.lock,
+		       queue->fence_ctx.id,
+		       atomic64_inc_return(&queue->fence_ctx.seqno));
+
+	memcpy(queue->ringbuf->kmap + ringbuf_insert,
+	       call_instrs, sizeof(call_instrs));
+
+	panthor_job_get(&job->base);
+	spin_lock(&queue->fence_ctx.lock);
+	list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs);
+	spin_unlock(&queue->fence_ctx.lock);
+
+	job->ringbuf.start = queue->iface.input->insert;
+	job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs);
+
+	/* Make sure the ring buffer is updated before the INSERT
+	 * register.
+	 */
+	wmb();
+
+	queue->iface.input->extract = queue->iface.output->extract;
+	queue->iface.input->insert = job->ringbuf.end;
+
+	if (group->csg_id < 0) {
+		/* If the queue is blocked, we want to keep the timeout running, so we
+		 * can detect unbounded waits and kill the group when that happens.
+		 * Otherwise, we suspend the timeout so the time we spend waiting for
+		 * a CSG slot is not counted.
+		 */
+		if (!(group->blocked_queues & BIT(job->queue_idx)) &&
+		    !queue->timeout_suspended) {
+			queue->remaining_time = drm_sched_suspend_timeout(&queue->scheduler);
+			queue->timeout_suspended = true;
+		}
+
+		group_schedule_locked(group, BIT(job->queue_idx));
+	} else {
+		gpu_write(ptdev, CSF_DOORBELL(queue->doorbell_id), 1);
+		if (!sched->pm.has_ref &&
+		    !(group->blocked_queues & BIT(job->queue_idx))) {
+			pm_runtime_get(ptdev->base.dev);
+			sched->pm.has_ref = true;
+		}
+	}
+
+	done_fence = dma_fence_get(job->done_fence);
+
+out_unlock:
+	mutex_unlock(&sched->lock);
+	pm_runtime_mark_last_busy(ptdev->base.dev);
+	pm_runtime_put_autosuspend(ptdev->base.dev);
+
+	return done_fence;
+}
+
+static enum drm_gpu_sched_stat
+queue_timedout_job(struct drm_sched_job *sched_job)
+{
+	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
+	struct panthor_group *group = job->group;
+	struct panthor_device *ptdev = group->ptdev;
+	struct panthor_scheduler *sched = ptdev->scheduler;
+	struct panthor_queue *queue = group->queues[job->queue_idx];
+
+	drm_warn(&ptdev->base, "job timeout\n");
+
+	drm_WARN_ON(&ptdev->base, atomic_read(&sched->reset.in_progress));
+
+	queue_stop(queue, job);
+
+	mutex_lock(&sched->lock);
+	group->timedout = true;
+	if (group->csg_id >= 0) {
+		sched_queue_delayed_work(ptdev->scheduler, tick, 0);
+	} else {
+		/* Remove from the run queues, so the scheduler can't
+		 * pick the group on the next tick.
+		 */
+		list_del_init(&group->run_node);
+		list_del_init(&group->wait_node);
+
+		group_queue_work(group, term);
+	}
+	mutex_unlock(&sched->lock);
+
+	queue_start(queue);
+
+	return DRM_GPU_SCHED_STAT_NOMINAL;
+}
+
+static void queue_free_job(struct drm_sched_job *sched_job)
+{
+	drm_sched_job_cleanup(sched_job);
+	panthor_job_put(sched_job);
+}
+
+static const struct drm_sched_backend_ops panthor_queue_sched_ops = {
+	.run_job = queue_run_job,
+	.timedout_job = queue_timedout_job,
+	.free_job = queue_free_job,
+};
+
+static struct panthor_queue *
+group_create_queue(struct panthor_group *group,
+		   const struct drm_panthor_queue_create *args)
+{
+	struct drm_gpu_scheduler *drm_sched;
+	struct panthor_queue *queue;
+	int ret;
+
+	if (args->pad[0] || args->pad[1] || args->pad[2])
+		return ERR_PTR(-EINVAL);
+
+	if (args->ringbuf_size < SZ_4K || args->ringbuf_size > SZ_64K ||
+	    !is_power_of_2(args->ringbuf_size))
+		return ERR_PTR(-EINVAL);
+
+	if (args->priority > CSF_MAX_QUEUE_PRIO)
+		return ERR_PTR(-EINVAL);
+
+	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+	if (!queue)
+		return ERR_PTR(-ENOMEM);
+
+	queue->fence_ctx.id = dma_fence_context_alloc(1);
+	spin_lock_init(&queue->fence_ctx.lock);
+	INIT_LIST_HEAD(&queue->fence_ctx.in_flight_jobs);
+
+	queue->priority = args->priority;
+
+	queue->ringbuf = panthor_kernel_bo_create(group->ptdev, group->vm,
+						  args->ringbuf_size,
+						  DRM_PANTHOR_BO_NO_MMAP,
+						  DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
+						  DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
+						  PANTHOR_VM_KERNEL_AUTO_VA);
+	if (IS_ERR(queue->ringbuf)) {
+		ret = PTR_ERR(queue->ringbuf);
+		goto err_free_queue;
+	}
+
+	ret = panthor_kernel_bo_vmap(queue->ringbuf);
+	if (ret)
+		goto err_free_queue;
+
+	queue->iface.mem = panthor_fw_alloc_queue_iface_mem(group->ptdev,
+							    &queue->iface.input,
+							    &queue->iface.output,
+							    &queue->iface.input_fw_va,
+							    &queue->iface.output_fw_va);
+	if (IS_ERR(queue->iface.mem)) {
+		ret = PTR_ERR(queue->iface.mem);
+		goto err_free_queue;
+	}
+
+	ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops,
+			     group->ptdev->scheduler->wq, 1,
+			     args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)),
+			     0, msecs_to_jiffies(JOB_TIMEOUT_MS),
+			     group->ptdev->reset.wq,
+			     NULL, "panthor-queue", group->ptdev->base.dev);
+	if (ret)
+		goto err_free_queue;
+
+	drm_sched = &queue->scheduler;
+	ret = drm_sched_entity_init(&queue->entity, 0, &drm_sched, 1, NULL);
+
+	return queue;
+
+err_free_queue:
+	group_free_queue(group, queue);
+	return ERR_PTR(ret);
+}
+
+#define MAX_GROUPS_PER_POOL		128
+
+int panthor_group_create(struct panthor_file *pfile,
+			 const struct drm_panthor_group_create *group_args,
+			 const struct drm_panthor_queue_create *queue_args)
+{
+	struct panthor_device *ptdev = pfile->ptdev;
+	struct panthor_group_pool *gpool = pfile->groups;
+	struct panthor_scheduler *sched = ptdev->scheduler;
+	struct panthor_fw_csg_iface *csg_iface = panthor_fw_get_csg_iface(ptdev, 0);
+	struct panthor_group *group = NULL;
+	u32 gid, i, suspend_size;
+	int ret;
+
+	if (group_args->pad)
+		return -EINVAL;
+
+	if (group_args->priority > PANTHOR_CSG_PRIORITY_HIGH)
+		return -EINVAL;
+
+	if ((group_args->compute_core_mask & ~ptdev->gpu_info.shader_present) ||
+	    (group_args->fragment_core_mask & ~ptdev->gpu_info.shader_present) ||
+	    (group_args->tiler_core_mask & ~ptdev->gpu_info.tiler_present))
+		return -EINVAL;
+
+	if (hweight64(group_args->compute_core_mask) < group_args->max_compute_cores ||
+	    hweight64(group_args->fragment_core_mask) < group_args->max_fragment_cores ||
+	    hweight64(group_args->tiler_core_mask) < group_args->max_tiler_cores)
+		return -EINVAL;
+
+	group = kzalloc(sizeof(*group), GFP_KERNEL);
+	if (!group)
+		return -ENOMEM;
+
+	spin_lock_init(&group->fatal_lock);
+	kref_init(&group->refcount);
+	group->state = PANTHOR_CS_GROUP_CREATED;
+	group->csg_id = -1;
+
+	group->ptdev = ptdev;
+	group->max_compute_cores = group_args->max_compute_cores;
+	group->compute_core_mask = group_args->compute_core_mask;
+	group->max_fragment_cores = group_args->max_fragment_cores;
+	group->fragment_core_mask = group_args->fragment_core_mask;
+	group->max_tiler_cores = group_args->max_tiler_cores;
+	group->tiler_core_mask = group_args->tiler_core_mask;
+	group->priority = group_args->priority;
+
+	INIT_LIST_HEAD(&group->wait_node);
+	INIT_LIST_HEAD(&group->run_node);
+	INIT_WORK(&group->term_work, group_term_work);
+	INIT_WORK(&group->sync_upd_work, group_sync_upd_work);
+	INIT_WORK(&group->tiler_oom_work, group_tiler_oom_work);
+	INIT_WORK(&group->release_work, group_release_work);
+
+	group->vm = panthor_vm_pool_get_vm(pfile->vms, group_args->vm_id);
+	if (!group->vm) {
+		ret = -EINVAL;
+		goto err_put_group;
+	}
+
+	suspend_size = csg_iface->control->suspend_size;
+	group->suspend_buf = panthor_fw_alloc_suspend_buf_mem(ptdev, suspend_size);
+	if (IS_ERR(group->suspend_buf)) {
+		ret = PTR_ERR(group->suspend_buf);
+		group->suspend_buf = NULL;
+		goto err_put_group;
+	}
+
+	suspend_size = csg_iface->control->protm_suspend_size;
+	group->protm_suspend_buf = panthor_fw_alloc_suspend_buf_mem(ptdev, suspend_size);
+	if (IS_ERR(group->protm_suspend_buf)) {
+		ret = PTR_ERR(group->protm_suspend_buf);
+		group->protm_suspend_buf = NULL;
+		goto err_put_group;
+	}
+
+	group->syncobjs = panthor_kernel_bo_create(ptdev, group->vm,
+						   group_args->queues.count *
+						   sizeof(struct panthor_syncobj_64b),
+						   DRM_PANTHOR_BO_NO_MMAP,
+						   DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
+						   DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
+						   PANTHOR_VM_KERNEL_AUTO_VA);
+	if (IS_ERR(group->syncobjs)) {
+		ret = PTR_ERR(group->syncobjs);
+		goto err_put_group;
+	}
+
+	ret = panthor_kernel_bo_vmap(group->syncobjs);
+	if (ret)
+		goto err_put_group;
+
+	memset(group->syncobjs->kmap, 0,
+	       group_args->queues.count * sizeof(struct panthor_syncobj_64b));
+
+	for (i = 0; i < group_args->queues.count; i++) {
+		group->queues[i] = group_create_queue(group, &queue_args[i]);
+		if (IS_ERR(group->queues[i])) {
+			ret = PTR_ERR(group->queues[i]);
+			group->queues[i] = NULL;
+			goto err_put_group;
+		}
+
+		group->queue_count++;
+	}
+
+	group->idle_queues = GENMASK(group->queue_count - 1, 0);
+
+	ret = xa_alloc(&gpool->xa, &gid, group, XA_LIMIT(1, MAX_GROUPS_PER_POOL), GFP_KERNEL);
+	if (ret)
+		goto err_put_group;
+
+	mutex_lock(&sched->reset.lock);
+	if (atomic_read(&sched->reset.in_progress)) {
+		panthor_group_stop(group);
+	} else {
+		mutex_lock(&sched->lock);
+		list_add_tail(&group->run_node,
+			      &sched->groups.idle[group->priority]);
+		mutex_unlock(&sched->lock);
+	}
+	mutex_unlock(&sched->reset.lock);
+
+	return gid;
+
+err_put_group:
+	group_put(group);
+	return ret;
+}
+
+int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle)
+{
+	struct panthor_group_pool *gpool = pfile->groups;
+	struct panthor_device *ptdev = pfile->ptdev;
+	struct panthor_scheduler *sched = ptdev->scheduler;
+	struct panthor_group *group;
+
+	group = xa_erase(&gpool->xa, group_handle);
+	if (!group)
+		return -EINVAL;
+
+	for (u32 i = 0; i < group->queue_count; i++) {
+		if (group->queues[i])
+			drm_sched_entity_destroy(&group->queues[i]->entity);
+	}
+
+	mutex_lock(&sched->reset.lock);
+	mutex_lock(&sched->lock);
+	group->destroyed = true;
+	if (group->csg_id >= 0) {
+		sched_queue_delayed_work(sched, tick, 0);
+	} else if (!atomic_read(&sched->reset.in_progress)) {
+		/* Remove from the run queues, so the scheduler can't
+		 * pick the group on the next tick.
+		 */
+		list_del_init(&group->run_node);
+		list_del_init(&group->wait_node);
+		group_queue_work(group, term);
+	}
+	mutex_unlock(&sched->lock);
+	mutex_unlock(&sched->reset.lock);
+
+	group_put(group);
+	return 0;
+}
+
+int panthor_group_get_state(struct panthor_file *pfile,
+			    struct drm_panthor_group_get_state *get_state)
+{
+	struct panthor_group_pool *gpool = pfile->groups;
+	struct panthor_device *ptdev = pfile->ptdev;
+	struct panthor_scheduler *sched = ptdev->scheduler;
+	struct panthor_group *group;
+
+	if (get_state->pad)
+		return -EINVAL;
+
+	group = group_get(xa_load(&gpool->xa, get_state->group_handle));
+	if (!group)
+		return -EINVAL;
+
+	memset(get_state, 0, sizeof(*get_state));
+
+	mutex_lock(&sched->lock);
+	if (group->timedout)
+		get_state->state |= DRM_PANTHOR_GROUP_STATE_TIMEDOUT;
+	if (group->fatal_queues) {
+		get_state->state |= DRM_PANTHOR_GROUP_STATE_FATAL_FAULT;
+		get_state->fatal_queues = group->fatal_queues;
+	}
+	mutex_unlock(&sched->lock);
+
+	group_put(group);
+	return 0;
+}
+
+int panthor_group_pool_create(struct panthor_file *pfile)
+{
+	struct panthor_group_pool *gpool;
+
+	gpool = kzalloc(sizeof(*gpool), GFP_KERNEL);
+	if (!gpool)
+		return -ENOMEM;
+
+	xa_init_flags(&gpool->xa, XA_FLAGS_ALLOC1);
+	pfile->groups = gpool;
+	return 0;
+}
+
+void panthor_group_pool_destroy(struct panthor_file *pfile)
+{
+	struct panthor_group_pool *gpool = pfile->groups;
+	struct panthor_group *group;
+	unsigned long i;
+
+	if (IS_ERR_OR_NULL(gpool))
+		return;
+
+	xa_for_each(&gpool->xa, i, group)
+		panthor_group_destroy(pfile, i);
+
+	xa_destroy(&gpool->xa);
+	kfree(gpool);
+	pfile->groups = NULL;
+}
+
+static void job_release(struct kref *ref)
+{
+	struct panthor_job *job = container_of(ref, struct panthor_job, refcount);
+
+	drm_WARN_ON(&job->group->ptdev->base, !list_empty(&job->node));
+
+	if (job->base.s_fence)
+		drm_sched_job_cleanup(&job->base);
+
+	if (job->done_fence && job->done_fence->ops)
+		dma_fence_put(job->done_fence);
+	else
+		dma_fence_free(job->done_fence);
+
+	group_put(job->group);
+
+	kfree(job);
+}
+
+struct drm_sched_job *panthor_job_get(struct drm_sched_job *sched_job)
+{
+	if (sched_job) {
+		struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
+
+		kref_get(&job->refcount);
+	}
+
+	return sched_job;
+}
+
+void panthor_job_put(struct drm_sched_job *sched_job)
+{
+	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
+
+	if (sched_job)
+		kref_put(&job->refcount, job_release);
+}
+
+struct panthor_vm *panthor_job_vm(struct drm_sched_job *sched_job)
+{
+	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
+
+	return job->group->vm;
+}
+
+struct drm_sched_job *
+panthor_job_create(struct panthor_file *pfile,
+		   u16 group_handle,
+		   const struct drm_panthor_queue_submit *qsubmit)
+{
+	struct panthor_group_pool *gpool = pfile->groups;
+	struct panthor_job *job;
+	int ret;
+
+	if (qsubmit->pad)
+		return ERR_PTR(-EINVAL);
+
+	/* If stream_addr is zero, so stream_size should be. */
+	if ((qsubmit->stream_size == 0) != (qsubmit->stream_addr == 0))
+		return ERR_PTR(-EINVAL);
+
+	/* Make sure the address is aligned on 64-byte (cacheline) and the size is
+	 * aligned on 8-byte (instruction size).
+	 */
+	if ((qsubmit->stream_addr & 63) || (qsubmit->stream_size & 7))
+		return ERR_PTR(-EINVAL);
+
+	/* bits 24:30 must be zero. */
+	if (qsubmit->latest_flush & GENMASK(30, 24))
+		return ERR_PTR(-EINVAL);
+
+	job = kzalloc(sizeof(*job), GFP_KERNEL);
+	if (!job)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&job->refcount);
+	job->queue_idx = qsubmit->queue_index;
+	job->call_info.size = qsubmit->stream_size;
+	job->call_info.start = qsubmit->stream_addr;
+	job->call_info.latest_flush = qsubmit->latest_flush;
+	INIT_LIST_HEAD(&job->node);
+
+	job->group = group_get(xa_load(&gpool->xa, group_handle));
+	if (!job->group) {
+		ret = -EINVAL;
+		goto err_put_job;
+	}
+
+	if (job->queue_idx >= job->group->queue_count ||
+	    !job->group->queues[job->queue_idx]) {
+		ret = -EINVAL;
+		goto err_put_job;
+	}
+
+	job->done_fence = kzalloc(sizeof(*job->done_fence), GFP_KERNEL);
+	if (!job->done_fence) {
+		ret = -ENOMEM;
+		goto err_put_job;
+	}
+
+	ret = drm_sched_job_init(&job->base,
+				 &job->group->queues[job->queue_idx]->entity,
+				 1, job->group);
+	if (ret)
+		goto err_put_job;
+
+	return &job->base;
+
+err_put_job:
+	panthor_job_put(&job->base);
+	return ERR_PTR(ret);
+}
+
+void panthor_job_update_resvs(struct drm_exec *exec, struct drm_sched_job *sched_job)
+{
+	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
+
+	/* Still not sure why we want USAGE_WRITE for external objects, since I
+	 * was assuming this would be handled through explicit syncs being imported
+	 * to external BOs with DMA_BUF_IOCTL_IMPORT_SYNC_FILE, but other drivers
+	 * seem to pass DMA_RESV_USAGE_WRITE, so there must be a good reason.
+	 */
+	panthor_vm_update_resvs(job->group->vm, exec, &sched_job->s_fence->finished,
+				DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_WRITE);
+}
+
+void panthor_sched_unplug(struct panthor_device *ptdev)
+{
+	struct panthor_scheduler *sched = ptdev->scheduler;
+
+	cancel_delayed_work_sync(&sched->tick_work);
+
+	mutex_lock(&sched->lock);
+	if (sched->pm.has_ref) {
+		pm_runtime_put(ptdev->base.dev);
+		sched->pm.has_ref = false;
+	}
+	mutex_unlock(&sched->lock);
+}
+
+static void panthor_sched_fini(struct drm_device *ddev, void *res)
+{
+	struct panthor_scheduler *sched = res;
+	int prio;
+
+	if (!sched || !sched->csg_slot_count)
+		return;
+
+	cancel_delayed_work_sync(&sched->tick_work);
+
+	if (sched->wq)
+		destroy_workqueue(sched->wq);
+
+	if (sched->heap_alloc_wq)
+		destroy_workqueue(sched->heap_alloc_wq);
+
+	for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) {
+		drm_WARN_ON(ddev, !list_empty(&sched->groups.runnable[prio]));
+		drm_WARN_ON(ddev, !list_empty(&sched->groups.idle[prio]));
+	}
+
+	drm_WARN_ON(ddev, !list_empty(&sched->groups.waiting));
+}
+
+int panthor_sched_init(struct panthor_device *ptdev)
+{
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+	struct panthor_fw_csg_iface *csg_iface = panthor_fw_get_csg_iface(ptdev, 0);
+	struct panthor_fw_cs_iface *cs_iface = panthor_fw_get_cs_iface(ptdev, 0, 0);
+	struct panthor_scheduler *sched;
+	u32 gpu_as_count, num_groups;
+	int prio, ret;
+
+	sched = drmm_kzalloc(&ptdev->base, sizeof(*sched), GFP_KERNEL);
+	if (!sched)
+		return -ENOMEM;
+
+	/* The highest bit in JOB_INT_* is reserved for globabl IRQs. That
+	 * leaves 31 bits for CSG IRQs, hence the MAX_CSGS clamp here.
+	 */
+	num_groups = min_t(u32, MAX_CSGS, glb_iface->control->group_num);
+
+	/* The FW-side scheduler might deadlock if two groups with the same
+	 * priority try to access a set of resources that overlaps, with part
+	 * of the resources being allocated to one group and the other part to
+	 * the other group, both groups waiting for the remaining resources to
+	 * be allocated. To avoid that, it is recommended to assign each CSG a
+	 * different priority. In theory we could allow several groups to have
+	 * the same CSG priority if they don't request the same resources, but
+	 * that makes the scheduling logic more complicated, so let's clamp
+	 * the number of CSG slots to MAX_CSG_PRIO + 1 for now.
+	 */
+	num_groups = min_t(u32, MAX_CSG_PRIO + 1, num_groups);
+
+	/* We need at least one AS for the MCU and one for the GPU contexts. */
+	gpu_as_count = hweight32(ptdev->gpu_info.as_present & GENMASK(31, 1));
+	if (!gpu_as_count) {
+		drm_err(&ptdev->base, "Not enough AS (%d, expected at least 2)",
+			gpu_as_count + 1);
+		return -EINVAL;
+	}
+
+	sched->ptdev = ptdev;
+	sched->sb_slot_count = CS_FEATURES_SCOREBOARDS(cs_iface->control->features);
+	sched->csg_slot_count = num_groups;
+	sched->cs_slot_count = csg_iface->control->stream_num;
+	sched->as_slot_count = gpu_as_count;
+	ptdev->csif_info.csg_slot_count = sched->csg_slot_count;
+	ptdev->csif_info.cs_slot_count = sched->cs_slot_count;
+	ptdev->csif_info.scoreboard_slot_count = sched->sb_slot_count;
+
+	sched->last_tick = 0;
+	sched->resched_target = U64_MAX;
+	sched->tick_period = msecs_to_jiffies(10);
+	INIT_DELAYED_WORK(&sched->tick_work, tick_work);
+	INIT_WORK(&sched->sync_upd_work, sync_upd_work);
+	INIT_WORK(&sched->fw_events_work, process_fw_events_work);
+
+	ret = drmm_mutex_init(&ptdev->base, &sched->lock);
+	if (ret)
+		return ret;
+
+	for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) {
+		INIT_LIST_HEAD(&sched->groups.runnable[prio]);
+		INIT_LIST_HEAD(&sched->groups.idle[prio]);
+	}
+	INIT_LIST_HEAD(&sched->groups.waiting);
+
+	ret = drmm_mutex_init(&ptdev->base, &sched->reset.lock);
+	if (ret)
+		return ret;
+
+	INIT_LIST_HEAD(&sched->reset.stopped_groups);
+
+	/* sched->heap_alloc_wq will be used for heap chunk allocation on
+	 * tiler OOM events, which means we can't use the same workqueue for
+	 * the scheduler because works queued by the scheduler are in
+	 * the dma-signalling path. Allocate a dedicated heap_alloc_wq to
+	 * work around this limitation.
+	 *
+	 * FIXME: Ultimately, what we need is a failable/non-blocking GEM
+	 * allocation path that we can call when a heap OOM is reported. The
+	 * FW is smart enough to fall back on other methods if the kernel can't
+	 * allocate memory, and fail the tiling job if none of these
+	 * countermeasures worked.
+	 *
+	 * Set WQ_MEM_RECLAIM on sched->wq to unblock the situation when the
+	 * system is running out of memory.
+	 */
+	sched->heap_alloc_wq = alloc_workqueue("panthor-heap-alloc", WQ_UNBOUND, 0);
+	sched->wq = alloc_workqueue("panthor-csf-sched", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+	if (!sched->wq || !sched->heap_alloc_wq) {
+		panthor_sched_fini(&ptdev->base, sched);
+		drm_err(&ptdev->base, "Failed to allocate the workqueues");
+		return -ENOMEM;
+	}
+
+	ret = drmm_add_action_or_reset(&ptdev->base, panthor_sched_fini, sched);
+	if (ret)
+		return ret;
+
+	ptdev->scheduler = sched;
+	return 0;
+}
diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h
new file mode 100644
index 0000000000000..66438b1f331f6
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_sched.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 or MIT */
+/* Copyright 2023 Collabora ltd. */
+
+#ifndef __PANTHOR_SCHED_H__
+#define __PANTHOR_SCHED_H__
+
+struct drm_exec;
+struct dma_fence;
+struct drm_file;
+struct drm_gem_object;
+struct drm_sched_job;
+struct drm_panthor_group_create;
+struct drm_panthor_queue_create;
+struct drm_panthor_group_get_state;
+struct drm_panthor_queue_submit;
+struct panthor_device;
+struct panthor_file;
+struct panthor_group_pool;
+struct panthor_job;
+
+int panthor_group_create(struct panthor_file *pfile,
+			 const struct drm_panthor_group_create *group_args,
+			 const struct drm_panthor_queue_create *queue_args);
+int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle);
+int panthor_group_get_state(struct panthor_file *pfile,
+			    struct drm_panthor_group_get_state *get_state);
+
+struct drm_sched_job *
+panthor_job_create(struct panthor_file *pfile,
+		   u16 group_handle,
+		   const struct drm_panthor_queue_submit *qsubmit);
+struct drm_sched_job *panthor_job_get(struct drm_sched_job *job);
+struct panthor_vm *panthor_job_vm(struct drm_sched_job *sched_job);
+void panthor_job_put(struct drm_sched_job *job);
+void panthor_job_update_resvs(struct drm_exec *exec, struct drm_sched_job *job);
+
+int panthor_group_pool_create(struct panthor_file *pfile);
+void panthor_group_pool_destroy(struct panthor_file *pfile);
+
+int panthor_sched_init(struct panthor_device *ptdev);
+void panthor_sched_unplug(struct panthor_device *ptdev);
+void panthor_sched_pre_reset(struct panthor_device *ptdev);
+void panthor_sched_post_reset(struct panthor_device *ptdev);
+void panthor_sched_suspend(struct panthor_device *ptdev);
+void panthor_sched_resume(struct panthor_device *ptdev);
+
+void panthor_sched_report_mmu_fault(struct panthor_device *ptdev);
+void panthor_sched_report_fw_events(struct panthor_device *ptdev, u32 events);
+
+#endif
diff --git a/drivers/gpu/drm/pl111/pl111_drv.c b/drivers/gpu/drm/pl111/pl111_drv.c
index eb25eedb5ee00..fbc1352eb63a5 100644
--- a/drivers/gpu/drm/pl111/pl111_drv.c
+++ b/drivers/gpu/drm/pl111/pl111_drv.c
@@ -224,8 +224,6 @@ static const struct drm_driver pl111_drm_driver = {
 	.minor = 0,
 	.patchlevel = 0,
 	.dumb_create = drm_gem_dma_dumb_create,
-	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
 	.gem_prime_import_sg_table = pl111_gem_import_sg_table,
 	.gem_prime_mmap = drm_gem_prime_mmap,
 
diff --git a/drivers/gpu/drm/qxl/qxl_drv.c b/drivers/gpu/drm/qxl/qxl_drv.c
index 3044ca948ce22..c259968a5b1ef 100644
--- a/drivers/gpu/drm/qxl/qxl_drv.c
+++ b/drivers/gpu/drm/qxl/qxl_drv.c
@@ -289,8 +289,6 @@ static struct drm_driver qxl_driver = {
 #if defined(CONFIG_DEBUG_FS)
 	.debugfs_init = qxl_debugfs_init,
 #endif
-	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
 	.gem_prime_import_sg_table = qxl_gem_prime_import_sg_table,
 	.fops = &qxl_fops,
 	.ioctls = qxl_ioctls,
diff --git a/drivers/gpu/drm/qxl/qxl_object.c b/drivers/gpu/drm/qxl/qxl_object.c
index 695d9308d1f08..06a58dad5f5cf 100644
--- a/drivers/gpu/drm/qxl/qxl_object.c
+++ b/drivers/gpu/drm/qxl/qxl_object.c
@@ -168,9 +168,16 @@ int qxl_bo_vmap_locked(struct qxl_bo *bo, struct iosys_map *map)
 		bo->map_count++;
 		goto out;
 	}
-	r = ttm_bo_vmap(&bo->tbo, &bo->map);
+
+	r = __qxl_bo_pin(bo);
 	if (r)
 		return r;
+
+	r = ttm_bo_vmap(&bo->tbo, &bo->map);
+	if (r) {
+		__qxl_bo_unpin(bo);
+		return r;
+	}
 	bo->map_count = 1;
 
 	/* TODO: Remove kptr in favor of map everywhere. */
@@ -192,12 +199,6 @@ int qxl_bo_vmap(struct qxl_bo *bo, struct iosys_map *map)
 	if (r)
 		return r;
 
-	r = __qxl_bo_pin(bo);
-	if (r) {
-		qxl_bo_unreserve(bo);
-		return r;
-	}
-
 	r = qxl_bo_vmap_locked(bo, map);
 	qxl_bo_unreserve(bo);
 	return r;
@@ -247,6 +248,7 @@ void qxl_bo_vunmap_locked(struct qxl_bo *bo)
 		return;
 	bo->kptr = NULL;
 	ttm_bo_vunmap(&bo->tbo, &bo->map);
+	__qxl_bo_unpin(bo);
 }
 
 int qxl_bo_vunmap(struct qxl_bo *bo)
@@ -258,7 +260,6 @@ int qxl_bo_vunmap(struct qxl_bo *bo)
 		return r;
 
 	qxl_bo_vunmap_locked(bo);
-	__qxl_bo_unpin(bo);
 	qxl_bo_unreserve(bo);
 	return 0;
 }
diff --git a/drivers/gpu/drm/qxl/qxl_prime.c b/drivers/gpu/drm/qxl/qxl_prime.c
index 142d01415acb3..9169c26357d36 100644
--- a/drivers/gpu/drm/qxl/qxl_prime.c
+++ b/drivers/gpu/drm/qxl/qxl_prime.c
@@ -59,7 +59,7 @@ int qxl_gem_prime_vmap(struct drm_gem_object *obj, struct iosys_map *map)
 	struct qxl_bo *bo = gem_to_qxl_bo(obj);
 	int ret;
 
-	ret = qxl_bo_vmap(bo, map);
+	ret = qxl_bo_vmap_locked(bo, map);
 	if (ret < 0)
 		return ret;
 
@@ -71,5 +71,5 @@ void qxl_gem_prime_vunmap(struct drm_gem_object *obj,
 {
 	struct qxl_bo *bo = gem_to_qxl_bo(obj);
 
-	qxl_bo_vunmap(bo);
+	qxl_bo_vunmap_locked(bo);
 }
diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c
index 6cbe1ab81abad..67345bafa3c89 100644
--- a/drivers/gpu/drm/radeon/radeon_drv.c
+++ b/drivers/gpu/drm/radeon/radeon_drv.c
@@ -605,8 +605,6 @@ static const struct drm_driver kms_driver = {
 	.dumb_map_offset = radeon_mode_dumb_mmap,
 	.fops = &radeon_driver_kms_fops,
 
-	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
 	.gem_prime_import_sg_table = radeon_gem_prime_import_sg_table,
 	.gem_prime_mmap = drm_gem_prime_mmap,
 
diff --git a/drivers/gpu/drm/rcar-du/rcar_du_drv.c b/drivers/gpu/drm/rcar-du/rcar_du_drv.c
index 6381578c4db58..f8ad042b4ac67 100644
--- a/drivers/gpu/drm/rcar-du/rcar_du_drv.c
+++ b/drivers/gpu/drm/rcar-du/rcar_du_drv.c
@@ -626,8 +626,6 @@ DEFINE_DRM_GEM_DMA_FOPS(rcar_du_fops);
 static const struct drm_driver rcar_du_driver = {
 	.driver_features	= DRIVER_GEM | DRIVER_MODESET | DRIVER_ATOMIC,
 	.dumb_create		= rcar_du_dumb_create,
-	.prime_handle_to_fd	= drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle	= drm_gem_prime_fd_to_handle,
 	.gem_prime_import_sg_table = rcar_du_gem_prime_import_sg_table,
 	.gem_prime_mmap		= drm_gem_prime_mmap,
 	.fops			= &rcar_du_fops,
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_drv.c b/drivers/gpu/drm/rockchip/rockchip_drm_drv.c
index b6c34f94cc8b6..36edac612d9a8 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_drv.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_drv.c
@@ -1852,8 +1852,6 @@ static struct drm_driver rockchip_drm_driver = {
 	.lastclose		= rockchip_drm_lastclose,
 	.open			= rockchip_drm_open,
 	.dumb_create		= rockchip_gem_dumb_create,
-	.prime_handle_to_fd	= drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle	= drm_gem_prime_fd_to_handle,
 	.gem_prime_import	= rockchip_drm_gem_prime_import,
 	.gem_prime_import_sg_table	= rockchip_gem_prime_import_sg_table,
 	.gem_prime_mmap		= drm_gem_prime_mmap,
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
index 3143ecaaff862..f8ed093b7356e 100644
--- a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
+++ b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
@@ -51,7 +51,7 @@ DECLARE_EVENT_CLASS(drm_sched_job,
 			   __assign_str(name, sched_job->sched->name);
 			   __entry->job_count = spsc_queue_count(&entity->job_queue);
 			   __entry->hw_job_count = atomic_read(
-				   &sched_job->sched->hw_rq_count);
+				   &sched_job->sched->credit_count);
 			   ),
 	    TP_printk("entity=%p, id=%llu, fence=%p, ring=%s, job count:%u, hw job count:%d",
 		      __entry->entity, __entry->id,
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 4b913dbb7d7b6..3c4f5a392b064 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -72,15 +72,32 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
 	entity->num_sched_list = num_sched_list;
 	entity->priority = priority;
 	entity->sched_list = num_sched_list > 1 ? sched_list : NULL;
-	entity->last_scheduled = NULL;
+	RCU_INIT_POINTER(entity->last_scheduled, NULL);
+	RB_CLEAR_NODE(&entity->rb_tree_node);
 
-	if(num_sched_list)
-		entity->rq = &sched_list[0]->sched_rq[entity->priority];
+	if (!sched_list[0]->sched_rq) {
+		/* Warn drivers not to do this and to fix their DRM
+		 * calling order.
+		 */
+		pr_warn("%s: called with uninitialized scheduler\n", __func__);
+	} else if (num_sched_list) {
+		/* The "priority" of an entity cannot exceed the number of run-queues of a
+		 * scheduler. Protect against num_rqs being 0, by converting to signed. Choose
+		 * the lowest priority available.
+		 */
+		if (entity->priority >= sched_list[0]->num_rqs) {
+			drm_err(sched_list[0], "entity with out-of-bounds priority:%u num_rqs:%u\n",
+				entity->priority, sched_list[0]->num_rqs);
+			entity->priority = max_t(s32, (s32) sched_list[0]->num_rqs - 1,
+						 (s32) DRM_SCHED_PRIORITY_KERNEL);
+		}
+		entity->rq = sched_list[0]->sched_rq[entity->priority];
+	}
 
 	init_completion(&entity->entity_idle);
 
 	/* We start in an idle state. */
-	complete(&entity->entity_idle);
+	complete_all(&entity->entity_idle);
 
 	spin_lock_init(&entity->rq_lock);
 	spsc_queue_init(&entity->job_queue);
@@ -139,6 +156,110 @@ bool drm_sched_entity_is_ready(struct drm_sched_entity *entity)
 	return true;
 }
 
+/**
+ * drm_sched_entity_error - return error of last scheduled job
+ * @entity: scheduler entity to check
+ *
+ * Opportunistically return the error of the last scheduled job. Result can
+ * change any time when new jobs are pushed to the hw.
+ */
+int drm_sched_entity_error(struct drm_sched_entity *entity)
+{
+	struct dma_fence *fence;
+	int r;
+
+	rcu_read_lock();
+	fence = rcu_dereference(entity->last_scheduled);
+	r = fence ? fence->error : 0;
+	rcu_read_unlock();
+
+	return r;
+}
+EXPORT_SYMBOL(drm_sched_entity_error);
+
+static void drm_sched_entity_kill_jobs_work(struct work_struct *wrk)
+{
+	struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
+
+	drm_sched_fence_finished(job->s_fence, -ESRCH);
+	WARN_ON(job->s_fence->parent);
+	job->sched->ops->free_job(job);
+}
+
+/* Signal the scheduler finished fence when the entity in question is killed. */
+static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
+					  struct dma_fence_cb *cb)
+{
+	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
+						 finish_cb);
+	unsigned long index;
+
+	dma_fence_put(f);
+
+	/* Wait for all dependencies to avoid data corruptions */
+	xa_for_each(&job->dependencies, index, f) {
+		struct drm_sched_fence *s_fence = to_drm_sched_fence(f);
+
+		if (s_fence && f == &s_fence->scheduled) {
+			/* The dependencies array had a reference on the scheduled
+			 * fence, and the finished fence refcount might have
+			 * dropped to zero. Use dma_fence_get_rcu() so we get
+			 * a NULL fence in that case.
+			 */
+			f = dma_fence_get_rcu(&s_fence->finished);
+
+			/* Now that we have a reference on the finished fence,
+			 * we can release the reference the dependencies array
+			 * had on the scheduled fence.
+			 */
+			dma_fence_put(&s_fence->scheduled);
+		}
+
+		xa_erase(&job->dependencies, index);
+		if (f && !dma_fence_add_callback(f, &job->finish_cb,
+						 drm_sched_entity_kill_jobs_cb))
+			return;
+
+		dma_fence_put(f);
+	}
+
+	INIT_WORK(&job->work, drm_sched_entity_kill_jobs_work);
+	schedule_work(&job->work);
+}
+
+/* Remove the entity from the scheduler and kill all pending jobs */
+static void drm_sched_entity_kill(struct drm_sched_entity *entity)
+{
+	struct drm_sched_job *job;
+	struct dma_fence *prev;
+
+	if (!entity->rq)
+		return;
+
+	spin_lock(&entity->rq_lock);
+	entity->stopped = true;
+	drm_sched_rq_remove_entity(entity->rq, entity);
+	spin_unlock(&entity->rq_lock);
+
+	/* Make sure this entity is not used by the scheduler at the moment */
+	wait_for_completion(&entity->entity_idle);
+
+	/* The entity is guaranteed to not be used by the scheduler */
+	prev = rcu_dereference_check(entity->last_scheduled, true);
+	dma_fence_get(prev);
+	while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
+		struct drm_sched_fence *s_fence = job->s_fence;
+
+		dma_fence_get(&s_fence->finished);
+		if (!prev || dma_fence_add_callback(prev, &job->finish_cb,
+					   drm_sched_entity_kill_jobs_cb))
+			drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
+
+		prev = &s_fence->finished;
+	}
+	dma_fence_put(prev);
+}
+
 /**
  * drm_sched_entity_flush - Flush a context entity
  *
@@ -179,91 +300,13 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
 	/* For killed process disable any more IBs enqueue right now */
 	last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
 	if ((!last_user || last_user == current->group_leader) &&
-	    (current->flags & PF_EXITING) && (current->exit_code == SIGKILL)) {
-		spin_lock(&entity->rq_lock);
-		entity->stopped = true;
-		drm_sched_rq_remove_entity(entity->rq, entity);
-		spin_unlock(&entity->rq_lock);
-	}
+	    (current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
+		drm_sched_entity_kill(entity);
 
 	return ret;
 }
 EXPORT_SYMBOL(drm_sched_entity_flush);
 
-static void drm_sched_entity_kill_jobs_work(struct work_struct *wrk)
-{
-	struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
-
-	drm_sched_fence_finished(job->s_fence);
-	WARN_ON(job->s_fence->parent);
-	job->sched->ops->free_job(job);
-}
-
-
-/* Signal the scheduler finished fence when the entity in question is killed. */
-static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
-					  struct dma_fence_cb *cb)
-{
-	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
-						 finish_cb);
-
-	dma_fence_put(f);
-	INIT_WORK(&job->work, drm_sched_entity_kill_jobs_work);
-	schedule_work(&job->work);
-}
-
-static struct dma_fence *
-drm_sched_job_dependency(struct drm_sched_job *job,
-			 struct drm_sched_entity *entity)
-{
-	if (!xa_empty(&job->dependencies))
-		return xa_erase(&job->dependencies, job->last_dependency++);
-
-	if (job->sched->ops->dependency)
-		return job->sched->ops->dependency(job, entity);
-
-	return NULL;
-}
-
-static void drm_sched_entity_kill_jobs(struct drm_sched_entity *entity)
-{
-	struct drm_sched_job *job;
-	struct dma_fence *f;
-	int r;
-
-	while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
-		struct drm_sched_fence *s_fence = job->s_fence;
-
-		/* Wait for all dependencies to avoid data corruptions */
-		while ((f = drm_sched_job_dependency(job, entity))) {
-			dma_fence_wait(f, false);
-			dma_fence_put(f);
-		}
-
-		drm_sched_fence_scheduled(s_fence);
-		dma_fence_set_error(&s_fence->finished, -ESRCH);
-
-		/*
-		 * When pipe is hanged by older entity, new entity might
-		 * not even have chance to submit it's first job to HW
-		 * and so entity->last_scheduled will remain NULL
-		 */
-		if (!entity->last_scheduled) {
-			drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
-			continue;
-		}
-
-		dma_fence_get(entity->last_scheduled);
-		r = dma_fence_add_callback(entity->last_scheduled,
-					   &job->finish_cb,
-					   drm_sched_entity_kill_jobs_cb);
-		if (r == -ENOENT)
-			drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
-		else if (r)
-			DRM_ERROR("fence add callback failed (%d)\n", r);
-	}
-}
-
 /**
  * drm_sched_entity_fini - Destroy a context entity
  *
@@ -277,37 +320,21 @@ static void drm_sched_entity_kill_jobs(struct drm_sched_entity *entity)
  */
 void drm_sched_entity_fini(struct drm_sched_entity *entity)
 {
-	struct drm_gpu_scheduler *sched = NULL;
-
-	if (entity->rq) {
-		sched = entity->rq->sched;
-		drm_sched_rq_remove_entity(entity->rq, entity);
-	}
-
-	/* Consumption of existing IBs wasn't completed. Forcefully
-	 * remove them here.
+	/*
+	 * If consumption of existing IBs wasn't completed. Forcefully remove
+	 * them here. Also makes sure that the scheduler won't touch this entity
+	 * any more.
 	 */
-	if (spsc_queue_count(&entity->job_queue)) {
-		if (sched) {
-			/*
-			 * Wait for thread to idle to make sure it isn't processing
-			 * this entity.
-			 */
-			wait_for_completion(&entity->entity_idle);
+	drm_sched_entity_kill(entity);
 
-		}
-		if (entity->dependency) {
-			dma_fence_remove_callback(entity->dependency,
-						  &entity->cb);
-			dma_fence_put(entity->dependency);
-			entity->dependency = NULL;
-		}
-
-		drm_sched_entity_kill_jobs(entity);
+	if (entity->dependency) {
+		dma_fence_remove_callback(entity->dependency, &entity->cb);
+		dma_fence_put(entity->dependency);
+		entity->dependency = NULL;
 	}
 
-	dma_fence_put(entity->last_scheduled);
-	entity->last_scheduled = NULL;
+	dma_fence_put(rcu_dereference_check(entity->last_scheduled, true));
+	RCU_INIT_POINTER(entity->last_scheduled, NULL);
 }
 EXPORT_SYMBOL(drm_sched_entity_fini);
 
@@ -347,7 +374,7 @@ static void drm_sched_entity_wakeup(struct dma_fence *f,
 		container_of(cb, struct drm_sched_entity, cb);
 
 	drm_sched_entity_clear_dep(f, cb);
-	drm_sched_wakeup(entity->rq->sched);
+	drm_sched_wakeup(entity->rq->sched, entity);
 }
 
 /**
@@ -389,7 +416,7 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
 	}
 
 	s_fence = to_drm_sched_fence(fence);
-	if (s_fence && s_fence->sched == sched &&
+	if (!fence->error && s_fence && s_fence->sched == sched &&
 	    !test_bit(DRM_SCHED_FENCE_DONT_PIPELINE, &fence->flags)) {
 
 		/*
@@ -416,6 +443,28 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
 	return false;
 }
 
+static struct dma_fence *
+drm_sched_job_dependency(struct drm_sched_job *job,
+			 struct drm_sched_entity *entity)
+{
+	struct dma_fence *f;
+
+	/* We keep the fence around, so we can iterate over all dependencies
+	 * in drm_sched_entity_kill_jobs_cb() to ensure all deps are signaled
+	 * before killing the job.
+	 */
+	f = xa_load(&job->dependencies, job->last_dependency);
+	if (f) {
+		job->last_dependency++;
+		return dma_fence_get(f);
+	}
+
+	if (job->sched->ops->prepare_job)
+		return job->sched->ops->prepare_job(job, entity);
+
+	return NULL;
+}
+
 struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
 {
 	struct drm_sched_job *sched_job;
@@ -436,9 +485,9 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
 	if (entity->guilty && atomic_read(entity->guilty))
 		dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
 
-	dma_fence_put(entity->last_scheduled);
-
-	entity->last_scheduled = dma_fence_get(&sched_job->s_fence->finished);
+	dma_fence_put(rcu_dereference_check(entity->last_scheduled, true));
+	rcu_assign_pointer(entity->last_scheduled,
+			   dma_fence_get(&sched_job->s_fence->finished));
 
 	/*
 	 * If the queue is empty we allow drm_sched_entity_select_rq() to
@@ -448,6 +497,25 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
 	smp_wmb();
 
 	spsc_queue_pop(&entity->job_queue);
+
+	/*
+	 * Update the entity's location in the min heap according to
+	 * the timestamp of the next job, if any.
+	 */
+	if (drm_sched_policy == DRM_SCHED_POLICY_FIFO) {
+		struct drm_sched_job *next;
+
+		next = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
+		if (next)
+			drm_sched_rq_update_fifo(entity, next->submit_ts);
+	}
+
+	/* Jobs and entities might have different lifecycles. Since we're
+	 * removing the job from the entities queue, set the jobs entity pointer
+	 * to NULL to prevent any future access of the entity through this job.
+	 */
+	sched_job->entity = NULL;
+
 	return sched_job;
 }
 
@@ -473,7 +541,7 @@ void drm_sched_entity_select_rq(struct drm_sched_entity *entity)
 	 */
 	smp_rmb();
 
-	fence = entity->last_scheduled;
+	fence = rcu_dereference_check(entity->last_scheduled, true);
 
 	/* stay on the same engine if the previous job hasn't finished */
 	if (fence && !dma_fence_is_signaled(fence))
@@ -481,7 +549,7 @@ void drm_sched_entity_select_rq(struct drm_sched_entity *entity)
 
 	spin_lock(&entity->rq_lock);
 	sched = drm_sched_pick_best(entity->sched_list, entity->num_sched_list);
-	rq = sched ? &sched->sched_rq[entity->priority] : NULL;
+	rq = sched ? sched->sched_rq[entity->priority] : NULL;
 	if (rq != entity->rq) {
 		drm_sched_rq_remove_entity(entity->rq, entity);
 		entity->rq = rq;
@@ -507,10 +575,18 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
 {
 	struct drm_sched_entity *entity = sched_job->entity;
 	bool first;
+	ktime_t submit_ts;
 
 	trace_drm_sched_job(sched_job, entity);
 	atomic_inc(entity->rq->sched->score);
 	WRITE_ONCE(entity->last_user, current->group_leader);
+
+	/*
+	 * After the sched_job is pushed into the entity queue, it may be
+	 * completed and freed up at any time. We can no longer access it.
+	 * Make sure to set the submit_ts first, to avoid a race.
+	 */
+	sched_job->submit_ts = submit_ts = ktime_get();
 	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
 
 	/* first job wakes up scheduler */
@@ -523,9 +599,14 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
 			DRM_ERROR("Trying to push to a killed entity\n");
 			return;
 		}
+
 		drm_sched_rq_add_entity(entity->rq, entity);
 		spin_unlock(&entity->rq_lock);
-		drm_sched_wakeup(entity->rq->sched);
+
+		if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
+			drm_sched_rq_update_fifo(entity, submit_ts);
+
+		drm_sched_wakeup(entity->rq->sched, entity);
 	}
 }
 EXPORT_SYMBOL(drm_sched_entity_push_job);
diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c
index 7fd869520ef2c..06cedfe4b4867 100644
--- a/drivers/gpu/drm/scheduler/sched_fence.c
+++ b/drivers/gpu/drm/scheduler/sched_fence.c
@@ -48,13 +48,39 @@ static void __exit drm_sched_fence_slab_fini(void)
 	kmem_cache_destroy(sched_fence_slab);
 }
 
-void drm_sched_fence_scheduled(struct drm_sched_fence *fence)
+static void drm_sched_fence_set_parent(struct drm_sched_fence *s_fence,
+				       struct dma_fence *fence)
 {
+	/*
+	 * smp_store_release() to ensure another thread racing us
+	 * in drm_sched_fence_set_deadline_finished() sees the
+	 * fence's parent set before test_bit()
+	 */
+	smp_store_release(&s_fence->parent, dma_fence_get(fence));
+	if (test_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT,
+		     &s_fence->finished.flags))
+		dma_fence_set_deadline(fence, s_fence->deadline);
+}
+
+void drm_sched_fence_scheduled(struct drm_sched_fence *fence,
+			       struct dma_fence *parent)
+{
+	/* Set the parent before signaling the scheduled fence, such that,
+	 * any waiter expecting the parent to be filled after the job has
+	 * been scheduled (which is the case for drivers delegating waits
+	 * to some firmware) doesn't have to busy wait for parent to show
+	 * up.
+	 */
+	if (!IS_ERR_OR_NULL(parent))
+		drm_sched_fence_set_parent(fence, parent);
+
 	dma_fence_signal(&fence->scheduled);
 }
 
-void drm_sched_fence_finished(struct drm_sched_fence *fence)
+void drm_sched_fence_finished(struct drm_sched_fence *fence, int result)
 {
+	if (result)
+		dma_fence_set_error(&fence->finished, result);
 	dma_fence_signal(&fence->finished);
 }
 
@@ -123,6 +149,37 @@ static void drm_sched_fence_release_finished(struct dma_fence *f)
 	dma_fence_put(&fence->scheduled);
 }
 
+static void drm_sched_fence_set_deadline_finished(struct dma_fence *f,
+						  ktime_t deadline)
+{
+	struct drm_sched_fence *fence = to_drm_sched_fence(f);
+	struct dma_fence *parent;
+	unsigned long flags;
+
+	spin_lock_irqsave(&fence->lock, flags);
+
+	/* If we already have an earlier deadline, keep it: */
+	if (test_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, &f->flags) &&
+	    ktime_before(fence->deadline, deadline)) {
+		spin_unlock_irqrestore(&fence->lock, flags);
+		return;
+	}
+
+	fence->deadline = deadline;
+	set_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, &f->flags);
+
+	spin_unlock_irqrestore(&fence->lock, flags);
+
+	/*
+	 * smp_load_aquire() to ensure that if we are racing another
+	 * thread calling drm_sched_fence_set_parent(), that we see
+	 * the parent set before it calls test_bit(HAS_DEADLINE_BIT)
+	 */
+	parent = smp_load_acquire(&fence->parent);
+	if (parent)
+		dma_fence_set_deadline(parent, deadline);
+}
+
 static const struct dma_fence_ops drm_sched_fence_ops_scheduled = {
 	.get_driver_name = drm_sched_fence_get_driver_name,
 	.get_timeline_name = drm_sched_fence_get_timeline_name,
@@ -133,6 +190,7 @@ static const struct dma_fence_ops drm_sched_fence_ops_finished = {
 	.get_driver_name = drm_sched_fence_get_driver_name,
 	.get_timeline_name = drm_sched_fence_get_timeline_name,
 	.release = drm_sched_fence_release_finished,
+	.set_deadline = drm_sched_fence_set_deadline_finished,
 };
 
 struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index e5a4ecde0063d..8acbef7ae53d1 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -42,9 +42,36 @@
  *    the hardware.
  *
  * The jobs in a entity are always scheduled in the order that they were pushed.
+ *
+ * Note that once a job was taken from the entities queue and pushed to the
+ * hardware, i.e. the pending queue, the entity must not be referenced anymore
+ * through the jobs entity pointer.
+ */
+
+/**
+ * DOC: Flow Control
+ *
+ * The DRM GPU scheduler provides a flow control mechanism to regulate the rate
+ * in which the jobs fetched from scheduler entities are executed.
+ *
+ * In this context the &drm_gpu_scheduler keeps track of a driver specified
+ * credit limit representing the capacity of this scheduler and a credit count;
+ * every &drm_sched_job carries a driver specified number of credits.
+ *
+ * Once a job is executed (but not yet finished), the job's credits contribute
+ * to the scheduler's credit count until the job is finished. If by executing
+ * one more job the scheduler's credit count would exceed the scheduler's
+ * credit limit, the job won't be executed. Instead, the scheduler will wait
+ * until the credit count has decreased enough to not overflow its credit limit.
+ * This implies waiting for previously executed jobs.
+ *
+ * Optionally, drivers may register a callback (update_job_credits) provided by
+ * struct drm_sched_backend_ops to update the job's credits dynamically. The
+ * scheduler executes this callback every time the scheduler considers a job for
+ * execution and subsequently checks whether the job fits the scheduler's credit
+ * limit.
  */
 
-#include <linux/kthread.h>
 #include <linux/wait.h>
 #include <linux/sched.h>
 #include <linux/completion.h>
@@ -53,6 +80,7 @@
 
 #include <drm/drm_print.h>
 #include <drm/drm_gem.h>
+#include <drm/drm_syncobj.h>
 #include <drm/gpu_scheduler.h>
 #include <drm/spsc_queue.h>
 
@@ -62,6 +90,100 @@
 #define to_drm_sched_job(sched_job)		\
 		container_of((sched_job), struct drm_sched_job, queue_node)
 
+int drm_sched_policy = DRM_SCHED_POLICY_FIFO;
+
+/**
+ * DOC: sched_policy (int)
+ * Used to override default entities scheduling policy in a run queue.
+ */
+MODULE_PARM_DESC(sched_policy, "Specify the scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default).");
+module_param_named(sched_policy, drm_sched_policy, int, 0444);
+
+static u32 drm_sched_available_credits(struct drm_gpu_scheduler *sched)
+{
+	u32 credits;
+
+	drm_WARN_ON(sched, check_sub_overflow(sched->credit_limit,
+					      atomic_read(&sched->credit_count),
+					      &credits));
+
+	return credits;
+}
+
+/**
+ * drm_sched_can_queue -- Can we queue more to the hardware?
+ * @sched: scheduler instance
+ * @entity: the scheduler entity
+ *
+ * Return true if we can push at least one more job from @entity, false
+ * otherwise.
+ */
+static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched,
+				struct drm_sched_entity *entity)
+{
+	struct drm_sched_job *s_job;
+
+	s_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
+	if (!s_job)
+		return false;
+
+	if (sched->ops->update_job_credits) {
+		s_job->credits = sched->ops->update_job_credits(s_job);
+
+		drm_WARN(sched, !s_job->credits,
+			 "Jobs with zero credits bypass job-flow control.\n");
+	}
+
+	/* If a job exceeds the credit limit, truncate it to the credit limit
+	 * itself to guarantee forward progress.
+	 */
+	if (drm_WARN(sched, s_job->credits > sched->credit_limit,
+		     "Jobs may not exceed the credit limit, truncate.\n"))
+		s_job->credits = sched->credit_limit;
+
+	return drm_sched_available_credits(sched) >= s_job->credits;
+}
+
+static __always_inline bool drm_sched_entity_compare_before(struct rb_node *a,
+							    const struct rb_node *b)
+{
+	struct drm_sched_entity *ent_a =  rb_entry((a), struct drm_sched_entity, rb_tree_node);
+	struct drm_sched_entity *ent_b =  rb_entry((b), struct drm_sched_entity, rb_tree_node);
+
+	return ktime_before(ent_a->oldest_job_waiting, ent_b->oldest_job_waiting);
+}
+
+static inline void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity *entity)
+{
+	struct drm_sched_rq *rq = entity->rq;
+
+	if (!RB_EMPTY_NODE(&entity->rb_tree_node)) {
+		rb_erase_cached(&entity->rb_tree_node, &rq->rb_tree_root);
+		RB_CLEAR_NODE(&entity->rb_tree_node);
+	}
+}
+
+void drm_sched_rq_update_fifo(struct drm_sched_entity *entity, ktime_t ts)
+{
+	/*
+	 * Both locks need to be grabbed, one to protect from entity->rq change
+	 * for entity from within concurrent drm_sched_entity_select_rq and the
+	 * other to update the rb tree structure.
+	 */
+	spin_lock(&entity->rq_lock);
+	spin_lock(&entity->rq->lock);
+
+	drm_sched_rq_remove_fifo_locked(entity);
+
+	entity->oldest_job_waiting = ts;
+
+	rb_add_cached(&entity->rb_tree_node, &entity->rq->rb_tree_root,
+		      drm_sched_entity_compare_before);
+
+	spin_unlock(&entity->rq->lock);
+	spin_unlock(&entity->rq_lock);
+}
+
 /**
  * drm_sched_rq_init - initialize a given run queue struct
  *
@@ -75,6 +197,7 @@ static void drm_sched_rq_init(struct drm_gpu_scheduler *sched,
 {
 	spin_lock_init(&rq->lock);
 	INIT_LIST_HEAD(&rq->entities);
+	rq->rb_tree_root = RB_ROOT_CACHED;
 	rq->current_entity = NULL;
 	rq->sched = sched;
 }
@@ -92,9 +215,12 @@ void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
 {
 	if (!list_empty(&entity->list))
 		return;
+
 	spin_lock(&rq->lock);
+
 	atomic_inc(rq->sched->score);
 	list_add_tail(&entity->list, &rq->entities);
+
 	spin_unlock(&rq->lock);
 }
 
@@ -111,23 +237,36 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
 {
 	if (list_empty(&entity->list))
 		return;
+
 	spin_lock(&rq->lock);
+
 	atomic_dec(rq->sched->score);
 	list_del_init(&entity->list);
+
 	if (rq->current_entity == entity)
 		rq->current_entity = NULL;
+
+	if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
+		drm_sched_rq_remove_fifo_locked(entity);
+
 	spin_unlock(&rq->lock);
 }
 
 /**
- * drm_sched_rq_select_entity - Select an entity which could provide a job to run
+ * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
  *
+ * @sched: the gpu scheduler
  * @rq: scheduler run queue to check.
  *
- * Try to find a ready entity, returns NULL if none found.
+ * Try to find the next ready entity.
+ *
+ * Return an entity if one is found; return an error-pointer (!NULL) if an
+ * entity was ready, but the scheduler had insufficient credits to accommodate
+ * its job; return NULL, if no ready entity was found.
  */
 static struct drm_sched_entity *
-drm_sched_rq_select_entity(struct drm_sched_rq *rq)
+drm_sched_rq_select_entity_rr(struct drm_gpu_scheduler *sched,
+			      struct drm_sched_rq *rq)
 {
 	struct drm_sched_entity *entity;
 
@@ -137,6 +276,14 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq)
 	if (entity) {
 		list_for_each_entry_continue(entity, &rq->entities, list) {
 			if (drm_sched_entity_is_ready(entity)) {
+				/* If we can't queue yet, preserve the current
+				 * entity in terms of fairness.
+				 */
+				if (!drm_sched_can_queue(sched, entity)) {
+					spin_unlock(&rq->lock);
+					return ERR_PTR(-ENOSPC);
+				}
+
 				rq->current_entity = entity;
 				reinit_completion(&entity->entity_idle);
 				spin_unlock(&rq->lock);
@@ -146,8 +293,15 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq)
 	}
 
 	list_for_each_entry(entity, &rq->entities, list) {
-
 		if (drm_sched_entity_is_ready(entity)) {
+			/* If we can't queue yet, preserve the current entity in
+			 * terms of fairness.
+			 */
+			if (!drm_sched_can_queue(sched, entity)) {
+				spin_unlock(&rq->lock);
+				return ERR_PTR(-ENOSPC);
+			}
+
 			rq->current_entity = entity;
 			reinit_completion(&entity->entity_idle);
 			spin_unlock(&rq->lock);
@@ -163,26 +317,104 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq)
 	return NULL;
 }
 
+/**
+ * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
+ *
+ * @sched: the gpu scheduler
+ * @rq: scheduler run queue to check.
+ *
+ * Find oldest waiting ready entity.
+ *
+ * Return an entity if one is found; return an error-pointer (!NULL) if an
+ * entity was ready, but the scheduler had insufficient credits to accommodate
+ * its job; return NULL, if no ready entity was found.
+ */
+static struct drm_sched_entity *
+drm_sched_rq_select_entity_fifo(struct drm_gpu_scheduler *sched,
+				struct drm_sched_rq *rq)
+{
+	struct rb_node *rb;
+
+	spin_lock(&rq->lock);
+	for (rb = rb_first_cached(&rq->rb_tree_root); rb; rb = rb_next(rb)) {
+		struct drm_sched_entity *entity;
+
+		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
+		if (drm_sched_entity_is_ready(entity)) {
+			/* If we can't queue yet, preserve the current entity in
+			 * terms of fairness.
+			 */
+			if (!drm_sched_can_queue(sched, entity)) {
+				spin_unlock(&rq->lock);
+				return ERR_PTR(-ENOSPC);
+			}
+
+			rq->current_entity = entity;
+			reinit_completion(&entity->entity_idle);
+			break;
+		}
+	}
+	spin_unlock(&rq->lock);
+
+	return rb ? rb_entry(rb, struct drm_sched_entity, rb_tree_node) : NULL;
+}
+
+/**
+ * drm_sched_run_job_queue - enqueue run-job work
+ * @sched: scheduler instance
+ */
+static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
+{
+	if (!READ_ONCE(sched->pause_submit))
+		queue_work(sched->submit_wq, &sched->work_run_job);
+}
+
+/**
+ * __drm_sched_run_free_queue - enqueue free-job work
+ * @sched: scheduler instance
+ */
+static void __drm_sched_run_free_queue(struct drm_gpu_scheduler *sched)
+{
+	if (!READ_ONCE(sched->pause_submit))
+		queue_work(sched->submit_wq, &sched->work_free_job);
+}
+
+/**
+ * drm_sched_run_free_queue - enqueue free-job work if ready
+ * @sched: scheduler instance
+ */
+static void drm_sched_run_free_queue(struct drm_gpu_scheduler *sched)
+{
+	struct drm_sched_job *job;
+
+	spin_lock(&sched->job_list_lock);
+	job = list_first_entry_or_null(&sched->pending_list,
+				       struct drm_sched_job, list);
+	if (job && dma_fence_is_signaled(&job->s_fence->finished))
+		__drm_sched_run_free_queue(sched);
+	spin_unlock(&sched->job_list_lock);
+}
+
 /**
  * drm_sched_job_done - complete a job
  * @s_job: pointer to the job which is done
  *
  * Finish the job's fence and wake up the worker thread.
  */
-static void drm_sched_job_done(struct drm_sched_job *s_job)
+static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
 {
 	struct drm_sched_fence *s_fence = s_job->s_fence;
 	struct drm_gpu_scheduler *sched = s_fence->sched;
 
-	atomic_dec(&sched->hw_rq_count);
+	atomic_sub(s_job->credits, &sched->credit_count);
 	atomic_dec(sched->score);
 
 	trace_drm_sched_process_job(s_fence);
 
 	dma_fence_get(&s_fence->finished);
-	drm_sched_fence_finished(s_fence);
+	drm_sched_fence_finished(s_fence, result);
 	dma_fence_put(&s_fence->finished);
-	wake_up_interruptible(&sched->wake_up_worker);
+	__drm_sched_run_free_queue(sched);
 }
 
 /**
@@ -194,48 +426,47 @@ static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb)
 {
 	struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb);
 
-	drm_sched_job_done(s_job);
+	drm_sched_job_done(s_job, f->error);
 }
 
 /**
- * drm_sched_dependency_optimized - test if the dependency can be optimized
+ * drm_sched_start_timeout - start timeout for reset worker
  *
- * @fence: the dependency fence
- * @entity: the entity which depends on the above fence
+ * @sched: scheduler instance to start the worker for
  *
- * Returns true if the dependency can be optimized and false otherwise
+ * Start the timeout for the given scheduler.
  */
-bool drm_sched_dependency_optimized(struct dma_fence* fence,
-				    struct drm_sched_entity *entity)
+static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
 {
-	struct drm_gpu_scheduler *sched = entity->rq->sched;
-	struct drm_sched_fence *s_fence;
+	lockdep_assert_held(&sched->job_list_lock);
 
-	if (!fence || dma_fence_is_signaled(fence))
-		return false;
-	if (fence->context == entity->fence_context)
-		return true;
-	s_fence = to_drm_sched_fence(fence);
-	if (s_fence && s_fence->sched == sched)
-		return true;
+	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
+	    !list_empty(&sched->pending_list))
+		mod_delayed_work(sched->timeout_wq, &sched->work_tdr, sched->timeout);
+}
 
-	return false;
+static void drm_sched_start_timeout_unlocked(struct drm_gpu_scheduler *sched)
+{
+	spin_lock(&sched->job_list_lock);
+	drm_sched_start_timeout(sched);
+	spin_unlock(&sched->job_list_lock);
 }
-EXPORT_SYMBOL(drm_sched_dependency_optimized);
 
 /**
- * drm_sched_start_timeout - start timeout for reset worker
+ * drm_sched_tdr_queue_imm: - immediately start job timeout handler
  *
- * @sched: scheduler instance to start the worker for
+ * @sched: scheduler for which the timeout handling should be started.
  *
- * Start the timeout for the given scheduler.
+ * Start timeout handling immediately for the named scheduler.
  */
-static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
+void drm_sched_tdr_queue_imm(struct drm_gpu_scheduler *sched)
 {
-	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
-	    !list_empty(&sched->pending_list))
-		queue_delayed_work(sched->timeout_wq, &sched->work_tdr, sched->timeout);
+	spin_lock(&sched->job_list_lock);
+	sched->timeout = 0;
+	drm_sched_start_timeout(sched);
+	spin_unlock(&sched->job_list_lock);
 }
+EXPORT_SYMBOL(drm_sched_tdr_queue_imm);
 
 /**
  * drm_sched_fault - immediately start timeout handler
@@ -246,7 +477,8 @@ static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
  */
 void drm_sched_fault(struct drm_gpu_scheduler *sched)
 {
-	mod_delayed_work(sched->timeout_wq, &sched->work_tdr, 0);
+	if (sched->timeout_wq)
+		mod_delayed_work(sched->timeout_wq, &sched->work_tdr, 0);
 }
 EXPORT_SYMBOL(drm_sched_fault);
 
@@ -320,7 +552,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
 
 	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
 
-	/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
+	/* Protects against concurrent deletion in drm_sched_get_finished_job */
 	spin_lock(&sched->job_list_lock);
 	job = list_first_entry_or_null(&sched->pending_list,
 				       struct drm_sched_job, list);
@@ -348,33 +580,9 @@ static void drm_sched_job_timedout(struct work_struct *work)
 		spin_unlock(&sched->job_list_lock);
 	}
 
-	if (status != DRM_GPU_SCHED_STAT_ENODEV) {
-		spin_lock(&sched->job_list_lock);
-		drm_sched_start_timeout(sched);
-		spin_unlock(&sched->job_list_lock);
-	}
-}
-
- /**
-  * drm_sched_increase_karma - Update sched_entity guilty flag
-  *
-  * @bad: The job guilty of time out
-  *
-  * Increment on every hang caused by the 'bad' job. If this exceeds the hang
-  * limit of the scheduler then the respective sched entity is marked guilty and
-  * jobs from it will not be scheduled further
-  */
-void drm_sched_increase_karma(struct drm_sched_job *bad)
-{
-	drm_sched_increase_karma_ext(bad, 1);
+	if (status != DRM_GPU_SCHED_STAT_ENODEV)
+		drm_sched_start_timeout_unlocked(sched);
 }
-EXPORT_SYMBOL(drm_sched_increase_karma);
-
-void drm_sched_reset_karma(struct drm_sched_job *bad)
-{
-	drm_sched_increase_karma_ext(bad, 0);
-}
-EXPORT_SYMBOL(drm_sched_reset_karma);
 
 /**
  * drm_sched_stop - stop the scheduler
@@ -392,13 +600,13 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 {
 	struct drm_sched_job *s_job, *tmp;
 
-	kthread_park(sched->thread);
+	drm_sched_wqueue_stop(sched);
 
 	/*
 	 * Reinsert back the bad job here - now it's safe as
-	 * drm_sched_get_cleanup_job cannot race against us and release the
+	 * drm_sched_get_finished_job cannot race against us and release the
 	 * bad job at this point - we parked (waited for) any in progress
-	 * (earlier) cleanups and drm_sched_get_cleanup_job will not be called
+	 * (earlier) cleanups and drm_sched_get_finished_job will not be called
 	 * now until the scheduler thread is unparked.
 	 */
 	if (bad && bad->sched == sched)
@@ -421,7 +629,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 					      &s_job->cb)) {
 			dma_fence_put(s_job->s_fence->parent);
 			s_job->s_fence->parent = NULL;
-			atomic_dec(&sched->hw_rq_count);
+			atomic_sub(s_job->credits, &sched->credit_count);
 		} else {
 			/*
 			 * remove job from pending_list.
@@ -482,7 +690,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
 		struct dma_fence *fence = s_job->s_fence->parent;
 
-		atomic_inc(&sched->hw_rq_count);
+		atomic_add(s_job->credits, &sched->credit_count);
 
 		if (!full_recovery)
 			continue;
@@ -491,57 +699,48 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 			r = dma_fence_add_callback(fence, &s_job->cb,
 						   drm_sched_job_done_cb);
 			if (r == -ENOENT)
-				drm_sched_job_done(s_job);
+				drm_sched_job_done(s_job, fence->error);
 			else if (r)
 				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
 					  r);
 		} else
-			drm_sched_job_done(s_job);
+			drm_sched_job_done(s_job, -ECANCELED);
 	}
 
-	if (full_recovery) {
-		spin_lock(&sched->job_list_lock);
-		drm_sched_start_timeout(sched);
-		spin_unlock(&sched->job_list_lock);
-	}
+	if (full_recovery)
+		drm_sched_start_timeout_unlocked(sched);
 
-	kthread_unpark(sched->thread);
+	drm_sched_wqueue_start(sched);
 }
 EXPORT_SYMBOL(drm_sched_start);
 
 /**
- * drm_sched_resubmit_jobs - helper to relaunch jobs from the pending list
+ * drm_sched_resubmit_jobs - Deprecated, don't use in new code!
  *
  * @sched: scheduler instance
  *
- */
-void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
-{
-	drm_sched_resubmit_jobs_ext(sched, INT_MAX);
-}
-EXPORT_SYMBOL(drm_sched_resubmit_jobs);
-
-/**
- * drm_sched_resubmit_jobs_ext - helper to relunch certain number of jobs from mirror ring list
+ * Re-submitting jobs was a concept AMD came up as cheap way to implement
+ * recovery after a job timeout.
  *
- * @sched: scheduler instance
- * @max: job numbers to relaunch
+ * This turned out to be not working very well. First of all there are many
+ * problem with the dma_fence implementation and requirements. Either the
+ * implementation is risking deadlocks with core memory management or violating
+ * documented implementation details of the dma_fence object.
  *
+ * Drivers can still save and restore their state for recovery operations, but
+ * we shouldn't make this a general scheduler feature around the dma_fence
+ * interface.
  */
-void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
+void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
 {
 	struct drm_sched_job *s_job, *tmp;
 	uint64_t guilty_context;
 	bool found_guilty = false;
 	struct dma_fence *fence;
-	int i = 0;
 
 	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
 		struct drm_sched_fence *s_fence = s_job->s_fence;
 
-		if (i >= max)
-			break;
-
 		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
 			found_guilty = true;
 			guilty_context = s_job->s_fence->scheduled.context;
@@ -551,7 +750,6 @@ void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
 			dma_fence_set_error(&s_fence->finished, -ECANCELED);
 
 		fence = sched->ops->run_job(s_job);
-		i++;
 
 		if (IS_ERR_OR_NULL(fence)) {
 			if (IS_ERR(fence))
@@ -567,12 +765,14 @@ void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
 		}
 	}
 }
-EXPORT_SYMBOL(drm_sched_resubmit_jobs_ext);
+EXPORT_SYMBOL(drm_sched_resubmit_jobs);
 
 /**
  * drm_sched_job_init - init a scheduler job
  * @job: scheduler job to init
  * @entity: scheduler entity to use
+ * @credits: the number of credits this job contributes to the schedulers
+ * credit limit
  * @owner: job owner for debugging
  *
  * Refer to drm_sched_entity_push_job() documentation
@@ -590,12 +790,24 @@ EXPORT_SYMBOL(drm_sched_resubmit_jobs_ext);
  */
 int drm_sched_job_init(struct drm_sched_job *job,
 		       struct drm_sched_entity *entity,
-		       void *owner)
+		       u32 credits, void *owner)
 {
-	if (!entity->rq)
+	if (!entity->rq) {
+		/* This will most likely be followed by missing frames
+		 * or worse--a blank screen--leave a trail in the
+		 * logs, so this can be debugged easier.
+		 */
+		drm_err(job->sched, "%s: entity has no rq!\n", __func__);
 		return -ENOENT;
+	}
+
+	if (unlikely(!credits)) {
+		pr_err("*ERROR* %s: credits cannot be 0!\n", __func__);
+		return -EINVAL;
+	}
 
 	job->entity = entity;
+	job->credits = credits;
 	job->s_fence = drm_sched_fence_alloc(entity, owner);
 	if (!job->s_fence)
 		return -ENOMEM;
@@ -631,7 +843,7 @@ void drm_sched_job_arm(struct drm_sched_job *job)
 	sched = entity->rq->sched;
 
 	job->sched = sched;
-	job->s_priority = entity->rq - sched->sched_rq;
+	job->s_priority = entity->priority;
 	job->id = atomic64_inc_return(&sched->job_id_count);
 
 	drm_sched_fence_init(job->s_fence, job->entity);
@@ -685,32 +897,56 @@ int drm_sched_job_add_dependency(struct drm_sched_job *job,
 EXPORT_SYMBOL(drm_sched_job_add_dependency);
 
 /**
- * drm_sched_job_add_implicit_dependencies - adds implicit dependencies as job
- *   dependencies
+ * drm_sched_job_add_syncobj_dependency - adds a syncobj's fence as a job dependency
  * @job: scheduler job to add the dependencies to
- * @obj: the gem object to add new dependencies from.
- * @write: whether the job might write the object (so we need to depend on
- * shared fences in the reservation object).
+ * @file: drm file private pointer
+ * @handle: syncobj handle to lookup
+ * @point: timeline point
  *
- * This should be called after drm_gem_lock_reservations() on your array of
- * GEM objects used in the job but before updating the reservations with your
- * own fences.
+ * This adds the fence matching the given syncobj to @job.
  *
  * Returns:
  * 0 on success, or an error on failing to expand the array.
  */
-int drm_sched_job_add_implicit_dependencies(struct drm_sched_job *job,
-					    struct drm_gem_object *obj,
-					    bool write)
+int drm_sched_job_add_syncobj_dependency(struct drm_sched_job *job,
+					 struct drm_file *file,
+					 u32 handle,
+					 u32 point)
+{
+	struct dma_fence *fence;
+	int ret;
+
+	ret = drm_syncobj_find_fence(file, handle, point, 0, &fence);
+	if (ret)
+		return ret;
+
+	return drm_sched_job_add_dependency(job, fence);
+}
+EXPORT_SYMBOL(drm_sched_job_add_syncobj_dependency);
+
+/**
+ * drm_sched_job_add_resv_dependencies - add all fences from the resv to the job
+ * @job: scheduler job to add the dependencies to
+ * @resv: the dma_resv object to get the fences from
+ * @usage: the dma_resv_usage to use to filter the fences
+ *
+ * This adds all fences matching the given usage from @resv to @job.
+ * Must be called with the @resv lock held.
+ *
+ * Returns:
+ * 0 on success, or an error on failing to expand the array.
+ */
+int drm_sched_job_add_resv_dependencies(struct drm_sched_job *job,
+					struct dma_resv *resv,
+					enum dma_resv_usage usage)
 {
 	struct dma_resv_iter cursor;
 	struct dma_fence *fence;
 	int ret;
 
-	dma_resv_assert_held(obj->resv);
+	dma_resv_assert_held(resv);
 
-	dma_resv_for_each_fence(&cursor, obj->resv, dma_resv_usage_rw(write),
-				fence) {
+	dma_resv_for_each_fence(&cursor, resv, usage, fence) {
 		/* Make sure to grab an additional ref on the added fence */
 		dma_fence_get(fence);
 		ret = drm_sched_job_add_dependency(job, fence);
@@ -721,8 +957,31 @@ int drm_sched_job_add_implicit_dependencies(struct drm_sched_job *job,
 	}
 	return 0;
 }
-EXPORT_SYMBOL(drm_sched_job_add_implicit_dependencies);
+EXPORT_SYMBOL(drm_sched_job_add_resv_dependencies);
 
+/**
+ * drm_sched_job_add_implicit_dependencies - adds implicit dependencies as job
+ *   dependencies
+ * @job: scheduler job to add the dependencies to
+ * @obj: the gem object to add new dependencies from.
+ * @write: whether the job might write the object (so we need to depend on
+ * shared fences in the reservation object).
+ *
+ * This should be called after drm_gem_lock_reservations() on your array of
+ * GEM objects used in the job but before updating the reservations with your
+ * own fences.
+ *
+ * Returns:
+ * 0 on success, or an error on failing to expand the array.
+ */
+int drm_sched_job_add_implicit_dependencies(struct drm_sched_job *job,
+					    struct drm_gem_object *obj,
+					    bool write)
+{
+	return drm_sched_job_add_resv_dependencies(job, obj->resv,
+						   dma_resv_usage_rw(write));
+}
+EXPORT_SYMBOL(drm_sched_job_add_implicit_dependencies);
 
 /**
  * drm_sched_job_cleanup - clean up scheduler job resources
@@ -761,28 +1020,17 @@ void drm_sched_job_cleanup(struct drm_sched_job *job)
 EXPORT_SYMBOL(drm_sched_job_cleanup);
 
 /**
- * drm_sched_ready - is the scheduler ready
- *
+ * drm_sched_wakeup - Wake up the scheduler if it is ready to queue
  * @sched: scheduler instance
+ * @entity: the scheduler entity
  *
- * Return true if we can push more jobs to the hw, otherwise false.
+ * Wake up the scheduler if we can queue jobs.
  */
-static bool drm_sched_ready(struct drm_gpu_scheduler *sched)
+void drm_sched_wakeup(struct drm_gpu_scheduler *sched,
+		      struct drm_sched_entity *entity)
 {
-	return atomic_read(&sched->hw_rq_count) <
-		sched->hw_submission_limit;
-}
-
-/**
- * drm_sched_wakeup - Wake up the scheduler when it is ready
- *
- * @sched: scheduler instance
- *
- */
-void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
-{
-	if (drm_sched_ready(sched))
-		wake_up_interruptible(&sched->wake_up_worker);
+	if (drm_sched_can_queue(sched, entity))
+		drm_sched_run_job_queue(sched);
 }
 
 /**
@@ -790,7 +1038,11 @@ void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
  *
  * @sched: scheduler instance
  *
- * Returns the entity to process or NULL if none are found.
+ * Return an entity to process or NULL if none are found.
+ *
+ * Note, that we break out of the for-loop when "entity" is non-null, which can
+ * also be an error-pointer--this assures we don't process lower priority
+ * run-queues. See comments in the respectively called functions.
  */
 static struct drm_sched_entity *
 drm_sched_select_entity(struct drm_gpu_scheduler *sched)
@@ -798,21 +1050,21 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
 	struct drm_sched_entity *entity;
 	int i;
 
-	if (!drm_sched_ready(sched))
-		return NULL;
-
-	/* Kernel run queue has higher priority than normal run queue*/
-	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
-		entity = drm_sched_rq_select_entity(&sched->sched_rq[i]);
+	/* Start with the highest priority.
+	 */
+	for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
+		entity = drm_sched_policy == DRM_SCHED_POLICY_FIFO ?
+			drm_sched_rq_select_entity_fifo(sched, sched->sched_rq[i]) :
+			drm_sched_rq_select_entity_rr(sched, sched->sched_rq[i]);
 		if (entity)
 			break;
 	}
 
-	return entity;
+	return IS_ERR(entity) ? NULL : entity;
 }
 
 /**
- * drm_sched_get_cleanup_job - fetch the next finished job to be destroyed
+ * drm_sched_get_finished_job - fetch the next finished job to be destroyed
  *
  * @sched: scheduler instance
  *
@@ -820,7 +1072,7 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
  * ready for it to be destroyed.
  */
 static struct drm_sched_job *
-drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
+drm_sched_get_finished_job(struct drm_gpu_scheduler *sched)
 {
 	struct drm_sched_job *job, *next;
 
@@ -840,8 +1092,10 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
 						typeof(*next), list);
 
 		if (next) {
-			next->s_fence->scheduled.timestamp =
-				job->s_fence->finished.timestamp;
+			if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT,
+				     &next->s_fence->scheduled.flags))
+				next->s_fence->scheduled.timestamp =
+					dma_fence_timestamp(&job->s_fence->finished);
 			/* start TO timer for next job */
 			drm_sched_start_timeout(sched);
 		}
@@ -891,94 +1145,81 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
 EXPORT_SYMBOL(drm_sched_pick_best);
 
 /**
- * drm_sched_blocked - check if the scheduler is blocked
+ * drm_sched_free_job_work - worker to call free_job
  *
- * @sched: scheduler instance
- *
- * Returns true if blocked, otherwise false.
+ * @w: free job work
  */
-static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
+static void drm_sched_free_job_work(struct work_struct *w)
 {
-	if (kthread_should_park()) {
-		kthread_parkme();
-		return true;
-	}
+	struct drm_gpu_scheduler *sched =
+		container_of(w, struct drm_gpu_scheduler, work_free_job);
+	struct drm_sched_job *job;
 
-	return false;
+	if (READ_ONCE(sched->pause_submit))
+		return;
+
+	job = drm_sched_get_finished_job(sched);
+	if (job)
+		sched->ops->free_job(job);
+
+	drm_sched_run_free_queue(sched);
+	drm_sched_run_job_queue(sched);
 }
 
 /**
- * drm_sched_main - main scheduler thread
- *
- * @param: scheduler instance
+ * drm_sched_run_job_work - worker to call run_job
  *
- * Returns 0.
+ * @w: run job work
  */
-static int drm_sched_main(void *param)
+static void drm_sched_run_job_work(struct work_struct *w)
 {
-	struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
+	struct drm_gpu_scheduler *sched =
+		container_of(w, struct drm_gpu_scheduler, work_run_job);
+	struct drm_sched_entity *entity;
+	struct dma_fence *fence;
+	struct drm_sched_fence *s_fence;
+	struct drm_sched_job *sched_job = NULL;
 	int r;
 
-	sched_set_fifo_low(current);
-
-	while (!kthread_should_stop()) {
-		struct drm_sched_entity *entity = NULL;
-		struct drm_sched_fence *s_fence;
-		struct drm_sched_job *sched_job;
-		struct dma_fence *fence;
-		struct drm_sched_job *cleanup_job = NULL;
-
-		wait_event_interruptible(sched->wake_up_worker,
-					 (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
-					 (!drm_sched_blocked(sched) &&
-					  (entity = drm_sched_select_entity(sched))) ||
-					 kthread_should_stop());
-
-		if (cleanup_job)
-			sched->ops->free_job(cleanup_job);
-
-		if (!entity)
-			continue;
+	if (READ_ONCE(sched->pause_submit))
+		return;
 
+	/* Find entity with a ready job */
+	while (!sched_job && (entity = drm_sched_select_entity(sched))) {
 		sched_job = drm_sched_entity_pop_job(entity);
+		if (!sched_job)
+			complete_all(&entity->entity_idle);
+	}
+	if (!entity)
+		return;	/* No more work */
 
-		if (!sched_job) {
-			complete(&entity->entity_idle);
-			continue;
-		}
-
-		s_fence = sched_job->s_fence;
-
-		atomic_inc(&sched->hw_rq_count);
-		drm_sched_job_begin(sched_job);
-
-		trace_drm_run_job(sched_job, entity);
-		fence = sched->ops->run_job(sched_job);
-		complete(&entity->entity_idle);
-		drm_sched_fence_scheduled(s_fence);
+	s_fence = sched_job->s_fence;
 
-		if (!IS_ERR_OR_NULL(fence)) {
-			s_fence->parent = dma_fence_get(fence);
-			/* Drop for original kref_init of the fence */
-			dma_fence_put(fence);
+	atomic_add(sched_job->credits, &sched->credit_count);
+	drm_sched_job_begin(sched_job);
 
-			r = dma_fence_add_callback(fence, &sched_job->cb,
-						   drm_sched_job_done_cb);
-			if (r == -ENOENT)
-				drm_sched_job_done(sched_job);
-			else if (r)
-				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
-					  r);
-		} else {
-			if (IS_ERR(fence))
-				dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
+	trace_drm_run_job(sched_job, entity);
+	fence = sched->ops->run_job(sched_job);
+	complete_all(&entity->entity_idle);
+	drm_sched_fence_scheduled(s_fence, fence);
 
-			drm_sched_job_done(sched_job);
-		}
+	if (!IS_ERR_OR_NULL(fence)) {
+		/* Drop for original kref_init of the fence */
+		dma_fence_put(fence);
 
-		wake_up(&sched->job_scheduled);
+		r = dma_fence_add_callback(fence, &sched_job->cb,
+					   drm_sched_job_done_cb);
+		if (r == -ENOENT)
+			drm_sched_job_done(sched_job, fence->error);
+		else if (r)
+			DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r);
+	} else {
+		drm_sched_job_done(sched_job, IS_ERR(fence) ?
+				   PTR_ERR(fence) : 0);
 	}
-	return 0;
+
+	wake_up(&sched->job_scheduled);
+	drm_sched_run_job_queue(sched);
 }
 
 /**
@@ -986,7 +1227,10 @@ static int drm_sched_main(void *param)
  *
  * @sched: scheduler instance
  * @ops: backend operations for this scheduler
- * @hw_submission: number of hw submissions that can be in flight
+ * @submit_wq: workqueue to use for submission. If NULL, an ordered wq is
+ *	       allocated and used
+ * @num_rqs: number of runqueues, one for each priority, up to DRM_SCHED_PRIORITY_COUNT
+ * @credit_limit: the number of credits this scheduler can hold from all jobs
  * @hang_limit: number of times to allow a job to hang before dropping it
  * @timeout: timeout value in jiffies for the scheduler
  * @timeout_wq: workqueue to use for timeout work. If NULL, the system_wq is
@@ -999,42 +1243,84 @@ static int drm_sched_main(void *param)
  */
 int drm_sched_init(struct drm_gpu_scheduler *sched,
 		   const struct drm_sched_backend_ops *ops,
-		   unsigned hw_submission, unsigned hang_limit,
+		   struct workqueue_struct *submit_wq,
+		   u32 num_rqs, u32 credit_limit, unsigned int hang_limit,
 		   long timeout, struct workqueue_struct *timeout_wq,
 		   atomic_t *score, const char *name, struct device *dev)
 {
-	int i, ret;
+	int i;
+
 	sched->ops = ops;
-	sched->hw_submission_limit = hw_submission;
+	sched->credit_limit = credit_limit;
 	sched->name = name;
 	sched->timeout = timeout;
 	sched->timeout_wq = timeout_wq ? : system_wq;
 	sched->hang_limit = hang_limit;
 	sched->score = score ? score : &sched->_score;
 	sched->dev = dev;
-	for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++)
-		drm_sched_rq_init(sched, &sched->sched_rq[i]);
 
-	init_waitqueue_head(&sched->wake_up_worker);
+	if (num_rqs > DRM_SCHED_PRIORITY_COUNT) {
+		/* This is a gross violation--tell drivers what the  problem is.
+		 */
+		drm_err(sched, "%s: num_rqs cannot be greater than DRM_SCHED_PRIORITY_COUNT\n",
+			__func__);
+		return -EINVAL;
+	} else if (sched->sched_rq) {
+		/* Not an error, but warn anyway so drivers can
+		 * fine-tune their DRM calling order, and return all
+		 * is good.
+		 */
+		drm_warn(sched, "%s: scheduler already initialized!\n", __func__);
+		return 0;
+	}
+
+	if (submit_wq) {
+		sched->submit_wq = submit_wq;
+		sched->own_submit_wq = false;
+	} else {
+		sched->submit_wq = alloc_ordered_workqueue(name, 0);
+		if (!sched->submit_wq)
+			return -ENOMEM;
+
+		sched->own_submit_wq = true;
+	}
+
+	sched->sched_rq = kmalloc_array(num_rqs, sizeof(*sched->sched_rq),
+					GFP_KERNEL | __GFP_ZERO);
+	if (!sched->sched_rq)
+		goto Out_check_own;
+	sched->num_rqs = num_rqs;
+	for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
+		sched->sched_rq[i] = kzalloc(sizeof(*sched->sched_rq[i]), GFP_KERNEL);
+		if (!sched->sched_rq[i])
+			goto Out_unroll;
+		drm_sched_rq_init(sched, sched->sched_rq[i]);
+	}
+
 	init_waitqueue_head(&sched->job_scheduled);
 	INIT_LIST_HEAD(&sched->pending_list);
 	spin_lock_init(&sched->job_list_lock);
-	atomic_set(&sched->hw_rq_count, 0);
+	atomic_set(&sched->credit_count, 0);
 	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
+	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
+	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
 	atomic_set(&sched->_score, 0);
 	atomic64_set(&sched->job_id_count, 0);
-
-	/* Each scheduler will run on a seperate kernel thread */
-	sched->thread = kthread_run(drm_sched_main, sched, sched->name);
-	if (IS_ERR(sched->thread)) {
-		ret = PTR_ERR(sched->thread);
-		sched->thread = NULL;
-		DRM_DEV_ERROR(sched->dev, "Failed to create scheduler for %s.\n", name);
-		return ret;
-	}
+	sched->pause_submit = false;
 
 	sched->ready = true;
 	return 0;
+Out_unroll:
+	for (--i ; i >= DRM_SCHED_PRIORITY_KERNEL; i--)
+		kfree(sched->sched_rq[i]);
+
+	kfree(sched->sched_rq);
+	sched->sched_rq = NULL;
+Out_check_own:
+	if (sched->own_submit_wq)
+		destroy_workqueue(sched->submit_wq);
+	drm_err(sched, "%s: Failed to setup GPU scheduler--out of memory\n", __func__);
+	return -ENOMEM;
 }
 EXPORT_SYMBOL(drm_sched_init);
 
@@ -1050,14 +1336,10 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
 	struct drm_sched_entity *s_entity;
 	int i;
 
-	if (sched->thread)
-		kthread_stop(sched->thread);
+	drm_sched_wqueue_stop(sched);
 
-	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
-		struct drm_sched_rq *rq = &sched->sched_rq[i];
-
-		if (!rq)
-			continue;
+	for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
+		struct drm_sched_rq *rq = sched->sched_rq[i];
 
 		spin_lock(&rq->lock);
 		list_for_each_entry(s_entity, &rq->entities, list)
@@ -1068,7 +1350,7 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
 			 */
 			s_entity->stopped = true;
 		spin_unlock(&rq->lock);
-
+		kfree(sched->sched_rq[i]);
 	}
 
 	/* Wakeup everyone stuck in drm_sched_entity_flush for this scheduler */
@@ -1077,18 +1359,24 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
 	/* Confirm no work left behind accessing device structures */
 	cancel_delayed_work_sync(&sched->work_tdr);
 
+	if (sched->own_submit_wq)
+		destroy_workqueue(sched->submit_wq);
 	sched->ready = false;
+	kfree(sched->sched_rq);
+	sched->sched_rq = NULL;
 }
 EXPORT_SYMBOL(drm_sched_fini);
 
 /**
- * drm_sched_increase_karma_ext - Update sched_entity guilty flag
+ * drm_sched_increase_karma - Update sched_entity guilty flag
  *
  * @bad: The job guilty of time out
- * @type: type for increase/reset karma
  *
+ * Increment on every hang caused by the 'bad' job. If this exceeds the hang
+ * limit of the scheduler then the respective sched entity is marked guilty and
+ * jobs from it will not be scheduled further
  */
-void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
+void drm_sched_increase_karma(struct drm_sched_job *bad)
 {
 	int i;
 	struct drm_sched_entity *tmp;
@@ -1100,21 +1388,17 @@ void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
 	 * corrupt but keep in mind that kernel jobs always considered good.
 	 */
 	if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
-		if (type == 0)
-			atomic_set(&bad->karma, 0);
-		else if (type == 1)
-			atomic_inc(&bad->karma);
+		atomic_inc(&bad->karma);
 
-		for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
-		     i++) {
-			struct drm_sched_rq *rq = &sched->sched_rq[i];
+		for (i = DRM_SCHED_PRIORITY_HIGH; i < sched->num_rqs; i++) {
+			struct drm_sched_rq *rq = sched->sched_rq[i];
 
 			spin_lock(&rq->lock);
 			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
 				if (bad->s_fence->scheduled.context ==
 				    entity->fence_context) {
 					if (entity->guilty)
-						atomic_set(entity->guilty, type);
+						atomic_set(entity->guilty, 1);
 					break;
 				}
 			}
@@ -1124,4 +1408,43 @@ void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
 		}
 	}
 }
-EXPORT_SYMBOL(drm_sched_increase_karma_ext);
+EXPORT_SYMBOL(drm_sched_increase_karma);
+
+/**
+ * drm_sched_wqueue_ready - Is the scheduler ready for submission
+ *
+ * @sched: scheduler instance
+ *
+ * Returns true if submission is ready
+ */
+bool drm_sched_wqueue_ready(struct drm_gpu_scheduler *sched)
+{
+	return sched->ready;
+}
+EXPORT_SYMBOL(drm_sched_wqueue_ready);
+
+/**
+ * drm_sched_wqueue_stop - stop scheduler submission
+ *
+ * @sched: scheduler instance
+ */
+void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched)
+{
+	WRITE_ONCE(sched->pause_submit, true);
+	cancel_work_sync(&sched->work_run_job);
+	cancel_work_sync(&sched->work_free_job);
+}
+EXPORT_SYMBOL(drm_sched_wqueue_stop);
+
+/**
+ * drm_sched_wqueue_start - start scheduler submission
+ *
+ * @sched: scheduler instance
+ */
+void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched)
+{
+	WRITE_ONCE(sched->pause_submit, false);
+	queue_work(sched->submit_wq, &sched->work_run_job);
+	queue_work(sched->submit_wq, &sched->work_free_job);
+}
+EXPORT_SYMBOL(drm_sched_wqueue_start);
diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index a1f909dac89a7..80f2078607752 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -891,8 +891,6 @@ static const struct drm_driver tegra_drm_driver = {
 	.debugfs_init = tegra_debugfs_init,
 #endif
 
-	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
 	.gem_prime_import = tegra_gem_prime_import,
 
 	.dumb_create = tegra_bo_dumb_create,
diff --git a/drivers/gpu/drm/v3d/v3d_drv.c b/drivers/gpu/drm/v3d/v3d_drv.c
index e8c975b815859..9433a6bf6edf1 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.c
+++ b/drivers/gpu/drm/v3d/v3d_drv.c
@@ -172,8 +172,6 @@ static const struct drm_driver v3d_drm_driver = {
 #endif
 
 	.gem_create_object = v3d_create_object,
-	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
 	.gem_prime_import_sg_table = v3d_prime_import_sg_table,
 	.gem_prime_mmap = drm_gem_prime_mmap,
 
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
index b8980440d137f..1ea266f9a4d66 100644
--- a/drivers/gpu/drm/v3d/v3d_gem.c
+++ b/drivers/gpu/drm/v3d/v3d_gem.c
@@ -473,7 +473,7 @@ v3d_job_init(struct v3d_dev *v3d, struct drm_file *file_priv,
 	job->free = free;
 
 	ret = drm_sched_job_init(&job->base, &v3d_priv->sched_entity[queue],
-				 v3d_priv);
+				 1, v3d_priv);
 	if (ret)
 		goto fail;
 
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index 06238e6d7f5cd..0b6696b0d882f 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -388,7 +388,8 @@ v3d_sched_init(struct v3d_dev *v3d)
 	int ret;
 
 	ret = drm_sched_init(&v3d->queue[V3D_BIN].sched,
-			     &v3d_bin_sched_ops,
+			     &v3d_bin_sched_ops, NULL,
+			     DRM_SCHED_PRIORITY_COUNT,
 			     hw_jobs_limit, job_hang_limit,
 			     msecs_to_jiffies(hang_limit_ms), NULL,
 			     NULL, "v3d_bin", v3d->drm.dev);
@@ -396,7 +397,8 @@ v3d_sched_init(struct v3d_dev *v3d)
 		return ret;
 
 	ret = drm_sched_init(&v3d->queue[V3D_RENDER].sched,
-			     &v3d_render_sched_ops,
+			     &v3d_render_sched_ops, NULL,
+			     DRM_SCHED_PRIORITY_COUNT,
 			     hw_jobs_limit, job_hang_limit,
 			     msecs_to_jiffies(hang_limit_ms), NULL,
 			     NULL, "v3d_render", v3d->drm.dev);
@@ -404,7 +406,8 @@ v3d_sched_init(struct v3d_dev *v3d)
 		goto fail;
 
 	ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
-			     &v3d_tfu_sched_ops,
+			     &v3d_tfu_sched_ops, NULL,
+			     DRM_SCHED_PRIORITY_COUNT,
 			     hw_jobs_limit, job_hang_limit,
 			     msecs_to_jiffies(hang_limit_ms), NULL,
 			     NULL, "v3d_tfu", v3d->drm.dev);
@@ -413,7 +416,8 @@ v3d_sched_init(struct v3d_dev *v3d)
 
 	if (v3d_has_csd(v3d)) {
 		ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
-				     &v3d_csd_sched_ops,
+				     &v3d_csd_sched_ops, NULL,
+				     DRM_SCHED_PRIORITY_COUNT,
 				     hw_jobs_limit, job_hang_limit,
 				     msecs_to_jiffies(hang_limit_ms), NULL,
 				     NULL, "v3d_csd", v3d->drm.dev);
@@ -421,7 +425,8 @@ v3d_sched_init(struct v3d_dev *v3d)
 			goto fail;
 
 		ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
-				     &v3d_cache_clean_sched_ops,
+				     &v3d_cache_clean_sched_ops, NULL,
+				     DRM_SCHED_PRIORITY_COUNT,
 				     hw_jobs_limit, job_hang_limit,
 				     msecs_to_jiffies(hang_limit_ms), NULL,
 				     NULL, "v3d_cache_clean", v3d->drm.dev);
diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.c b/drivers/gpu/drm/virtio/virtgpu_drv.c
index 0035affc3e590..314dbb5e7fbe6 100644
--- a/drivers/gpu/drm/virtio/virtgpu_drv.c
+++ b/drivers/gpu/drm/virtio/virtgpu_drv.c
@@ -181,8 +181,6 @@ static const struct drm_driver driver = {
 #if defined(CONFIG_DEBUG_FS)
 	.debugfs_init = virtio_gpu_debugfs_init,
 #endif
-	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
 	.gem_prime_mmap = drm_gem_prime_mmap,
 	.gem_prime_import = virtgpu_gem_prime_import,
 	.gem_prime_import_sg_table = virtgpu_gem_prime_import_sg_table,
diff --git a/drivers/gpu/drm/xen/xen_drm_front.c b/drivers/gpu/drm/xen/xen_drm_front.c
index 0d8e6bd1ccbf2..a37224a2da538 100644
--- a/drivers/gpu/drm/xen/xen_drm_front.c
+++ b/drivers/gpu/drm/xen/xen_drm_front.c
@@ -474,8 +474,6 @@ DEFINE_DRM_GEM_FOPS(xen_drm_dev_fops);
 static const struct drm_driver xen_drm_driver = {
 	.driver_features           = DRIVER_GEM | DRIVER_MODESET | DRIVER_ATOMIC,
 	.release                   = xen_drm_drv_release,
-	.prime_handle_to_fd        = drm_gem_prime_handle_to_fd,
-	.prime_fd_to_handle        = drm_gem_prime_fd_to_handle,
 	.gem_prime_import_sg_table = xen_drm_front_gem_import_sg_table,
 	.gem_prime_mmap            = drm_gem_prime_mmap,
 	.dumb_create               = xen_drm_drv_dumb_create,
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 0ba817e863465..367ac91723952 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -188,20 +188,28 @@ static dma_addr_t __arm_lpae_dma_addr(void *pages)
 }
 
 static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp,
-				    struct io_pgtable_cfg *cfg)
+				    struct io_pgtable_cfg *cfg,
+				    void *cookie)
 {
 	struct device *dev = cfg->iommu_dev;
 	int order = get_order(size);
-	struct page *p;
 	dma_addr_t dma;
 	void *pages;
 
 	VM_BUG_ON((gfp & __GFP_HIGHMEM));
-	p = alloc_pages_node(dev_to_node(dev), gfp | __GFP_ZERO, order);
-	if (!p)
+
+	if (cfg->alloc) {
+		pages = cfg->alloc(cookie, size, gfp);
+	} else {
+		struct page *p;
+
+		p = alloc_pages_node(dev_to_node(dev), gfp | __GFP_ZERO, order);
+		pages = p ? page_address(p) : NULL;
+	}
+
+	if (!pages)
 		return NULL;
 
-	pages = page_address(p);
 	if (!cfg->coherent_walk) {
 		dma = dma_map_single(dev, pages, size, DMA_TO_DEVICE);
 		if (dma_mapping_error(dev, dma))
@@ -220,18 +228,28 @@ static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp,
 out_unmap:
 	dev_err(dev, "Cannot accommodate DMA translation for IOMMU page tables\n");
 	dma_unmap_single(dev, dma, size, DMA_TO_DEVICE);
+
 out_free:
-	__free_pages(p, order);
+	if (cfg->free)
+		cfg->free(cookie, pages, size);
+	else
+		free_pages((unsigned long)pages, order);
+
 	return NULL;
 }
 
 static void __arm_lpae_free_pages(void *pages, size_t size,
-				  struct io_pgtable_cfg *cfg)
+				  struct io_pgtable_cfg *cfg,
+				  void *cookie)
 {
 	if (!cfg->coherent_walk)
 		dma_unmap_single(cfg->iommu_dev, __arm_lpae_dma_addr(pages),
 				 size, DMA_TO_DEVICE);
-	free_pages((unsigned long)pages, get_order(size));
+
+	if (cfg->free)
+		cfg->free(cookie, pages, size);
+	else
+		free_pages((unsigned long)pages, get_order(size));
 }
 
 static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries,
@@ -373,13 +391,13 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
 	/* Grab a pointer to the next level */
 	pte = READ_ONCE(*ptep);
 	if (!pte) {
-		cptep = __arm_lpae_alloc_pages(tblsz, gfp, cfg);
+		cptep = __arm_lpae_alloc_pages(tblsz, gfp, cfg, data->iop.cookie);
 		if (!cptep)
 			return -ENOMEM;
 
 		pte = arm_lpae_install_table(cptep, ptep, 0, data);
 		if (pte)
-			__arm_lpae_free_pages(cptep, tblsz, cfg);
+			__arm_lpae_free_pages(cptep, tblsz, cfg, data->iop.cookie);
 	} else if (!cfg->coherent_walk && !(pte & ARM_LPAE_PTE_SW_SYNC)) {
 		__arm_lpae_sync_pte(ptep, 1, cfg);
 	}
@@ -531,7 +549,7 @@ static void __arm_lpae_free_pgtable(struct arm_lpae_io_pgtable *data, int lvl,
 		__arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
 	}
 
-	__arm_lpae_free_pages(start, table_size, &data->iop.cfg);
+	__arm_lpae_free_pages(start, table_size, &data->iop.cfg, data->iop.cookie);
 }
 
 static void arm_lpae_free_pgtable(struct io_pgtable *iop)
@@ -559,7 +577,7 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
 	if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
 		return 0;
 
-	tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg);
+	tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg, data->iop.cookie);
 	if (!tablep)
 		return 0; /* Bytes unmapped */
 
@@ -582,7 +600,7 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
 
 	pte = arm_lpae_install_table(tablep, ptep, blk_pte, data);
 	if (pte != blk_pte) {
-		__arm_lpae_free_pages(tablep, tablesz, cfg);
+		__arm_lpae_free_pages(tablep, tablesz, cfg, data->iop.cookie);
 		/*
 		 * We may race against someone unmapping another part of this
 		 * block, but anything else is invalid. We can't misinterpret
@@ -897,7 +915,7 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
 
 	/* Looking good; allocate a pgd */
 	data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data),
-					   GFP_KERNEL, cfg);
+					   GFP_KERNEL, cfg, cookie);
 	if (!data->pgd)
 		goto out_free_data;
 
@@ -999,7 +1017,7 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
 
 	/* Allocate pgd pages */
 	data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data),
-					   GFP_KERNEL, cfg);
+					   GFP_KERNEL, cfg, cookie);
 	if (!data->pgd)
 		goto out_free_data;
 
@@ -1074,7 +1092,7 @@ arm_mali_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
 		 << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV));
 
 	data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data), GFP_KERNEL,
-					   cfg);
+					   cfg, cookie);
 	if (!data->pgd)
 		goto out_free_data;
 
@@ -1095,26 +1113,31 @@ arm_mali_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
 }
 
 struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns = {
+	.caps	= IO_PGTABLE_CAP_CUSTOM_ALLOCATOR,
 	.alloc	= arm_64_lpae_alloc_pgtable_s1,
 	.free	= arm_lpae_free_pgtable,
 };
 
 struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns = {
+	.caps	= IO_PGTABLE_CAP_CUSTOM_ALLOCATOR,
 	.alloc	= arm_64_lpae_alloc_pgtable_s2,
 	.free	= arm_lpae_free_pgtable,
 };
 
 struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns = {
+	.caps	= IO_PGTABLE_CAP_CUSTOM_ALLOCATOR,
 	.alloc	= arm_32_lpae_alloc_pgtable_s1,
 	.free	= arm_lpae_free_pgtable,
 };
 
 struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns = {
+	.caps	= IO_PGTABLE_CAP_CUSTOM_ALLOCATOR,
 	.alloc	= arm_32_lpae_alloc_pgtable_s2,
 	.free	= arm_lpae_free_pgtable,
 };
 
 struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns = {
+	.caps	= IO_PGTABLE_CAP_CUSTOM_ALLOCATOR,
 	.alloc	= arm_mali_lpae_alloc_pgtable,
 	.free	= arm_lpae_free_pgtable,
 };
diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
index b843fcd365d28..8841c1487f004 100644
--- a/drivers/iommu/io-pgtable.c
+++ b/drivers/iommu/io-pgtable.c
@@ -34,6 +34,26 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
 #endif
 };
 
+static int check_custom_allocator(enum io_pgtable_fmt fmt,
+				  struct io_pgtable_cfg *cfg)
+{
+	/* No custom allocator, no need to check the format. */
+	if (!cfg->alloc && !cfg->free)
+		return 0;
+
+	/* When passing a custom allocator, both the alloc and free
+	 * functions should be provided.
+	 */
+	if (!cfg->alloc || !cfg->free)
+		return -EINVAL;
+
+	/* Make sure the format supports custom allocators. */
+	if (io_pgtable_init_table[fmt]->caps & IO_PGTABLE_CAP_CUSTOM_ALLOCATOR)
+		return 0;
+
+	return -EINVAL;
+}
+
 struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
 					    struct io_pgtable_cfg *cfg,
 					    void *cookie)
@@ -44,6 +64,9 @@ struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
 	if (fmt >= IO_PGTABLE_NUM_FMTS)
 		return NULL;
 
+	if (check_custom_allocator(fmt, cfg))
+		return NULL;
+
 	fns = io_pgtable_init_table[fmt];
 	if (!fns)
 		return NULL;
diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
index f6159acb88569..f20c7374e4d9e 100644
--- a/include/drm/drm_drv.h
+++ b/include/drm/drm_drv.h
@@ -94,6 +94,12 @@ enum drm_driver_feature {
 	 * synchronization of command submission.
 	 */
 	DRIVER_SYNCOBJ_TIMELINE         = BIT(6),
+	/**
+	 * @DRIVER_GEM_GPUVA:
+	 *
+	 * Driver supports user defined GPU VA bindings for GEM objects.
+	 */
+	DRIVER_GEM_GPUVA		= BIT(8),
 
 	/* IMPORTANT: Below are all the legacy flags, add new ones above. */
 
@@ -301,22 +307,14 @@ struct drm_driver {
 	/**
 	 * @prime_handle_to_fd:
 	 *
-	 * Main PRIME export function. Should be implemented with
-	 * drm_gem_prime_handle_to_fd() for GEM based drivers.
-	 *
-	 * For an in-depth discussion see :ref:`PRIME buffer sharing
-	 * documentation <prime_buffer_sharing>`.
+	 * PRIME export function. Only used by vmwgfx.
 	 */
 	int (*prime_handle_to_fd)(struct drm_device *dev, struct drm_file *file_priv,
 				uint32_t handle, uint32_t flags, int *prime_fd);
 	/**
 	 * @prime_fd_to_handle:
 	 *
-	 * Main PRIME import function. Should be implemented with
-	 * drm_gem_prime_fd_to_handle() for GEM based drivers.
-	 *
-	 * For an in-depth discussion see :ref:`PRIME buffer sharing
-	 * documentation <prime_buffer_sharing>`.
+	 * PRIME import function. Only used by vmwgfx.
 	 */
 	int (*prime_fd_to_handle)(struct drm_device *dev, struct drm_file *file_priv,
 				int prime_fd, uint32_t *handle);
diff --git a/include/drm/drm_exec.h b/include/drm/drm_exec.h
new file mode 100644
index 0000000000000..8069d57672d72
--- /dev/null
+++ b/include/drm/drm_exec.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+
+#ifndef __DRM_EXEC_H__
+#define __DRM_EXEC_H__
+
+#include <linux/ww_mutex.h>
+
+#define DRM_EXEC_INTERRUPTIBLE_WAIT	BIT(0)
+#define DRM_EXEC_IGNORE_DUPLICATES	BIT(1)
+
+struct drm_gem_object;
+
+/**
+ * struct drm_exec - Execution context
+ */
+struct drm_exec {
+	/**
+	 * @flags: Flags to control locking behavior
+	 */
+	uint32_t		flags;
+
+	/**
+	 * @ticket: WW ticket used for acquiring locks
+	 */
+	struct ww_acquire_ctx	ticket;
+
+	/**
+	 * @num_objects: number of objects locked
+	 */
+	unsigned int		num_objects;
+
+	/**
+	 * @max_objects: maximum objects in array
+	 */
+	unsigned int		max_objects;
+
+	/**
+	 * @objects: array of the locked objects
+	 */
+	struct drm_gem_object	**objects;
+
+	/**
+	 * @contended: contended GEM object we backed off for
+	 */
+	struct drm_gem_object	*contended;
+
+	/**
+	 * @prelocked: already locked GEM object due to contention
+	 */
+	struct drm_gem_object *prelocked;
+};
+
+/**
+ * drm_exec_obj() - Return the object for a give drm_exec index
+ * @exec: Pointer to the drm_exec context
+ * @index: The index.
+ *
+ * Return: Pointer to the locked object corresponding to @index if
+ * index is within the number of locked objects. NULL otherwise.
+ */
+static inline struct drm_gem_object *
+drm_exec_obj(struct drm_exec *exec, unsigned long index)
+{
+	return index < exec->num_objects ? exec->objects[index] : NULL;
+}
+
+/**
+ * drm_exec_for_each_locked_object - iterate over all the locked objects
+ * @exec: drm_exec object
+ * @index: unsigned long index for the iteration
+ * @obj: the current GEM object
+ *
+ * Iterate over all the locked GEM objects inside the drm_exec object.
+ */
+#define drm_exec_for_each_locked_object(exec, index, obj)		\
+	for ((index) = 0; ((obj) = drm_exec_obj(exec, index)); ++(index))
+
+/**
+ * drm_exec_for_each_locked_object_reverse - iterate over all the locked
+ * objects in reverse locking order
+ * @exec: drm_exec object
+ * @index: unsigned long index for the iteration
+ * @obj: the current GEM object
+ *
+ * Iterate over all the locked GEM objects inside the drm_exec object in
+ * reverse locking order. Note that @index may go below zero and wrap,
+ * but that will be caught by drm_exec_obj(), returning a NULL object.
+ */
+#define drm_exec_for_each_locked_object_reverse(exec, index, obj)	\
+	for ((index) = (exec)->num_objects - 1;				\
+	     ((obj) = drm_exec_obj(exec, index)); --(index))
+
+/**
+ * drm_exec_until_all_locked - loop until all GEM objects are locked
+ * @exec: drm_exec object
+ *
+ * Core functionality of the drm_exec object. Loops until all GEM objects are
+ * locked and no more contention exists. At the beginning of the loop it is
+ * guaranteed that no GEM object is locked.
+ *
+ * Since labels can't be defined local to the loops body we use a jump pointer
+ * to make sure that the retry is only used from within the loops body.
+ */
+#define drm_exec_until_all_locked(exec)				\
+	for (void *__drm_exec_retry_ptr; ({			\
+		__label__ __drm_exec_retry;			\
+__drm_exec_retry:						\
+		__drm_exec_retry_ptr = &&__drm_exec_retry;	\
+		(void)__drm_exec_retry_ptr;			\
+		drm_exec_cleanup(exec);				\
+	});)
+
+/**
+ * drm_exec_retry_on_contention - restart the loop to grap all locks
+ * @exec: drm_exec object
+ *
+ * Control flow helper to continue when a contention was detected and we need to
+ * clean up and re-start the loop to prepare all GEM objects.
+ */
+#define drm_exec_retry_on_contention(exec)			\
+	do {							\
+		if (unlikely(drm_exec_is_contended(exec)))	\
+			goto *__drm_exec_retry_ptr;		\
+	} while (0)
+
+/**
+ * drm_exec_is_contended - check for contention
+ * @exec: drm_exec object
+ *
+ * Returns true if the drm_exec object has run into some contention while
+ * locking a GEM object and needs to clean up.
+ */
+static inline bool drm_exec_is_contended(struct drm_exec *exec)
+{
+	return !!exec->contended;
+}
+
+void drm_exec_init(struct drm_exec *exec, uint32_t flags, unsigned nr);
+void drm_exec_fini(struct drm_exec *exec);
+bool drm_exec_cleanup(struct drm_exec *exec);
+int drm_exec_lock_obj(struct drm_exec *exec, struct drm_gem_object *obj);
+void drm_exec_unlock_obj(struct drm_exec *exec, struct drm_gem_object *obj);
+int drm_exec_prepare_obj(struct drm_exec *exec, struct drm_gem_object *obj,
+			 unsigned int num_fences);
+int drm_exec_prepare_array(struct drm_exec *exec,
+			   struct drm_gem_object **objects,
+			   unsigned int num_objects,
+			   unsigned int num_fences);
+
+#endif
diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
index 60b2dda8d964b..83afab00e11f4 100644
--- a/include/drm/drm_gem.h
+++ b/include/drm/drm_gem.h
@@ -36,6 +36,8 @@
 
 #include <linux/kref.h>
 #include <linux/dma-resv.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
 
 #include <drm/drm_vma_manager.h>
 
@@ -337,6 +339,22 @@ struct drm_gem_object {
 	 */
 	struct dma_resv _resv;
 
+	/**
+	 * @gpuva:
+	 *
+	 * Provides the list of GPU VAs attached to this GEM object.
+	 *
+	 * Drivers should lock list accesses with the GEMs &dma_resv lock
+	 * (&drm_gem_object.resv) or a custom lock if one is provided.
+	 */
+	struct {
+		struct list_head list;
+
+#ifdef CONFIG_LOCKDEP
+		struct lockdep_map *lock_dep_map;
+#endif
+	} gpuva;
+
 	/**
 	 * @funcs:
 	 *
@@ -405,6 +423,7 @@ int drm_gem_object_init(struct drm_device *dev,
 			struct drm_gem_object *obj, size_t size);
 void drm_gem_private_object_init(struct drm_device *dev,
 				 struct drm_gem_object *obj, size_t size);
+void drm_gem_private_object_fini(struct drm_gem_object *obj);
 void drm_gem_vm_open(struct vm_area_struct *vma);
 void drm_gem_vm_close(struct vm_area_struct *vma);
 int drm_gem_mmap_obj(struct drm_gem_object *obj, unsigned long obj_size,
@@ -457,6 +476,9 @@ struct page **drm_gem_get_pages(struct drm_gem_object *obj);
 void drm_gem_put_pages(struct drm_gem_object *obj, struct page **pages,
 		bool dirty, bool accessed);
 
+int drm_gem_vmap_unlocked(struct drm_gem_object *obj, struct iosys_map *map);
+void drm_gem_vunmap_unlocked(struct drm_gem_object *obj, struct iosys_map *map);
+
 int drm_gem_objects_lookup(struct drm_file *filp, void __user *bo_handles,
 			   int count, struct drm_gem_object ***objs_out);
 struct drm_gem_object *drm_gem_object_lookup(struct drm_file *filp, u32 handle);
@@ -477,4 +499,65 @@ unsigned long drm_gem_lru_scan(struct drm_gem_lru *lru,
 			       unsigned long *remaining,
 			       bool (*shrink)(struct drm_gem_object *obj));
 
+#ifdef CONFIG_LOCKDEP
+/**
+ * drm_gem_gpuva_set_lock() - Set the lock protecting accesses to the gpuva list.
+ * @obj: the &drm_gem_object
+ * @lock: the lock used to protect the gpuva list. The locking primitive
+ * must contain a dep_map field.
+ *
+ * Call this if you're not proctecting access to the gpuva list
+ * with the dma-resv lock, otherwise, drm_gem_gpuva_init() takes care
+ * of initializing lock_dep_map for you.
+ */
+#define drm_gem_gpuva_set_lock(obj, lock) \
+	if (!(obj)->gpuva.lock_dep_map) \
+		(obj)->gpuva.lock_dep_map = &(lock)->dep_map
+#define drm_gem_gpuva_assert_lock_held(obj) \
+	lockdep_assert(lock_is_held((obj)->gpuva.lock_dep_map))
+#else
+#define drm_gem_gpuva_set_lock(obj, lock) do {} while (0)
+#define drm_gem_gpuva_assert_lock_held(obj) do {} while (0)
+#endif
+
+/**
+ * drm_gem_gpuva_init() - initialize the gpuva list of a GEM object
+ * @obj: the &drm_gem_object
+ *
+ * This initializes the &drm_gem_object's &drm_gpuvm_bo list.
+ *
+ * Calling this function is only necessary for drivers intending to support the
+ * &drm_driver_feature DRIVER_GEM_GPUVA.
+ */
+static inline void drm_gem_gpuva_init(struct drm_gem_object *obj)
+{
+	INIT_LIST_HEAD(&obj->gpuva.list);
+	drm_gem_gpuva_set_lock(obj, &obj->resv->lock.base);
+}
+
+/**
+ * drm_gem_for_each_gpuvm_bo() - iterator to walk over a list of &drm_gpuvm_bo
+ * @entry__: &drm_gpuvm_bo structure to assign to in each iteration step
+ * @obj__: the &drm_gem_object the &drm_gpuvm_bo to walk are associated with
+ *
+ * This iterator walks over all &drm_gpuvm_bo structures associated with the
+ * &drm_gem_object.
+ */
+#define drm_gem_for_each_gpuvm_bo(entry__, obj__) \
+	list_for_each_entry(entry__, &(obj__)->gpuva.list, list.entry.gem)
+
+/**
+ * drm_gem_for_each_gpuvm_bo_safe() - iterator to safely walk over a list of
+ * &drm_gpuvm_bo
+ * @entry__: &drm_gpuvm_bostructure to assign to in each iteration step
+ * @next__: &next &drm_gpuvm_bo to store the next step
+ * @obj__: the &drm_gem_object the &drm_gpuvm_bo to walk are associated with
+ *
+ * This iterator walks over all &drm_gpuvm_bo structures associated with the
+ * &drm_gem_object. It is implemented with list_for_each_entry_safe(), hence
+ * it is save against removal of elements.
+ */
+#define drm_gem_for_each_gpuvm_bo_safe(entry__, next__, obj__) \
+	list_for_each_entry_safe(entry__, next__, &(obj__)->gpuva.list, list.entry.gem)
+
 #endif /* __DRM_GEM_H__ */
diff --git a/include/drm/drm_gem_dma_helper.h b/include/drm/drm_gem_dma_helper.h
index 8a043235dad81..f0b4e3d3d57de 100644
--- a/include/drm/drm_gem_dma_helper.h
+++ b/include/drm/drm_gem_dma_helper.h
@@ -167,8 +167,6 @@ drm_gem_dma_prime_import_sg_table(struct drm_device *dev,
  */
 #define DRM_GEM_DMA_DRIVER_OPS_WITH_DUMB_CREATE(dumb_create_func) \
 	.dumb_create		= (dumb_create_func), \
-	.prime_handle_to_fd	= drm_gem_prime_handle_to_fd, \
-	.prime_fd_to_handle	= drm_gem_prime_fd_to_handle, \
 	.gem_prime_import_sg_table = drm_gem_dma_prime_import_sg_table, \
 	.gem_prime_mmap		= drm_gem_prime_mmap
 
diff --git a/include/drm/drm_gem_shmem_helper.h b/include/drm/drm_gem_shmem_helper.h
index a2201b2488c56..3b055d238584c 100644
--- a/include/drm/drm_gem_shmem_helper.h
+++ b/include/drm/drm_gem_shmem_helper.h
@@ -26,11 +26,6 @@ struct drm_gem_shmem_object {
 	 */
 	struct drm_gem_object base;
 
-	/**
-	 * @pages_lock: Protects the page table and use count
-	 */
-	struct mutex pages_lock;
-
 	/**
 	 * @pages: Page table
 	 */
@@ -79,11 +74,6 @@ struct drm_gem_shmem_object {
 	 */
 	struct sg_table *sgt;
 
-	/**
-	 * @vmap_lock: Protects the vmap address and use count
-	 */
-	struct mutex vmap_lock;
-
 	/**
 	 * @vaddr: Kernel virtual address of the backing memory
 	 */
@@ -109,7 +99,6 @@ struct drm_gem_shmem_object {
 struct drm_gem_shmem_object *drm_gem_shmem_create(struct drm_device *dev, size_t size);
 void drm_gem_shmem_free(struct drm_gem_shmem_object *shmem);
 
-int drm_gem_shmem_get_pages(struct drm_gem_shmem_object *shmem);
 void drm_gem_shmem_put_pages(struct drm_gem_shmem_object *shmem);
 int drm_gem_shmem_pin(struct drm_gem_shmem_object *shmem);
 void drm_gem_shmem_unpin(struct drm_gem_shmem_object *shmem);
@@ -128,8 +117,7 @@ static inline bool drm_gem_shmem_is_purgeable(struct drm_gem_shmem_object *shmem
 		!shmem->base.dma_buf && !shmem->base.import_attach;
 }
 
-void drm_gem_shmem_purge_locked(struct drm_gem_shmem_object *shmem);
-bool drm_gem_shmem_purge(struct drm_gem_shmem_object *shmem);
+void drm_gem_shmem_purge(struct drm_gem_shmem_object *shmem);
 
 struct sg_table *drm_gem_shmem_get_sg_table(struct drm_gem_shmem_object *shmem);
 struct sg_table *drm_gem_shmem_get_pages_sgt(struct drm_gem_shmem_object *shmem);
diff --git a/include/drm/drm_gem_vram_helper.h b/include/drm/drm_gem_vram_helper.h
index c083a1d71cf4c..320a99e5a2bc8 100644
--- a/include/drm/drm_gem_vram_helper.h
+++ b/include/drm/drm_gem_vram_helper.h
@@ -159,8 +159,7 @@ void drm_gem_vram_simple_display_pipe_cleanup_fb(
 #define DRM_GEM_VRAM_DRIVER \
 	.debugfs_init             = drm_vram_mm_debugfs_init, \
 	.dumb_create		  = drm_gem_vram_driver_dumb_create, \
-	.dumb_map_offset	  = drm_gem_ttm_dumb_map_offset, \
-	.gem_prime_mmap		  = drm_gem_prime_mmap
+	.dumb_map_offset	  = drm_gem_ttm_dumb_map_offset
 
 /*
  *  VRAM memory manager
diff --git a/include/drm/drm_gpuvm.h b/include/drm/drm_gpuvm.h
new file mode 100644
index 0000000000000..e37871dcc0894
--- /dev/null
+++ b/include/drm/drm_gpuvm.h
@@ -0,0 +1,1227 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __DRM_GPUVM_H__
+#define __DRM_GPUVM_H__
+
+/*
+ * Copyright (c) 2022 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/dma-resv.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/types.h>
+
+#include <drm/drm_device.h>
+#include <drm/drm_gem.h>
+#include <drm/drm_exec.h>
+
+struct drm_gpuvm;
+struct drm_gpuvm_bo;
+struct drm_gpuvm_ops;
+
+/**
+ * enum drm_gpuva_flags - flags for struct drm_gpuva
+ */
+enum drm_gpuva_flags {
+	/**
+	 * @DRM_GPUVA_INVALIDATED:
+	 *
+	 * Flag indicating that the &drm_gpuva's backing GEM is invalidated.
+	 */
+	DRM_GPUVA_INVALIDATED = (1 << 0),
+
+	/**
+	 * @DRM_GPUVA_SPARSE:
+	 *
+	 * Flag indicating that the &drm_gpuva is a sparse mapping.
+	 */
+	DRM_GPUVA_SPARSE = (1 << 1),
+
+	/**
+	 * @DRM_GPUVA_USERBITS: user defined bits
+	 */
+	DRM_GPUVA_USERBITS = (1 << 2),
+};
+
+/**
+ * struct drm_gpuva - structure to track a GPU VA mapping
+ *
+ * This structure represents a GPU VA mapping and is associated with a
+ * &drm_gpuvm.
+ *
+ * Typically, this structure is embedded in bigger driver structures.
+ */
+struct drm_gpuva {
+	/**
+	 * @vm: the &drm_gpuvm this object is associated with
+	 */
+	struct drm_gpuvm *vm;
+
+	/**
+	 * @vm_bo: the &drm_gpuvm_bo abstraction for the mapped
+	 * &drm_gem_object
+	 */
+	struct drm_gpuvm_bo *vm_bo;
+
+	/**
+	 * @flags: the &drm_gpuva_flags for this mapping
+	 */
+	enum drm_gpuva_flags flags;
+
+	/**
+	 * @va: structure containing the address and range of the &drm_gpuva
+	 */
+	struct {
+		/**
+		 * @addr: the start address
+		 */
+		u64 addr;
+
+		/*
+		 * @range: the range
+		 */
+		u64 range;
+	} va;
+
+	/**
+	 * @gem: structure containing the &drm_gem_object and it's offset
+	 */
+	struct {
+		/**
+		 * @offset: the offset within the &drm_gem_object
+		 */
+		u64 offset;
+
+		/**
+		 * @obj: the mapped &drm_gem_object
+		 */
+		struct drm_gem_object *obj;
+
+		/**
+		 * @entry: the &list_head to attach this object to a &drm_gpuvm_bo
+		 */
+		struct list_head entry;
+	} gem;
+
+	/**
+	 * @rb: structure containing data to store &drm_gpuvas in a rb-tree
+	 */
+	struct {
+		/**
+		 * @rb: the rb-tree node
+		 */
+		struct rb_node node;
+
+		/**
+		 * @entry: The &list_head to additionally connect &drm_gpuvas
+		 * in the same order they appear in the interval tree. This is
+		 * useful to keep iterating &drm_gpuvas from a start node found
+		 * through the rb-tree while doing modifications on the rb-tree
+		 * itself.
+		 */
+		struct list_head entry;
+
+		/**
+		 * @__subtree_last: needed by the interval tree, holding last-in-subtree
+		 */
+		u64 __subtree_last;
+	} rb;
+};
+
+int drm_gpuva_insert(struct drm_gpuvm *gpuvm, struct drm_gpuva *va);
+void drm_gpuva_remove(struct drm_gpuva *va);
+
+void drm_gpuva_link(struct drm_gpuva *va, struct drm_gpuvm_bo *vm_bo);
+void drm_gpuva_unlink(struct drm_gpuva *va);
+
+struct drm_gpuva *drm_gpuva_find(struct drm_gpuvm *gpuvm,
+				 u64 addr, u64 range);
+struct drm_gpuva *drm_gpuva_find_first(struct drm_gpuvm *gpuvm,
+				       u64 addr, u64 range);
+struct drm_gpuva *drm_gpuva_find_prev(struct drm_gpuvm *gpuvm, u64 start);
+struct drm_gpuva *drm_gpuva_find_next(struct drm_gpuvm *gpuvm, u64 end);
+
+static inline void drm_gpuva_init(struct drm_gpuva *va, u64 addr, u64 range,
+				  struct drm_gem_object *obj, u64 offset)
+{
+	va->va.addr = addr;
+	va->va.range = range;
+	va->gem.obj = obj;
+	va->gem.offset = offset;
+}
+
+/**
+ * drm_gpuva_invalidate() - sets whether the backing GEM of this &drm_gpuva is
+ * invalidated
+ * @va: the &drm_gpuva to set the invalidate flag for
+ * @invalidate: indicates whether the &drm_gpuva is invalidated
+ */
+static inline void drm_gpuva_invalidate(struct drm_gpuva *va, bool invalidate)
+{
+	if (invalidate)
+		va->flags |= DRM_GPUVA_INVALIDATED;
+	else
+		va->flags &= ~DRM_GPUVA_INVALIDATED;
+}
+
+/**
+ * drm_gpuva_invalidated() - indicates whether the backing BO of this &drm_gpuva
+ * is invalidated
+ * @va: the &drm_gpuva to check
+ */
+static inline bool drm_gpuva_invalidated(struct drm_gpuva *va)
+{
+	return va->flags & DRM_GPUVA_INVALIDATED;
+}
+
+/**
+ * enum drm_gpuvm_flags - flags for struct drm_gpuvm
+ */
+enum drm_gpuvm_flags {
+	/**
+	 * @DRM_GPUVM_RESV_PROTECTED: GPUVM is protected externally by the
+	 * GPUVM's &dma_resv lock
+	 */
+	DRM_GPUVM_RESV_PROTECTED = BIT(0),
+
+	/**
+	 * @DRM_GPUVM_USERBITS: user defined bits
+	 */
+	DRM_GPUVM_USERBITS = BIT(1),
+};
+
+/**
+ * struct drm_gpuvm - DRM GPU VA Manager
+ *
+ * The DRM GPU VA Manager keeps track of a GPU's virtual address space by using
+ * &maple_tree structures. Typically, this structure is embedded in bigger
+ * driver structures.
+ *
+ * Drivers can pass addresses and ranges in an arbitrary unit, e.g. bytes or
+ * pages.
+ *
+ * There should be one manager instance per GPU virtual address space.
+ */
+struct drm_gpuvm {
+	/**
+	 * @name: the name of the DRM GPU VA space
+	 */
+	const char *name;
+
+	/**
+	 * @flags: the &drm_gpuvm_flags of this GPUVM
+	 */
+	enum drm_gpuvm_flags flags;
+
+	/**
+	 * @drm: the &drm_device this VM lives in
+	 */
+	struct drm_device *drm;
+
+	/**
+	 * @mm_start: start of the VA space
+	 */
+	u64 mm_start;
+
+	/**
+	 * @mm_range: length of the VA space
+	 */
+	u64 mm_range;
+
+	/**
+	 * @rb: structures to track &drm_gpuva entries
+	 */
+	struct {
+		/**
+		 * @tree: the rb-tree to track GPU VA mappings
+		 */
+		struct rb_root_cached tree;
+
+		/**
+		 * @list: the &list_head to track GPU VA mappings
+		 */
+		struct list_head list;
+	} rb;
+
+	/**
+	 * @kref: reference count of this object
+	 */
+	struct kref kref;
+
+	/**
+	 * @kernel_alloc_node:
+	 *
+	 * &drm_gpuva representing the address space cutout reserved for
+	 * the kernel
+	 */
+	struct drm_gpuva kernel_alloc_node;
+
+	/**
+	 * @ops: &drm_gpuvm_ops providing the split/merge steps to drivers
+	 */
+	const struct drm_gpuvm_ops *ops;
+
+	/**
+	 * @r_obj: Resv GEM object; representing the GPUVM's common &dma_resv.
+	 */
+	struct drm_gem_object *r_obj;
+
+	/**
+	 * @extobj: structure holding the extobj list
+	 */
+	struct {
+		/**
+		 * @list: &list_head storing &drm_gpuvm_bos serving as
+		 * external object
+		 */
+		struct list_head list;
+
+		/**
+		 * @local_list: pointer to the local list temporarily storing
+		 * entries from the external object list
+		 */
+		struct list_head *local_list;
+
+		/**
+		 * @lock: spinlock to protect the extobj list
+		 */
+		spinlock_t lock;
+	} extobj;
+
+	/**
+	 * @evict: structure holding the evict list and evict list lock
+	 */
+	struct {
+		/**
+		 * @list: &list_head storing &drm_gpuvm_bos currently being
+		 * evicted
+		 */
+		struct list_head list;
+
+		/**
+		 * @local_list: pointer to the local list temporarily storing
+		 * entries from the evicted object list
+		 */
+		struct list_head *local_list;
+
+		/**
+		 * @lock: spinlock to protect the evict list
+		 */
+		spinlock_t lock;
+	} evict;
+};
+
+void drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
+		    enum drm_gpuvm_flags flags,
+		    struct drm_device *drm,
+		    struct drm_gem_object *r_obj,
+		    u64 start_offset, u64 range,
+		    u64 reserve_offset, u64 reserve_range,
+		    const struct drm_gpuvm_ops *ops);
+
+/**
+ * drm_gpuvm_get() - acquire a struct drm_gpuvm reference
+ * @gpuvm: the &drm_gpuvm to acquire the reference of
+ *
+ * This function acquires an additional reference to @gpuvm. It is illegal to
+ * call this without already holding a reference. No locks required.
+ */
+static inline struct drm_gpuvm *
+drm_gpuvm_get(struct drm_gpuvm *gpuvm)
+{
+	kref_get(&gpuvm->kref);
+
+	return gpuvm;
+}
+
+void drm_gpuvm_put(struct drm_gpuvm *gpuvm);
+
+bool drm_gpuvm_range_valid(struct drm_gpuvm *gpuvm, u64 addr, u64 range);
+bool drm_gpuvm_interval_empty(struct drm_gpuvm *gpuvm, u64 addr, u64 range);
+
+struct drm_gem_object *
+drm_gpuvm_resv_object_alloc(struct drm_device *drm);
+
+/**
+ * drm_gpuvm_resv_protected() - indicates whether &DRM_GPUVM_RESV_PROTECTED is
+ * set
+ * @gpuvm: the &drm_gpuvm
+ *
+ * Returns: true if &DRM_GPUVM_RESV_PROTECTED is set, false otherwise.
+ */
+static inline bool
+drm_gpuvm_resv_protected(struct drm_gpuvm *gpuvm)
+{
+	return gpuvm->flags & DRM_GPUVM_RESV_PROTECTED;
+}
+
+/**
+ * drm_gpuvm_resv() - returns the &drm_gpuvm's &dma_resv
+ * @gpuvm__: the &drm_gpuvm
+ *
+ * Returns: a pointer to the &drm_gpuvm's shared &dma_resv
+ */
+#define drm_gpuvm_resv(gpuvm__) ((gpuvm__)->r_obj->resv)
+
+/**
+ * drm_gpuvm_resv_obj() - returns the &drm_gem_object holding the &drm_gpuvm's
+ * &dma_resv
+ * @gpuvm__: the &drm_gpuvm
+ *
+ * Returns: a pointer to the &drm_gem_object holding the &drm_gpuvm's shared
+ * &dma_resv
+ */
+#define drm_gpuvm_resv_obj(gpuvm__) ((gpuvm__)->r_obj)
+
+#define drm_gpuvm_resv_held(gpuvm__) \
+	dma_resv_held(drm_gpuvm_resv(gpuvm__))
+
+#define drm_gpuvm_resv_assert_held(gpuvm__) \
+	dma_resv_assert_held(drm_gpuvm_resv(gpuvm__))
+
+#define drm_gpuvm_resv_held(gpuvm__) \
+	dma_resv_held(drm_gpuvm_resv(gpuvm__))
+
+#define drm_gpuvm_resv_assert_held(gpuvm__) \
+	dma_resv_assert_held(drm_gpuvm_resv(gpuvm__))
+
+/**
+ * drm_gpuvm_is_extobj() - indicates whether the given &drm_gem_object is an
+ * external object
+ * @gpuvm: the &drm_gpuvm to check
+ * @obj: the &drm_gem_object to check
+ *
+ * Returns: true if the &drm_gem_object &dma_resv differs from the
+ * &drm_gpuvms &dma_resv, false otherwise
+ */
+static inline bool
+drm_gpuvm_is_extobj(struct drm_gpuvm *gpuvm,
+		    struct drm_gem_object *obj)
+{
+	return obj && obj->resv != drm_gpuvm_resv(gpuvm);
+}
+
+static inline struct drm_gpuva *
+__drm_gpuva_next(struct drm_gpuva *va)
+{
+	if (va && !list_is_last(&va->rb.entry, &va->vm->rb.list))
+		return list_next_entry(va, rb.entry);
+
+	return NULL;
+}
+
+/**
+ * drm_gpuvm_for_each_va_range() - iterate over a range of &drm_gpuvas
+ * @va__: &drm_gpuva structure to assign to in each iteration step
+ * @gpuvm__: &drm_gpuvm to walk over
+ * @start__: starting offset, the first gpuva will overlap this
+ * @end__: ending offset, the last gpuva will start before this (but may
+ * overlap)
+ *
+ * This iterator walks over all &drm_gpuvas in the &drm_gpuvm that lie
+ * between @start__ and @end__. It is implemented similarly to list_for_each(),
+ * but is using the &drm_gpuvm's internal interval tree to accelerate
+ * the search for the starting &drm_gpuva, and hence isn't safe against removal
+ * of elements. It assumes that @end__ is within (or is the upper limit of) the
+ * &drm_gpuvm. This iterator does not skip over the &drm_gpuvm's
+ * @kernel_alloc_node.
+ */
+#define drm_gpuvm_for_each_va_range(va__, gpuvm__, start__, end__) \
+	for (va__ = drm_gpuva_find_first((gpuvm__), (start__), (end__) - (start__)); \
+	     va__ && (va__->va.addr < (end__)); \
+	     va__ = __drm_gpuva_next(va__))
+
+/**
+ * drm_gpuvm_for_each_va_range_safe() - safely iterate over a range of
+ * &drm_gpuvas
+ * @va__: &drm_gpuva to assign to in each iteration step
+ * @next__: another &drm_gpuva to use as temporary storage
+ * @gpuvm__: &drm_gpuvm to walk over
+ * @start__: starting offset, the first gpuva will overlap this
+ * @end__: ending offset, the last gpuva will start before this (but may
+ * overlap)
+ *
+ * This iterator walks over all &drm_gpuvas in the &drm_gpuvm that lie
+ * between @start__ and @end__. It is implemented similarly to
+ * list_for_each_safe(), but is using the &drm_gpuvm's internal interval
+ * tree to accelerate the search for the starting &drm_gpuva, and hence is safe
+ * against removal of elements. It assumes that @end__ is within (or is the
+ * upper limit of) the &drm_gpuvm. This iterator does not skip over the
+ * &drm_gpuvm's @kernel_alloc_node.
+ */
+#define drm_gpuvm_for_each_va_range_safe(va__, next__, gpuvm__, start__, end__) \
+	for (va__ = drm_gpuva_find_first((gpuvm__), (start__), (end__) - (start__)), \
+	     next__ = __drm_gpuva_next(va__); \
+	     va__ && (va__->va.addr < (end__)); \
+	     va__ = next__, next__ = __drm_gpuva_next(va__))
+
+/**
+ * drm_gpuvm_for_each_va() - iterate over all &drm_gpuvas
+ * @va__: &drm_gpuva to assign to in each iteration step
+ * @gpuvm__: &drm_gpuvm to walk over
+ *
+ * This iterator walks over all &drm_gpuva structures associated with the given
+ * &drm_gpuvm.
+ */
+#define drm_gpuvm_for_each_va(va__, gpuvm__) \
+	list_for_each_entry(va__, &(gpuvm__)->rb.list, rb.entry)
+
+/**
+ * drm_gpuvm_for_each_va_safe() - safely iterate over all &drm_gpuvas
+ * @va__: &drm_gpuva to assign to in each iteration step
+ * @next__: another &drm_gpuva to use as temporary storage
+ * @gpuvm__: &drm_gpuvm to walk over
+ *
+ * This iterator walks over all &drm_gpuva structures associated with the given
+ * &drm_gpuvm. It is implemented with list_for_each_entry_safe(), and
+ * hence safe against the removal of elements.
+ */
+#define drm_gpuvm_for_each_va_safe(va__, next__, gpuvm__) \
+	list_for_each_entry_safe(va__, next__, &(gpuvm__)->rb.list, rb.entry)
+
+/**
+ * struct drm_gpuvm_exec - &drm_gpuvm abstraction of &drm_exec
+ *
+ * This structure should be created on the stack as &drm_exec should be.
+ *
+ * Optionally, @extra can be set in order to lock additional &drm_gem_objects.
+ */
+struct drm_gpuvm_exec {
+	/**
+	 * @exec: the &drm_exec structure
+	 */
+	struct drm_exec exec;
+
+	/**
+	 * @flags: the flags for the struct drm_exec
+	 */
+	uint32_t flags;
+
+	/**
+	 * @vm: the &drm_gpuvm to lock its DMA reservations
+	 */
+	struct drm_gpuvm *vm;
+
+	/**
+	 * @num_fences: the number of fences to reserve for the &dma_resv of the
+	 * locked &drm_gem_objects
+	 */
+	unsigned int num_fences;
+
+	/**
+	 * @extra: Callback and corresponding private data for the driver to
+	 * lock arbitrary additional &drm_gem_objects.
+	 */
+	struct {
+		/**
+		 * @fn: The driver callback to lock additional &drm_gem_objects.
+		 */
+		int (*fn)(struct drm_gpuvm_exec *vm_exec);
+
+		/**
+		 * @priv: driver private data for the @fn callback
+		 */
+		void *priv;
+	} extra;
+};
+
+int drm_gpuvm_prepare_vm(struct drm_gpuvm *gpuvm,
+			 struct drm_exec *exec,
+			 unsigned int num_fences);
+
+int drm_gpuvm_prepare_objects(struct drm_gpuvm *gpuvm,
+			      struct drm_exec *exec,
+			      unsigned int num_fences);
+
+int drm_gpuvm_prepare_range(struct drm_gpuvm *gpuvm,
+			    struct drm_exec *exec,
+			    u64 addr, u64 range,
+			    unsigned int num_fences);
+
+int drm_gpuvm_exec_lock(struct drm_gpuvm_exec *vm_exec);
+
+int drm_gpuvm_exec_lock_array(struct drm_gpuvm_exec *vm_exec,
+			      struct drm_gem_object **objs,
+			      unsigned int num_objs);
+
+int drm_gpuvm_exec_lock_range(struct drm_gpuvm_exec *vm_exec,
+			      u64 addr, u64 range);
+
+/**
+ * drm_gpuvm_exec_unlock() - lock all dma-resv of all assoiciated BOs
+ * @vm_exec: the &drm_gpuvm_exec wrapper
+ *
+ * Releases all dma-resv locks of all &drm_gem_objects previously acquired
+ * through drm_gpuvm_exec_lock() or its variants.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+static inline void
+drm_gpuvm_exec_unlock(struct drm_gpuvm_exec *vm_exec)
+{
+	drm_exec_fini(&vm_exec->exec);
+}
+
+int drm_gpuvm_validate(struct drm_gpuvm *gpuvm, struct drm_exec *exec);
+void drm_gpuvm_resv_add_fence(struct drm_gpuvm *gpuvm,
+			      struct drm_exec *exec,
+			      struct dma_fence *fence,
+			      enum dma_resv_usage private_usage,
+			      enum dma_resv_usage extobj_usage);
+
+/**
+ * drm_gpuvm_exec_resv_add_fence()
+ * @vm_exec: the &drm_gpuvm_exec wrapper
+ * @fence: fence to add
+ * @private_usage: private dma-resv usage
+ * @extobj_usage: extobj dma-resv usage
+ *
+ * See drm_gpuvm_resv_add_fence().
+ */
+static inline void
+drm_gpuvm_exec_resv_add_fence(struct drm_gpuvm_exec *vm_exec,
+			      struct dma_fence *fence,
+			      enum dma_resv_usage private_usage,
+			      enum dma_resv_usage extobj_usage)
+{
+	drm_gpuvm_resv_add_fence(vm_exec->vm, &vm_exec->exec, fence,
+				 private_usage, extobj_usage);
+}
+
+/**
+ * drm_gpuvm_exec_validate()
+ * @vm_exec: the &drm_gpuvm_exec wrapper
+ *
+ * See drm_gpuvm_validate().
+ */
+static inline int
+drm_gpuvm_exec_validate(struct drm_gpuvm_exec *vm_exec)
+{
+	return drm_gpuvm_validate(vm_exec->vm, &vm_exec->exec);
+}
+
+/**
+ * struct drm_gpuvm_bo - structure representing a &drm_gpuvm and
+ * &drm_gem_object combination
+ *
+ * This structure is an abstraction representing a &drm_gpuvm and
+ * &drm_gem_object combination. It serves as an indirection to accelerate
+ * iterating all &drm_gpuvas within a &drm_gpuvm backed by the same
+ * &drm_gem_object.
+ *
+ * Furthermore it is used cache evicted GEM objects for a certain GPU-VM to
+ * accelerate validation.
+ *
+ * Typically, drivers want to create an instance of a struct drm_gpuvm_bo once
+ * a GEM object is mapped first in a GPU-VM and release the instance once the
+ * last mapping of the GEM object in this GPU-VM is unmapped.
+ */
+struct drm_gpuvm_bo {
+	/**
+	 * @vm: The &drm_gpuvm the @obj is mapped in. This is a reference
+	 * counted pointer.
+	 */
+	struct drm_gpuvm *vm;
+
+	/**
+	 * @obj: The &drm_gem_object being mapped in @vm. This is a reference
+	 * counted pointer.
+	 */
+	struct drm_gem_object *obj;
+
+	/**
+	 * @evicted: Indicates whether the &drm_gem_object is evicted; field
+	 * protected by the &drm_gem_object's dma-resv lock.
+	 */
+	bool evicted;
+
+	/**
+	 * @kref: The reference count for this &drm_gpuvm_bo.
+	 */
+	struct kref kref;
+
+	/**
+	 * @list: Structure containing all &list_heads.
+	 */
+	struct {
+		/**
+		 * @gpuva: The list of linked &drm_gpuvas.
+		 *
+		 * It is safe to access entries from this list as long as the
+		 * GEM's gpuva lock is held. See also struct drm_gem_object.
+		 */
+		struct list_head gpuva;
+
+		/**
+		 * @entry: Structure containing all &list_heads serving as
+		 * entry.
+		 */
+		struct {
+			/**
+			 * @gem: List entry to attach to the &drm_gem_objects
+			 * gpuva list.
+			 */
+			struct list_head gem;
+
+			/**
+			 * @evict: List entry to attach to the &drm_gpuvms
+			 * extobj list.
+			 */
+			struct list_head extobj;
+
+			/**
+			 * @evict: List entry to attach to the &drm_gpuvms evict
+			 * list.
+			 */
+			struct list_head evict;
+		} entry;
+	} list;
+};
+
+struct drm_gpuvm_bo *
+drm_gpuvm_bo_create(struct drm_gpuvm *gpuvm,
+		    struct drm_gem_object *obj);
+
+struct drm_gpuvm_bo *
+drm_gpuvm_bo_obtain(struct drm_gpuvm *gpuvm,
+		    struct drm_gem_object *obj);
+struct drm_gpuvm_bo *
+drm_gpuvm_bo_obtain_prealloc(struct drm_gpuvm_bo *vm_bo);
+
+/**
+ * drm_gpuvm_bo_get() - acquire a struct drm_gpuvm_bo reference
+ * @vm_bo: the &drm_gpuvm_bo to acquire the reference of
+ *
+ * This function acquires an additional reference to @vm_bo. It is illegal to
+ * call this without already holding a reference. No locks required.
+ */
+static inline struct drm_gpuvm_bo *
+drm_gpuvm_bo_get(struct drm_gpuvm_bo *vm_bo)
+{
+	kref_get(&vm_bo->kref);
+	return vm_bo;
+}
+
+bool drm_gpuvm_bo_put(struct drm_gpuvm_bo *vm_bo);
+
+struct drm_gpuvm_bo *
+drm_gpuvm_bo_find(struct drm_gpuvm *gpuvm,
+		  struct drm_gem_object *obj);
+
+void drm_gpuvm_bo_evict(struct drm_gpuvm_bo *vm_bo, bool evict);
+
+/**
+ * drm_gpuvm_bo_gem_evict()
+ * @obj: the &drm_gem_object
+ * @evict: indicates whether @obj is evicted
+ *
+ * See drm_gpuvm_bo_evict().
+ */
+static inline void
+drm_gpuvm_bo_gem_evict(struct drm_gem_object *obj, bool evict)
+{
+	struct drm_gpuvm_bo *vm_bo;
+
+	drm_gem_gpuva_assert_lock_held(obj);
+	drm_gem_for_each_gpuvm_bo(vm_bo, obj)
+		drm_gpuvm_bo_evict(vm_bo, evict);
+}
+
+void drm_gpuvm_bo_extobj_add(struct drm_gpuvm_bo *vm_bo);
+
+/**
+ * drm_gpuvm_bo_for_each_va() - iterator to walk over a list of &drm_gpuva
+ * @va__: &drm_gpuva structure to assign to in each iteration step
+ * @vm_bo__: the &drm_gpuvm_bo the &drm_gpuva to walk are associated with
+ *
+ * This iterator walks over all &drm_gpuva structures associated with the
+ * &drm_gpuvm_bo.
+ *
+ * The caller must hold the GEM's gpuva lock.
+ */
+#define drm_gpuvm_bo_for_each_va(va__, vm_bo__) \
+	list_for_each_entry(va__, &(vm_bo)->list.gpuva, gem.entry)
+
+/**
+ * drm_gpuvm_bo_for_each_va_safe() - iterator to safely walk over a list of
+ * &drm_gpuva
+ * @va__: &drm_gpuva structure to assign to in each iteration step
+ * @next__: &next &drm_gpuva to store the next step
+ * @vm_bo__: the &drm_gpuvm_bo the &drm_gpuva to walk are associated with
+ *
+ * This iterator walks over all &drm_gpuva structures associated with the
+ * &drm_gpuvm_bo. It is implemented with list_for_each_entry_safe(), hence
+ * it is save against removal of elements.
+ *
+ * The caller must hold the GEM's gpuva lock.
+ */
+#define drm_gpuvm_bo_for_each_va_safe(va__, next__, vm_bo__) \
+	list_for_each_entry_safe(va__, next__, &(vm_bo)->list.gpuva, gem.entry)
+
+/**
+ * enum drm_gpuva_op_type - GPU VA operation type
+ *
+ * Operations to alter the GPU VA mappings tracked by the &drm_gpuvm.
+ */
+enum drm_gpuva_op_type {
+	/**
+	 * @DRM_GPUVA_OP_MAP: the map op type
+	 */
+	DRM_GPUVA_OP_MAP,
+
+	/**
+	 * @DRM_GPUVA_OP_REMAP: the remap op type
+	 */
+	DRM_GPUVA_OP_REMAP,
+
+	/**
+	 * @DRM_GPUVA_OP_UNMAP: the unmap op type
+	 */
+	DRM_GPUVA_OP_UNMAP,
+
+	/**
+	 * @DRM_GPUVA_OP_PREFETCH: the prefetch op type
+	 */
+	DRM_GPUVA_OP_PREFETCH,
+};
+
+/**
+ * struct drm_gpuva_op_map - GPU VA map operation
+ *
+ * This structure represents a single map operation generated by the
+ * DRM GPU VA manager.
+ */
+struct drm_gpuva_op_map {
+	/**
+	 * @va: structure containing address and range of a map
+	 * operation
+	 */
+	struct {
+		/**
+		 * @addr: the base address of the new mapping
+		 */
+		u64 addr;
+
+		/**
+		 * @range: the range of the new mapping
+		 */
+		u64 range;
+	} va;
+
+	/**
+	 * @gem: structure containing the &drm_gem_object and it's offset
+	 */
+	struct {
+		/**
+		 * @offset: the offset within the &drm_gem_object
+		 */
+		u64 offset;
+
+		/**
+		 * @obj: the &drm_gem_object to map
+		 */
+		struct drm_gem_object *obj;
+	} gem;
+};
+
+/**
+ * struct drm_gpuva_op_unmap - GPU VA unmap operation
+ *
+ * This structure represents a single unmap operation generated by the
+ * DRM GPU VA manager.
+ */
+struct drm_gpuva_op_unmap {
+	/**
+	 * @va: the &drm_gpuva to unmap
+	 */
+	struct drm_gpuva *va;
+
+	/**
+	 * @keep:
+	 *
+	 * Indicates whether this &drm_gpuva is physically contiguous with the
+	 * original mapping request.
+	 *
+	 * Optionally, if &keep is set, drivers may keep the actual page table
+	 * mappings for this &drm_gpuva, adding the missing page table entries
+	 * only and update the &drm_gpuvm accordingly.
+	 */
+	bool keep;
+};
+
+/**
+ * struct drm_gpuva_op_remap - GPU VA remap operation
+ *
+ * This represents a single remap operation generated by the DRM GPU VA manager.
+ *
+ * A remap operation is generated when an existing GPU VA mmapping is split up
+ * by inserting a new GPU VA mapping or by partially unmapping existent
+ * mapping(s), hence it consists of a maximum of two map and one unmap
+ * operation.
+ *
+ * The @unmap operation takes care of removing the original existing mapping.
+ * @prev is used to remap the preceding part, @next the subsequent part.
+ *
+ * If either a new mapping's start address is aligned with the start address
+ * of the old mapping or the new mapping's end address is aligned with the
+ * end address of the old mapping, either @prev or @next is NULL.
+ *
+ * Note, the reason for a dedicated remap operation, rather than arbitrary
+ * unmap and map operations, is to give drivers the chance of extracting driver
+ * specific data for creating the new mappings from the unmap operations's
+ * &drm_gpuva structure which typically is embedded in larger driver specific
+ * structures.
+ */
+struct drm_gpuva_op_remap {
+	/**
+	 * @prev: the preceding part of a split mapping
+	 */
+	struct drm_gpuva_op_map *prev;
+
+	/**
+	 * @next: the subsequent part of a split mapping
+	 */
+	struct drm_gpuva_op_map *next;
+
+	/**
+	 * @unmap: the unmap operation for the original existing mapping
+	 */
+	struct drm_gpuva_op_unmap *unmap;
+};
+
+/**
+ * struct drm_gpuva_op_prefetch - GPU VA prefetch operation
+ *
+ * This structure represents a single prefetch operation generated by the
+ * DRM GPU VA manager.
+ */
+struct drm_gpuva_op_prefetch {
+	/**
+	 * @va: the &drm_gpuva to prefetch
+	 */
+	struct drm_gpuva *va;
+};
+
+/**
+ * struct drm_gpuva_op - GPU VA operation
+ *
+ * This structure represents a single generic operation.
+ *
+ * The particular type of the operation is defined by @op.
+ */
+struct drm_gpuva_op {
+	/**
+	 * @entry:
+	 *
+	 * The &list_head used to distribute instances of this struct within
+	 * &drm_gpuva_ops.
+	 */
+	struct list_head entry;
+
+	/**
+	 * @op: the type of the operation
+	 */
+	enum drm_gpuva_op_type op;
+
+	union {
+		/**
+		 * @map: the map operation
+		 */
+		struct drm_gpuva_op_map map;
+
+		/**
+		 * @remap: the remap operation
+		 */
+		struct drm_gpuva_op_remap remap;
+
+		/**
+		 * @unmap: the unmap operation
+		 */
+		struct drm_gpuva_op_unmap unmap;
+
+		/**
+		 * @prefetch: the prefetch operation
+		 */
+		struct drm_gpuva_op_prefetch prefetch;
+	};
+};
+
+/**
+ * struct drm_gpuva_ops - wraps a list of &drm_gpuva_op
+ */
+struct drm_gpuva_ops {
+	/**
+	 * @list: the &list_head
+	 */
+	struct list_head list;
+};
+
+/**
+ * drm_gpuva_for_each_op() - iterator to walk over &drm_gpuva_ops
+ * @op: &drm_gpuva_op to assign in each iteration step
+ * @ops: &drm_gpuva_ops to walk
+ *
+ * This iterator walks over all ops within a given list of operations.
+ */
+#define drm_gpuva_for_each_op(op, ops) list_for_each_entry(op, &(ops)->list, entry)
+
+/**
+ * drm_gpuva_for_each_op_safe() - iterator to safely walk over &drm_gpuva_ops
+ * @op: &drm_gpuva_op to assign in each iteration step
+ * @next: &next &drm_gpuva_op to store the next step
+ * @ops: &drm_gpuva_ops to walk
+ *
+ * This iterator walks over all ops within a given list of operations. It is
+ * implemented with list_for_each_safe(), so save against removal of elements.
+ */
+#define drm_gpuva_for_each_op_safe(op, next, ops) \
+	list_for_each_entry_safe(op, next, &(ops)->list, entry)
+
+/**
+ * drm_gpuva_for_each_op_from_reverse() - iterate backwards from the given point
+ * @op: &drm_gpuva_op to assign in each iteration step
+ * @ops: &drm_gpuva_ops to walk
+ *
+ * This iterator walks over all ops within a given list of operations beginning
+ * from the given operation in reverse order.
+ */
+#define drm_gpuva_for_each_op_from_reverse(op, ops) \
+	list_for_each_entry_from_reverse(op, &(ops)->list, entry)
+
+/**
+ * drm_gpuva_first_op() - returns the first &drm_gpuva_op from &drm_gpuva_ops
+ * @ops: the &drm_gpuva_ops to get the fist &drm_gpuva_op from
+ */
+#define drm_gpuva_first_op(ops) \
+	list_first_entry(&(ops)->list, struct drm_gpuva_op, entry)
+
+/**
+ * drm_gpuva_last_op() - returns the last &drm_gpuva_op from &drm_gpuva_ops
+ * @ops: the &drm_gpuva_ops to get the last &drm_gpuva_op from
+ */
+#define drm_gpuva_last_op(ops) \
+	list_last_entry(&(ops)->list, struct drm_gpuva_op, entry)
+
+/**
+ * drm_gpuva_prev_op() - previous &drm_gpuva_op in the list
+ * @op: the current &drm_gpuva_op
+ */
+#define drm_gpuva_prev_op(op) list_prev_entry(op, entry)
+
+/**
+ * drm_gpuva_next_op() - next &drm_gpuva_op in the list
+ * @op: the current &drm_gpuva_op
+ */
+#define drm_gpuva_next_op(op) list_next_entry(op, entry)
+
+struct drm_gpuva_ops *
+drm_gpuvm_sm_map_ops_create(struct drm_gpuvm *gpuvm,
+			    u64 addr, u64 range,
+			    struct drm_gem_object *obj, u64 offset);
+struct drm_gpuva_ops *
+drm_gpuvm_sm_unmap_ops_create(struct drm_gpuvm *gpuvm,
+			      u64 addr, u64 range);
+
+struct drm_gpuva_ops *
+drm_gpuvm_prefetch_ops_create(struct drm_gpuvm *gpuvm,
+				 u64 addr, u64 range);
+
+struct drm_gpuva_ops *
+drm_gpuvm_bo_unmap_ops_create(struct drm_gpuvm_bo *vm_bo);
+
+void drm_gpuva_ops_free(struct drm_gpuvm *gpuvm,
+			struct drm_gpuva_ops *ops);
+
+static inline void drm_gpuva_init_from_op(struct drm_gpuva *va,
+					  struct drm_gpuva_op_map *op)
+{
+	drm_gpuva_init(va, op->va.addr, op->va.range,
+		       op->gem.obj, op->gem.offset);
+}
+
+/**
+ * struct drm_gpuvm_ops - callbacks for split/merge steps
+ *
+ * This structure defines the callbacks used by &drm_gpuvm_sm_map and
+ * &drm_gpuvm_sm_unmap to provide the split/merge steps for map and unmap
+ * operations to drivers.
+ */
+struct drm_gpuvm_ops {
+	/**
+	 * @vm_free: called when the last reference of a struct drm_gpuvm is
+	 * dropped
+	 *
+	 * This callback is mandatory.
+	 */
+	void (*vm_free)(struct drm_gpuvm *gpuvm);
+
+	/**
+	 * @op_alloc: called when the &drm_gpuvm allocates
+	 * a struct drm_gpuva_op
+	 *
+	 * Some drivers may want to embed struct drm_gpuva_op into driver
+	 * specific structures. By implementing this callback drivers can
+	 * allocate memory accordingly.
+	 *
+	 * This callback is optional.
+	 */
+	struct drm_gpuva_op *(*op_alloc)(void);
+
+	/**
+	 * @op_free: called when the &drm_gpuvm frees a
+	 * struct drm_gpuva_op
+	 *
+	 * Some drivers may want to embed struct drm_gpuva_op into driver
+	 * specific structures. By implementing this callback drivers can
+	 * free the previously allocated memory accordingly.
+	 *
+	 * This callback is optional.
+	 */
+	void (*op_free)(struct drm_gpuva_op *op);
+
+	/**
+	 * @vm_bo_alloc: called when the &drm_gpuvm allocates
+	 * a struct drm_gpuvm_bo
+	 *
+	 * Some drivers may want to embed struct drm_gpuvm_bo into driver
+	 * specific structures. By implementing this callback drivers can
+	 * allocate memory accordingly.
+	 *
+	 * This callback is optional.
+	 */
+	struct drm_gpuvm_bo *(*vm_bo_alloc)(void);
+
+	/**
+	 * @vm_bo_free: called when the &drm_gpuvm frees a
+	 * struct drm_gpuvm_bo
+	 *
+	 * Some drivers may want to embed struct drm_gpuvm_bo into driver
+	 * specific structures. By implementing this callback drivers can
+	 * free the previously allocated memory accordingly.
+	 *
+	 * This callback is optional.
+	 */
+	void (*vm_bo_free)(struct drm_gpuvm_bo *vm_bo);
+
+	/**
+	 * @vm_bo_validate: called from drm_gpuvm_validate()
+	 *
+	 * Drivers receive this callback for every evicted &drm_gem_object being
+	 * mapped in the corresponding &drm_gpuvm.
+	 *
+	 * Typically, drivers would call their driver specific variant of
+	 * ttm_bo_validate() from within this callback.
+	 */
+	int (*vm_bo_validate)(struct drm_gpuvm_bo *vm_bo,
+			      struct drm_exec *exec);
+
+	/**
+	 * @sm_step_map: called from &drm_gpuvm_sm_map to finally insert the
+	 * mapping once all previous steps were completed
+	 *
+	 * The &priv pointer matches the one the driver passed to
+	 * &drm_gpuvm_sm_map or &drm_gpuvm_sm_unmap, respectively.
+	 *
+	 * Can be NULL if &drm_gpuvm_sm_map is used.
+	 */
+	int (*sm_step_map)(struct drm_gpuva_op *op, void *priv);
+
+	/**
+	 * @sm_step_remap: called from &drm_gpuvm_sm_map and
+	 * &drm_gpuvm_sm_unmap to split up an existent mapping
+	 *
+	 * This callback is called when existent mapping needs to be split up.
+	 * This is the case when either a newly requested mapping overlaps or
+	 * is enclosed by an existent mapping or a partial unmap of an existent
+	 * mapping is requested.
+	 *
+	 * The &priv pointer matches the one the driver passed to
+	 * &drm_gpuvm_sm_map or &drm_gpuvm_sm_unmap, respectively.
+	 *
+	 * Can be NULL if neither &drm_gpuvm_sm_map nor &drm_gpuvm_sm_unmap is
+	 * used.
+	 */
+	int (*sm_step_remap)(struct drm_gpuva_op *op, void *priv);
+
+	/**
+	 * @sm_step_unmap: called from &drm_gpuvm_sm_map and
+	 * &drm_gpuvm_sm_unmap to unmap an existent mapping
+	 *
+	 * This callback is called when existent mapping needs to be unmapped.
+	 * This is the case when either a newly requested mapping encloses an
+	 * existent mapping or an unmap of an existent mapping is requested.
+	 *
+	 * The &priv pointer matches the one the driver passed to
+	 * &drm_gpuvm_sm_map or &drm_gpuvm_sm_unmap, respectively.
+	 *
+	 * Can be NULL if neither &drm_gpuvm_sm_map nor &drm_gpuvm_sm_unmap is
+	 * used.
+	 */
+	int (*sm_step_unmap)(struct drm_gpuva_op *op, void *priv);
+};
+
+int drm_gpuvm_sm_map(struct drm_gpuvm *gpuvm, void *priv,
+		     u64 addr, u64 range,
+		     struct drm_gem_object *obj, u64 offset);
+
+int drm_gpuvm_sm_unmap(struct drm_gpuvm *gpuvm, void *priv,
+		       u64 addr, u64 range);
+
+void drm_gpuva_map(struct drm_gpuvm *gpuvm,
+		   struct drm_gpuva *va,
+		   struct drm_gpuva_op_map *op);
+
+void drm_gpuva_remap(struct drm_gpuva *prev,
+		     struct drm_gpuva *next,
+		     struct drm_gpuva_op_remap *op);
+
+void drm_gpuva_unmap(struct drm_gpuva_op_unmap *op);
+
+/**
+ * drm_gpuva_op_remap_to_unmap_range() - Helper to get the start and range of
+ * the unmap stage of a remap op.
+ * @op: Remap op.
+ * @start_addr: Output pointer for the start of the required unmap.
+ * @range: Output pointer for the length of the required unmap.
+ *
+ * The given start address and range will be set such that they represent the
+ * range of the address space that was previously covered by the mapping being
+ * re-mapped, but is now empty.
+ */
+static inline void
+drm_gpuva_op_remap_to_unmap_range(const struct drm_gpuva_op_remap *op,
+				  u64 *start_addr, u64 *range)
+{
+	const u64 va_start = op->prev ?
+			     op->prev->va.addr + op->prev->va.range :
+			     op->unmap->va->va.addr;
+	const u64 va_end = op->next ?
+			   op->next->va.addr :
+			   op->unmap->va->va.addr + op->unmap->va->va.range;
+
+	if (start_addr)
+		*start_addr = va_start;
+	if (range)
+		*range = va_end - va_start;
+}
+
+#endif /* __DRM_GPUVM_H__ */
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 2ae4fd62e01c4..c1e21fb44f090 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -41,24 +41,43 @@
  */
 #define DRM_SCHED_FENCE_DONT_PIPELINE	DMA_FENCE_FLAG_USER_BITS
 
+/**
+ * DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT - A fence deadline hint has been set
+ *
+ * Because we could have a deadline hint can be set before the backing hw
+ * fence is created, we need to keep track of whether a deadline has already
+ * been set.
+ */
+#define DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT	(DMA_FENCE_FLAG_USER_BITS + 1)
+
+enum dma_resv_usage;
+struct dma_resv;
 struct drm_gem_object;
 
 struct drm_gpu_scheduler;
 struct drm_sched_rq;
 
+struct drm_file;
+
 /* These are often used as an (initial) index
  * to an array, and as such should start at 0.
  */
 enum drm_sched_priority {
-	DRM_SCHED_PRIORITY_MIN,
-	DRM_SCHED_PRIORITY_NORMAL,
-	DRM_SCHED_PRIORITY_HIGH,
 	DRM_SCHED_PRIORITY_KERNEL,
+	DRM_SCHED_PRIORITY_HIGH,
+	DRM_SCHED_PRIORITY_NORMAL,
+	DRM_SCHED_PRIORITY_LOW,
 
 	DRM_SCHED_PRIORITY_COUNT,
 	DRM_SCHED_PRIORITY_UNSET = -2
 };
 
+/* Used to chose between FIFO and RR jobs scheduling */
+extern int drm_sched_policy;
+
+#define DRM_SCHED_POLICY_RR    0
+#define DRM_SCHED_POLICY_FIFO  1
+
 /**
  * struct drm_sched_entity - A wrapper around a job queue (typically
  * attached to the DRM file_priv).
@@ -182,7 +201,7 @@ struct drm_sched_entity {
 	 * by the scheduler thread, can be accessed locklessly from
 	 * drm_sched_job_arm() iff the queue is empty.
 	 */
-	struct dma_fence                *last_scheduled;
+	struct dma_fence __rcu		*last_scheduled;
 
 	/**
 	 * @last_user: last group leader pushing a job into the entity.
@@ -205,6 +224,21 @@ struct drm_sched_entity {
 	 * drm_sched_entity_fini().
 	 */
 	struct completion		entity_idle;
+
+	/**
+	 * @oldest_job_waiting:
+	 *
+	 * Marks earliest job waiting in SW queue
+	 */
+	ktime_t				oldest_job_waiting;
+
+	/**
+	 * @rb_tree_node:
+	 *
+	 * The node used to insert this entity into time based priority queue
+	 */
+	struct rb_node			rb_tree_node;
+
 };
 
 /**
@@ -214,6 +248,7 @@ struct drm_sched_entity {
  * @sched: the scheduler to which this rq belongs to.
  * @entities: list of the entities to be scheduled.
  * @current_entity: the entity which is to be scheduled.
+ * @rb_tree_root: root of time based priory queue of entities for FIFO scheduling
  *
  * Run queue is a set of entities scheduling command submissions for
  * one specific ring. It implements the scheduling policy that selects
@@ -224,6 +259,7 @@ struct drm_sched_rq {
 	struct drm_gpu_scheduler	*sched;
 	struct list_head		entities;
 	struct drm_sched_entity		*current_entity;
+	struct rb_root_cached		rb_tree_root;
 };
 
 /**
@@ -248,6 +284,12 @@ struct drm_sched_fence {
          */
 	struct dma_fence		finished;
 
+	/**
+	 * @deadline: deadline set on &drm_sched_fence.finished which
+	 * potentially needs to be propagated to &drm_sched_fence.parent
+	 */
+	ktime_t				deadline;
+
         /**
          * @parent: the fence returned by &drm_sched_backend_ops.run_job
          * when scheduling the job on hardware. We signal the
@@ -279,6 +321,7 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
  * @sched: the scheduler instance on which this job is scheduled.
  * @s_fence: contains the fences for the scheduling of job.
  * @finish_cb: the callback for the finished fence.
+ * @credits: the number of credits this job contributes to the scheduler
  * @work: Helper to reschdeule job kill to different context.
  * @id: a unique id assigned to each job scheduled on the scheduler.
  * @karma: increment on every hang caused by this job. If this exceeds the hang
@@ -298,13 +341,15 @@ struct drm_sched_job {
 	struct drm_gpu_scheduler	*sched;
 	struct drm_sched_fence		*s_fence;
 
+	u32				credits;
+
 	/*
 	 * work is used only after finish_cb has been used and will not be
 	 * accessed anymore.
 	 */
 	union {
 		struct dma_fence_cb		finish_cb;
-		struct work_struct 		work;
+		struct work_struct		work;
 	};
 
 	uint64_t			id;
@@ -323,6 +368,13 @@ struct drm_sched_job {
 
 	/** @last_dependency: tracks @dependencies as they signal */
 	unsigned long			last_dependency;
+
+	/**
+	 * @submit_ts:
+	 *
+	 * When the job was pushed into the entity queue.
+	 */
+	ktime_t                         submit_ts;
 };
 
 static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job,
@@ -345,18 +397,17 @@ enum drm_gpu_sched_stat {
  */
 struct drm_sched_backend_ops {
 	/**
-	 * @dependency:
+	 * @prepare_job:
 	 *
 	 * Called when the scheduler is considering scheduling this job next, to
 	 * get another struct dma_fence for this job to block on.  Once it
 	 * returns NULL, run_job() may be called.
 	 *
-	 * If a driver exclusively uses drm_sched_job_add_dependency() and
-	 * drm_sched_job_add_implicit_dependencies() this can be ommitted and
-	 * left as NULL.
+	 * Can be NULL if no additional preparation to the dependencies are
+	 * necessary. Skipped when jobs are killed instead of run.
 	 */
-	struct dma_fence *(*dependency)(struct drm_sched_job *sched_job,
-					struct drm_sched_entity *s_entity);
+	struct dma_fence *(*prepare_job)(struct drm_sched_job *sched_job,
+					 struct drm_sched_entity *s_entity);
 
 	/**
          * @run_job: Called to execute the job once all of the dependencies
@@ -415,27 +466,42 @@ struct drm_sched_backend_ops {
          * and it's time to clean it up.
 	 */
 	void (*free_job)(struct drm_sched_job *sched_job);
+
+	/**
+	 * @update_job_credits: Called when the scheduler is considering this
+	 * job for execution.
+	 *
+	 * This callback returns the number of credits the job would take if
+	 * pushed to the hardware. Drivers may use this to dynamically update
+	 * the job's credit count. For instance, deduct the number of credits
+	 * for already signalled native fences.
+	 *
+	 * This callback is optional.
+	 */
+	u32 (*update_job_credits)(struct drm_sched_job *sched_job);
 };
 
 /**
  * struct drm_gpu_scheduler - scheduler instance-specific data
  *
  * @ops: backend operations provided by the driver.
- * @hw_submission_limit: the max size of the hardware queue.
+ * @credit_limit: the credit limit of this scheduler
+ * @credit_count: the current credit count of this scheduler
  * @timeout: the time after which a job is removed from the scheduler.
  * @name: name of the ring for which this scheduler is being used.
- * @sched_rq: priority wise array of run queues.
- * @wake_up_worker: the wait queue on which the scheduler sleeps until a job
- *                  is ready to be scheduled.
+ * @num_rqs: Number of run-queues. This is at most DRM_SCHED_PRIORITY_COUNT,
+ *           as there's usually one run-queue per priority, but could be less.
+ * @sched_rq: An allocated array of run-queues of size @num_rqs;
  * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
  *                 waits on this wait queue until all the scheduled jobs are
  *                 finished.
- * @hw_rq_count: the number of jobs currently in the hardware queue.
  * @job_id_count: used to assign unique id to the each job.
+ * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
  * @timeout_wq: workqueue used to queue @work_tdr
+ * @work_run_job: work which calls run_job op of each scheduler.
+ * @work_free_job: work which calls free_job op of each scheduler.
  * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
  *            timeout interval is over.
- * @thread: the kthread on which the scheduler which run.
  * @pending_list: the list of jobs which are currently in the job queue.
  * @job_list_lock: lock to protect the pending_list.
  * @hang_limit: once the hangs by a job crosses this limit then it is marked
@@ -444,23 +510,27 @@ struct drm_sched_backend_ops {
  * @_score: score used when the driver doesn't provide one
  * @ready: marks if the underlying HW is ready to work
  * @free_guilty: A hit to time out handler to free the guilty job.
+ * @pause_submit: pause queuing of @work_run_job on @submit_wq
+ * @own_submit_wq: scheduler owns allocation of @submit_wq
  * @dev: system &struct device
  *
  * One scheduler is implemented for each hardware ring.
  */
 struct drm_gpu_scheduler {
 	const struct drm_sched_backend_ops	*ops;
-	uint32_t			hw_submission_limit;
+	u32				credit_limit;
+	atomic_t			credit_count;
 	long				timeout;
 	const char			*name;
-	struct drm_sched_rq		sched_rq[DRM_SCHED_PRIORITY_COUNT];
-	wait_queue_head_t		wake_up_worker;
+	u32                             num_rqs;
+	struct drm_sched_rq             **sched_rq;
 	wait_queue_head_t		job_scheduled;
-	atomic_t			hw_rq_count;
 	atomic64_t			job_id_count;
+	struct workqueue_struct		*submit_wq;
 	struct workqueue_struct		*timeout_wq;
+	struct work_struct		work_run_job;
+	struct work_struct		work_free_job;
 	struct delayed_work		work_tdr;
-	struct task_struct		*thread;
 	struct list_head		pending_list;
 	spinlock_t			job_list_lock;
 	int				hang_limit;
@@ -468,22 +538,32 @@ struct drm_gpu_scheduler {
 	atomic_t                        _score;
 	bool				ready;
 	bool				free_guilty;
+	bool				pause_submit;
+	bool				own_submit_wq;
 	struct device			*dev;
 };
 
 int drm_sched_init(struct drm_gpu_scheduler *sched,
 		   const struct drm_sched_backend_ops *ops,
-		   uint32_t hw_submission, unsigned hang_limit,
+		   struct workqueue_struct *submit_wq,
+		   u32 num_rqs, u32 credit_limit, unsigned int hang_limit,
 		   long timeout, struct workqueue_struct *timeout_wq,
 		   atomic_t *score, const char *name, struct device *dev);
 
 void drm_sched_fini(struct drm_gpu_scheduler *sched);
 int drm_sched_job_init(struct drm_sched_job *job,
 		       struct drm_sched_entity *entity,
-		       void *owner);
+		       u32 credits, void *owner);
 void drm_sched_job_arm(struct drm_sched_job *job);
 int drm_sched_job_add_dependency(struct drm_sched_job *job,
 				 struct dma_fence *fence);
+int drm_sched_job_add_syncobj_dependency(struct drm_sched_job *job,
+					 struct drm_file *file,
+					 u32 handle,
+					 u32 point);
+int drm_sched_job_add_resv_dependencies(struct drm_sched_job *job,
+					struct dma_resv *resv,
+					enum dma_resv_usage usage);
 int drm_sched_job_add_implicit_dependencies(struct drm_sched_job *job,
 					    struct drm_gem_object *obj,
 					    bool write);
@@ -493,17 +573,16 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
 				    struct drm_gpu_scheduler **sched_list,
                                    unsigned int num_sched_list);
 
+void drm_sched_tdr_queue_imm(struct drm_gpu_scheduler *sched);
 void drm_sched_job_cleanup(struct drm_sched_job *job);
-void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
+void drm_sched_wakeup(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity);
+bool drm_sched_wqueue_ready(struct drm_gpu_scheduler *sched);
+void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched);
+void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched);
 void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
 void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
-void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max);
 void drm_sched_increase_karma(struct drm_sched_job *bad);
-void drm_sched_reset_karma(struct drm_sched_job *bad);
-void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type);
-bool drm_sched_dependency_optimized(struct dma_fence* fence,
-				    struct drm_sched_entity *entity);
 void drm_sched_fault(struct drm_gpu_scheduler *sched);
 void drm_sched_job_kickout(struct drm_sched_job *s_job);
 
@@ -512,6 +591,8 @@ void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
 void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
 				struct drm_sched_entity *entity);
 
+void drm_sched_rq_update_fifo(struct drm_sched_entity *entity, ktime_t ts);
+
 int drm_sched_entity_init(struct drm_sched_entity *entity,
 			  enum drm_sched_priority priority,
 			  struct drm_gpu_scheduler **sched_list,
@@ -526,6 +607,7 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job);
 void drm_sched_entity_set_priority(struct drm_sched_entity *entity,
 				   enum drm_sched_priority priority);
 bool drm_sched_entity_is_ready(struct drm_sched_entity *entity);
+int drm_sched_entity_error(struct drm_sched_entity *entity);
 
 struct drm_sched_fence *drm_sched_fence_alloc(
 	struct drm_sched_entity *s_entity, void *owner);
@@ -533,8 +615,9 @@ void drm_sched_fence_init(struct drm_sched_fence *fence,
 			  struct drm_sched_entity *entity);
 void drm_sched_fence_free(struct drm_sched_fence *fence);
 
-void drm_sched_fence_scheduled(struct drm_sched_fence *fence);
-void drm_sched_fence_finished(struct drm_sched_fence *fence);
+void drm_sched_fence_scheduled(struct drm_sched_fence *fence,
+			       struct dma_fence *parent);
+void drm_sched_fence_finished(struct drm_sched_fence *fence, int result);
 
 unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched);
 void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 99286d2c3d6a4..9db6e9808995c 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -648,6 +648,12 @@ int dma_buf_begin_cpu_access(struct dma_buf *dma_buf,
 			     enum dma_data_direction dir);
 int dma_buf_end_cpu_access(struct dma_buf *dma_buf,
 			   enum dma_data_direction dir);
+struct sg_table *
+dma_buf_map_attachment_unlocked(struct dma_buf_attachment *attach,
+				enum dma_data_direction direction);
+void dma_buf_unmap_attachment_unlocked(struct dma_buf_attachment *attach,
+				       struct sg_table *sg_table,
+				       enum dma_data_direction direction);
 
 int dma_buf_mmap(struct dma_buf *, struct vm_area_struct *,
 		 unsigned long);
@@ -680,4 +686,6 @@ static inline size_t dma_buf_get_peak_size(void) { return 0; }
 static inline size_t dma_buf_get_total_size(void) { return 0; }
 #endif
 
+int dma_buf_vmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map);
+void dma_buf_vunmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map);
 #endif /* __DMA_BUF_H__ */
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index be572c3a4dcdd..ebe78bd3d121d 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -257,6 +257,26 @@ struct dma_fence_ops {
 	 */
 	void (*timeline_value_str)(struct dma_fence *fence,
 				   char *str, int size);
+
+	/**
+	 * @set_deadline:
+	 *
+	 * Callback to allow a fence waiter to inform the fence signaler of
+	 * an upcoming deadline, such as vblank, by which point the waiter
+	 * would prefer the fence to be signaled by.  This is intended to
+	 * give feedback to the fence signaler to aid in power management
+	 * decisions, such as boosting GPU frequency.
+	 *
+	 * This is called without &dma_fence.lock held, it can be called
+	 * multiple times and from any context.  Locking is up to the callee
+	 * if it has some state to manage.  If multiple deadlines are set,
+	 * the expectation is to track the soonest one.  If the deadline is
+	 * before the current time, it should be interpreted as an immediate
+	 * deadline.
+	 *
+	 * This callback is optional.
+	 */
+	void (*set_deadline)(struct dma_fence *fence, ktime_t deadline);
 };
 
 void dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops,
@@ -548,6 +568,25 @@ static inline void dma_fence_set_error(struct dma_fence *fence,
 	fence->error = error;
 }
 
+/**
+ * dma_fence_timestamp - helper to get the completion timestamp of a fence
+ * @fence: fence to get the timestamp from.
+ *
+ * After a fence is signaled the timestamp is updated with the signaling time,
+ * but setting the timestamp can race with tasks waiting for the signaling. This
+ * helper busy waits for the correct timestamp to appear.
+ */
+static inline ktime_t dma_fence_timestamp(struct dma_fence *fence)
+{
+	if (WARN_ON(!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)))
+		return ktime_get();
+
+	while (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags))
+		cpu_relax();
+
+	return fence->timestamp;
+}
+
 signed long dma_fence_wait_timeout(struct dma_fence *,
 				   bool intr, signed long timeout);
 signed long dma_fence_wait_any_timeout(struct dma_fence **fences,
@@ -583,6 +622,8 @@ static inline signed long dma_fence_wait(struct dma_fence *fence, bool intr)
 	return ret < 0 ? ret : 0;
 }
 
+void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline);
+
 struct dma_fence *dma_fence_get_stub(void);
 struct dma_fence *dma_fence_allocate_private_stub(ktime_t timestamp);
 u64 dma_fence_context_alloc(unsigned num);
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 1f068dfdb140c..9e0bffe3fe5f5 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -100,6 +100,30 @@ struct io_pgtable_cfg {
 	const struct iommu_flush_ops	*tlb;
 	struct device			*iommu_dev;
 
+	/**
+	 * @alloc: Custom page allocator.
+	 *
+	 * Optional hook used to allocate page tables. If this function is NULL,
+	 * @free must be NULL too.
+	 *
+	 * Memory returned should be zeroed and suitable for dma_map_single() and
+	 * virt_to_phys().
+	 *
+	 * Not all formats support custom page allocators. Before considering
+	 * passing a non-NULL value, make sure the chosen page format supports
+	 * this feature.
+	 */
+	void *(*alloc)(void *cookie, size_t size, gfp_t gfp);
+
+	/**
+	 * @free: Custom page de-allocator.
+	 *
+	 * Optional hook used to free page tables allocated with the @alloc
+	 * hook. Must be non-NULL if @alloc is not NULL, must be NULL
+	 * otherwise.
+	 */
+	void (*free)(void *cookie, void *pages, size_t size);
+
 	/* Low-level data specific to the table format */
 	union {
 		struct {
@@ -243,16 +267,26 @@ io_pgtable_tlb_add_page(struct io_pgtable *iop,
 		iop->cfg.tlb->tlb_add_page(gather, iova, granule, iop->cookie);
 }
 
+/**
+ * enum io_pgtable_caps - IO page table backend capabilities.
+ */
+enum io_pgtable_caps {
+	/** @IO_PGTABLE_CAP_CUSTOM_ALLOCATOR: Backend accepts custom page table allocators. */
+	IO_PGTABLE_CAP_CUSTOM_ALLOCATOR = BIT(0),
+};
+
 /**
  * struct io_pgtable_init_fns - Alloc/free a set of page tables for a
  *                              particular format.
  *
  * @alloc: Allocate a set of page tables described by cfg.
  * @free:  Free the page tables associated with iop.
+ * @caps:  Combination of @io_pgtable_caps flags encoding the backend capabilities.
  */
 struct io_pgtable_init_fns {
 	struct io_pgtable *(*alloc)(struct io_pgtable_cfg *cfg, void *cookie);
 	void (*free)(struct io_pgtable *iop);
+	u32 caps;
 };
 
 extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns;
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
new file mode 100644
index 0000000000000..373df80f41edf
--- /dev/null
+++ b/include/uapi/drm/panthor_drm.h
@@ -0,0 +1,945 @@
+/* SPDX-License-Identifier: MIT */
+/* Copyright (C) 2023 Collabora ltd. */
+#ifndef _PANTHOR_DRM_H_
+#define _PANTHOR_DRM_H_
+
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/**
+ * DOC: Introduction
+ *
+ * This documentation describes the Panthor IOCTLs.
+ *
+ * Just a few generic rules about the data passed to the Panthor IOCTLs:
+ *
+ * - Structures must be aligned on 64-bit/8-byte. If the object is not
+ *   naturally aligned, a padding field must be added.
+ * - Fields must be explicitly aligned to their natural type alignment with
+ *   pad[0..N] fields.
+ * - All padding fields will be checked by the driver to make sure they are
+ *   zeroed.
+ * - Flags can be added, but not removed/replaced.
+ * - New fields can be added to the main structures (the structures
+ *   directly passed to the ioctl). Those fields can be added at the end of
+ *   the structure, or replace existing padding fields. Any new field being
+ *   added must preserve the behavior that existed before those fields were
+ *   added when a value of zero is passed.
+ * - New fields can be added to indirect objects (objects pointed by the
+ *   main structure), iff those objects are passed a size to reflect the
+ *   size known by the userspace driver (see drm_panthor_obj_array::stride
+ *   or drm_panthor_dev_query::size).
+ * - If the kernel driver is too old to know some fields, those will be
+ *   ignored if zero, and otherwise rejected (and so will be zero on output).
+ * - If userspace is too old to know some fields, those will be zeroed
+ *   (input) before the structure is parsed by the kernel driver.
+ * - Each new flag/field addition must come with a driver version update so
+ *   the userspace driver doesn't have to trial and error to know which
+ *   flags are supported.
+ * - Structures should not contain unions, as this would defeat the
+ *   extensibility of such structures.
+ * - IOCTLs can't be removed or replaced. New IOCTL IDs should be placed
+ *   at the end of the drm_panthor_ioctl_id enum.
+ */
+
+/**
+ * DOC: MMIO regions exposed to userspace.
+ *
+ * .. c:macro:: DRM_PANTHOR_USER_MMIO_OFFSET
+ *
+ * File offset for all MMIO regions being exposed to userspace. Don't use
+ * this value directly, use DRM_PANTHOR_USER_<name>_OFFSET values instead.
+ * pgoffset passed to mmap2() is an unsigned long, which forces us to use a
+ * different offset on 32-bit and 64-bit systems.
+ *
+ * .. c:macro:: DRM_PANTHOR_USER_FLUSH_ID_MMIO_OFFSET
+ *
+ * File offset for the LATEST_FLUSH_ID register. The Userspace driver controls
+ * GPU cache flushing through CS instructions, but the flush reduction
+ * mechanism requires a flush_id. This flush_id could be queried with an
+ * ioctl, but Arm provides a well-isolated register page containing only this
+ * read-only register, so let's expose this page through a static mmap offset
+ * and allow direct mapping of this MMIO region so we can avoid the
+ * user <-> kernel round-trip.
+ */
+#define DRM_PANTHOR_USER_MMIO_OFFSET_32BIT	(1ull << 43)
+#define DRM_PANTHOR_USER_MMIO_OFFSET_64BIT	(1ull << 56)
+#define DRM_PANTHOR_USER_MMIO_OFFSET		(sizeof(unsigned long) < 8 ? \
+						 DRM_PANTHOR_USER_MMIO_OFFSET_32BIT : \
+						 DRM_PANTHOR_USER_MMIO_OFFSET_64BIT)
+#define DRM_PANTHOR_USER_FLUSH_ID_MMIO_OFFSET	(DRM_PANTHOR_USER_MMIO_OFFSET | 0)
+
+/**
+ * DOC: IOCTL IDs
+ *
+ * enum drm_panthor_ioctl_id - IOCTL IDs
+ *
+ * Place new ioctls at the end, don't re-order, don't replace or remove entries.
+ *
+ * These IDs are not meant to be used directly. Use the DRM_IOCTL_PANTHOR_xxx
+ * definitions instead.
+ */
+enum drm_panthor_ioctl_id {
+	/** @DRM_PANTHOR_DEV_QUERY: Query device information. */
+	DRM_PANTHOR_DEV_QUERY = 0,
+
+	/** @DRM_PANTHOR_VM_CREATE: Create a VM. */
+	DRM_PANTHOR_VM_CREATE,
+
+	/** @DRM_PANTHOR_VM_DESTROY: Destroy a VM. */
+	DRM_PANTHOR_VM_DESTROY,
+
+	/** @DRM_PANTHOR_VM_BIND: Bind/unbind memory to a VM. */
+	DRM_PANTHOR_VM_BIND,
+
+	/** @DRM_PANTHOR_VM_GET_STATE: Get VM state. */
+	DRM_PANTHOR_VM_GET_STATE,
+
+	/** @DRM_PANTHOR_BO_CREATE: Create a buffer object. */
+	DRM_PANTHOR_BO_CREATE,
+
+	/**
+	 * @DRM_PANTHOR_BO_MMAP_OFFSET: Get the file offset to pass to
+	 * mmap to map a GEM object.
+	 */
+	DRM_PANTHOR_BO_MMAP_OFFSET,
+
+	/** @DRM_PANTHOR_GROUP_CREATE: Create a scheduling group. */
+	DRM_PANTHOR_GROUP_CREATE,
+
+	/** @DRM_PANTHOR_GROUP_DESTROY: Destroy a scheduling group. */
+	DRM_PANTHOR_GROUP_DESTROY,
+
+	/**
+	 * @DRM_PANTHOR_GROUP_SUBMIT: Submit jobs to queues belonging
+	 * to a specific scheduling group.
+	 */
+	DRM_PANTHOR_GROUP_SUBMIT,
+
+	/** @DRM_PANTHOR_GROUP_GET_STATE: Get the state of a scheduling group. */
+	DRM_PANTHOR_GROUP_GET_STATE,
+
+	/** @DRM_PANTHOR_TILER_HEAP_CREATE: Create a tiler heap. */
+	DRM_PANTHOR_TILER_HEAP_CREATE,
+
+	/** @DRM_PANTHOR_TILER_HEAP_DESTROY: Destroy a tiler heap. */
+	DRM_PANTHOR_TILER_HEAP_DESTROY,
+};
+
+/**
+ * DRM_IOCTL_PANTHOR() - Build a Panthor IOCTL number
+ * @__access: Access type. Must be R, W or RW.
+ * @__id: One of the DRM_PANTHOR_xxx id.
+ * @__type: Suffix of the type being passed to the IOCTL.
+ *
+ * Don't use this macro directly, use the DRM_IOCTL_PANTHOR_xxx
+ * values instead.
+ *
+ * Return: An IOCTL number to be passed to ioctl() from userspace.
+ */
+#define DRM_IOCTL_PANTHOR(__access, __id, __type) \
+	DRM_IO ## __access(DRM_COMMAND_BASE + DRM_PANTHOR_ ## __id, \
+			   struct drm_panthor_ ## __type)
+
+#define DRM_IOCTL_PANTHOR_DEV_QUERY \
+	DRM_IOCTL_PANTHOR(WR, DEV_QUERY, dev_query)
+#define DRM_IOCTL_PANTHOR_VM_CREATE \
+	DRM_IOCTL_PANTHOR(WR, VM_CREATE, vm_create)
+#define DRM_IOCTL_PANTHOR_VM_DESTROY \
+	DRM_IOCTL_PANTHOR(WR, VM_DESTROY, vm_destroy)
+#define DRM_IOCTL_PANTHOR_VM_BIND \
+	DRM_IOCTL_PANTHOR(WR, VM_BIND, vm_bind)
+#define DRM_IOCTL_PANTHOR_VM_GET_STATE \
+	DRM_IOCTL_PANTHOR(WR, VM_GET_STATE, vm_get_state)
+#define DRM_IOCTL_PANTHOR_BO_CREATE \
+	DRM_IOCTL_PANTHOR(WR, BO_CREATE, bo_create)
+#define DRM_IOCTL_PANTHOR_BO_MMAP_OFFSET \
+	DRM_IOCTL_PANTHOR(WR, BO_MMAP_OFFSET, bo_mmap_offset)
+#define DRM_IOCTL_PANTHOR_GROUP_CREATE \
+	DRM_IOCTL_PANTHOR(WR, GROUP_CREATE, group_create)
+#define DRM_IOCTL_PANTHOR_GROUP_DESTROY \
+	DRM_IOCTL_PANTHOR(WR, GROUP_DESTROY, group_destroy)
+#define DRM_IOCTL_PANTHOR_GROUP_SUBMIT \
+	DRM_IOCTL_PANTHOR(WR, GROUP_SUBMIT, group_submit)
+#define DRM_IOCTL_PANTHOR_GROUP_GET_STATE \
+	DRM_IOCTL_PANTHOR(WR, GROUP_GET_STATE, group_get_state)
+#define DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE \
+	DRM_IOCTL_PANTHOR(WR, TILER_HEAP_CREATE, tiler_heap_create)
+#define DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY \
+	DRM_IOCTL_PANTHOR(WR, TILER_HEAP_DESTROY, tiler_heap_destroy)
+
+/**
+ * DOC: IOCTL arguments
+ */
+
+/**
+ * struct drm_panthor_obj_array - Object array.
+ *
+ * This object is used to pass an array of objects whose size is subject to changes in
+ * future versions of the driver. In order to support this mutability, we pass a stride
+ * describing the size of the object as known by userspace.
+ *
+ * You shouldn't fill drm_panthor_obj_array fields directly. You should instead use
+ * the DRM_PANTHOR_OBJ_ARRAY() macro that takes care of initializing the stride to
+ * the object size.
+ */
+struct drm_panthor_obj_array {
+	/** @stride: Stride of object struct. Used for versioning. */
+	__u32 stride;
+
+	/** @count: Number of objects in the array. */
+	__u32 count;
+
+	/** @array: User pointer to an array of objects. */
+	__u64 array;
+};
+
+/**
+ * DRM_PANTHOR_OBJ_ARRAY() - Initialize a drm_panthor_obj_array field.
+ * @cnt: Number of elements in the array.
+ * @ptr: Pointer to the array to pass to the kernel.
+ *
+ * Macro initializing a drm_panthor_obj_array based on the object size as known
+ * by userspace.
+ */
+#define DRM_PANTHOR_OBJ_ARRAY(cnt, ptr) \
+	{ .stride = sizeof((ptr)[0]), .count = (cnt), .array = (__u64)(uintptr_t)(ptr) }
+
+/**
+ * enum drm_panthor_sync_op_flags - Synchronization operation flags.
+ */
+enum drm_panthor_sync_op_flags {
+	/** @DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_MASK: Synchronization handle type mask. */
+	DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_MASK = 0xff,
+
+	/** @DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ: Synchronization object type. */
+	DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ = 0,
+
+	/**
+	 * @DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ: Timeline synchronization
+	 * object type.
+	 */
+	DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ = 1,
+
+	/** @DRM_PANTHOR_SYNC_OP_WAIT: Wait operation. */
+	DRM_PANTHOR_SYNC_OP_WAIT = 0 << 31,
+
+	/** @DRM_PANTHOR_SYNC_OP_SIGNAL: Signal operation. */
+	DRM_PANTHOR_SYNC_OP_SIGNAL = (int)(1u << 31),
+};
+
+/**
+ * struct drm_panthor_sync_op - Synchronization operation.
+ */
+struct drm_panthor_sync_op {
+	/** @flags: Synchronization operation flags. Combination of DRM_PANTHOR_SYNC_OP values. */
+	__u32 flags;
+
+	/** @handle: Sync handle. */
+	__u32 handle;
+
+	/**
+	 * @timeline_value: MBZ if
+	 * (flags & DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_MASK) !=
+	 * DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ.
+	 */
+	__u64 timeline_value;
+};
+
+/**
+ * enum drm_panthor_dev_query_type - Query type
+ *
+ * Place new types at the end, don't re-order, don't remove or replace.
+ */
+enum drm_panthor_dev_query_type {
+	/** @DRM_PANTHOR_DEV_QUERY_GPU_INFO: Query GPU information. */
+	DRM_PANTHOR_DEV_QUERY_GPU_INFO = 0,
+
+	/** @DRM_PANTHOR_DEV_QUERY_CSIF_INFO: Query command-stream interface information. */
+	DRM_PANTHOR_DEV_QUERY_CSIF_INFO,
+};
+
+/**
+ * struct drm_panthor_gpu_info - GPU information
+ *
+ * Structure grouping all queryable information relating to the GPU.
+ */
+struct drm_panthor_gpu_info {
+	/** @gpu_id : GPU ID. */
+	__u32 gpu_id;
+#define DRM_PANTHOR_ARCH_MAJOR(x)		((x) >> 28)
+#define DRM_PANTHOR_ARCH_MINOR(x)		(((x) >> 24) & 0xf)
+#define DRM_PANTHOR_ARCH_REV(x)			(((x) >> 20) & 0xf)
+#define DRM_PANTHOR_PRODUCT_MAJOR(x)		(((x) >> 16) & 0xf)
+#define DRM_PANTHOR_VERSION_MAJOR(x)		(((x) >> 12) & 0xf)
+#define DRM_PANTHOR_VERSION_MINOR(x)		(((x) >> 4) & 0xff)
+#define DRM_PANTHOR_VERSION_STATUS(x)		((x) & 0xf)
+
+	/** @gpu_rev: GPU revision. */
+	__u32 gpu_rev;
+
+	/** @csf_id: Command stream frontend ID. */
+	__u32 csf_id;
+#define DRM_PANTHOR_CSHW_MAJOR(x)		(((x) >> 26) & 0x3f)
+#define DRM_PANTHOR_CSHW_MINOR(x)		(((x) >> 20) & 0x3f)
+#define DRM_PANTHOR_CSHW_REV(x)			(((x) >> 16) & 0xf)
+#define DRM_PANTHOR_MCU_MAJOR(x)		(((x) >> 10) & 0x3f)
+#define DRM_PANTHOR_MCU_MINOR(x)		(((x) >> 4) & 0x3f)
+#define DRM_PANTHOR_MCU_REV(x)			((x) & 0xf)
+
+	/** @l2_features: L2-cache features. */
+	__u32 l2_features;
+
+	/** @tiler_features: Tiler features. */
+	__u32 tiler_features;
+
+	/** @mem_features: Memory features. */
+	__u32 mem_features;
+
+	/** @mmu_features: MMU features. */
+	__u32 mmu_features;
+#define DRM_PANTHOR_MMU_VA_BITS(x)		((x) & 0xff)
+
+	/** @thread_features: Thread features. */
+	__u32 thread_features;
+
+	/** @max_threads: Maximum number of threads. */
+	__u32 max_threads;
+
+	/** @thread_max_workgroup_size: Maximum workgroup size. */
+	__u32 thread_max_workgroup_size;
+
+	/**
+	 * @thread_max_barrier_size: Maximum number of threads that can wait
+	 * simultaneously on a barrier.
+	 */
+	__u32 thread_max_barrier_size;
+
+	/** @coherency_features: Coherency features. */
+	__u32 coherency_features;
+
+	/** @texture_features: Texture features. */
+	__u32 texture_features[4];
+
+	/** @as_present: Bitmask encoding the number of address-space exposed by the MMU. */
+	__u32 as_present;
+
+	/** @shader_present: Bitmask encoding the shader cores exposed by the GPU. */
+	__u64 shader_present;
+
+	/** @l2_present: Bitmask encoding the L2 caches exposed by the GPU. */
+	__u64 l2_present;
+
+	/** @tiler_present: Bitmask encoding the tiler units exposed by the GPU. */
+	__u64 tiler_present;
+
+	/* @core_features: Used to discriminate core variants when they exist. */
+	__u32 core_features;
+
+	/* @pad: MBZ. */
+	__u32 pad;
+};
+
+/**
+ * struct drm_panthor_csif_info - Command stream interface information
+ *
+ * Structure grouping all queryable information relating to the command stream interface.
+ */
+struct drm_panthor_csif_info {
+	/** @csg_slot_count: Number of command stream group slots exposed by the firmware. */
+	__u32 csg_slot_count;
+
+	/** @cs_slot_count: Number of command stream slots per group. */
+	__u32 cs_slot_count;
+
+	/** @cs_reg_count: Number of command stream registers. */
+	__u32 cs_reg_count;
+
+	/** @scoreboard_slot_count: Number of scoreboard slots. */
+	__u32 scoreboard_slot_count;
+
+	/**
+	 * @unpreserved_cs_reg_count: Number of command stream registers reserved by
+	 * the kernel driver to call a userspace command stream.
+	 *
+	 * All registers can be used by a userspace command stream, but the
+	 * [cs_slot_count - unpreserved_cs_reg_count .. cs_slot_count] registers are
+	 * used by the kernel when DRM_PANTHOR_IOCTL_GROUP_SUBMIT is called.
+	 */
+	__u32 unpreserved_cs_reg_count;
+
+	/**
+	 * @pad: Padding field, set to zero.
+	 */
+	__u32 pad;
+};
+
+/**
+ * struct drm_panthor_dev_query - Arguments passed to DRM_PANTHOR_IOCTL_DEV_QUERY
+ */
+struct drm_panthor_dev_query {
+	/** @type: the query type (see drm_panthor_dev_query_type). */
+	__u32 type;
+
+	/**
+	 * @size: size of the type being queried.
+	 *
+	 * If pointer is NULL, size is updated by the driver to provide the
+	 * output structure size. If pointer is not NULL, the driver will
+	 * only copy min(size, actual_structure_size) bytes to the pointer,
+	 * and update the size accordingly. This allows us to extend query
+	 * types without breaking userspace.
+	 */
+	__u32 size;
+
+	/**
+	 * @pointer: user pointer to a query type struct.
+	 *
+	 * Pointer can be NULL, in which case, nothing is copied, but the
+	 * actual structure size is returned. If not NULL, it must point to
+	 * a location that's large enough to hold size bytes.
+	 */
+	__u64 pointer;
+};
+
+/**
+ * struct drm_panthor_vm_create - Arguments passed to DRM_PANTHOR_IOCTL_VM_CREATE
+ */
+struct drm_panthor_vm_create {
+	/** @flags: VM flags, MBZ. */
+	__u32 flags;
+
+	/** @id: Returned VM ID. */
+	__u32 id;
+
+	/**
+	 * @user_va_range: Size of the VA space reserved for user objects.
+	 *
+	 * The kernel will pick the remaining space to map kernel-only objects to the
+	 * VM (heap chunks, heap context, ring buffers, kernel synchronization objects,
+	 * ...). If the space left for kernel objects is too small, kernel object
+	 * allocation will fail further down the road. One can use
+	 * drm_panthor_gpu_info::mmu_features to extract the total virtual address
+	 * range, and chose a user_va_range that leaves some space to the kernel.
+	 *
+	 * If user_va_range is zero, the kernel will pick a sensible value based on
+	 * TASK_SIZE and the virtual range supported by the GPU MMU (the kernel/user
+	 * split should leave enough VA space for userspace processes to support SVM,
+	 * while still allowing the kernel to map some amount of kernel objects in
+	 * the kernel VA range). The value chosen by the driver will be returned in
+	 * @user_va_range.
+	 *
+	 * User VA space always starts at 0x0, kernel VA space is always placed after
+	 * the user VA range.
+	 */
+	__u64 user_va_range;
+};
+
+/**
+ * struct drm_panthor_vm_destroy - Arguments passed to DRM_PANTHOR_IOCTL_VM_DESTROY
+ */
+struct drm_panthor_vm_destroy {
+	/** @id: ID of the VM to destroy. */
+	__u32 id;
+
+	/** @pad: MBZ. */
+	__u32 pad;
+};
+
+/**
+ * enum drm_panthor_vm_bind_op_flags - VM bind operation flags
+ */
+enum drm_panthor_vm_bind_op_flags {
+	/**
+	 * @DRM_PANTHOR_VM_BIND_OP_MAP_READONLY: Map the memory read-only.
+	 *
+	 * Only valid with DRM_PANTHOR_VM_BIND_OP_TYPE_MAP.
+	 */
+	DRM_PANTHOR_VM_BIND_OP_MAP_READONLY = 1 << 0,
+
+	/**
+	 * @DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC: Map the memory not-executable.
+	 *
+	 * Only valid with DRM_PANTHOR_VM_BIND_OP_TYPE_MAP.
+	 */
+	DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC = 1 << 1,
+
+	/**
+	 * @DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED: Map the memory uncached.
+	 *
+	 * Only valid with DRM_PANTHOR_VM_BIND_OP_TYPE_MAP.
+	 */
+	DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED = 1 << 2,
+
+	/**
+	 * @DRM_PANTHOR_VM_BIND_OP_TYPE_MASK: Mask used to determine the type of operation.
+	 */
+	DRM_PANTHOR_VM_BIND_OP_TYPE_MASK = (int)(0xfu << 28),
+
+	/** @DRM_PANTHOR_VM_BIND_OP_TYPE_MAP: Map operation. */
+	DRM_PANTHOR_VM_BIND_OP_TYPE_MAP = 0 << 28,
+
+	/** @DRM_PANTHOR_VM_BIND_OP_TYPE_UNMAP: Unmap operation. */
+	DRM_PANTHOR_VM_BIND_OP_TYPE_UNMAP = 1 << 28,
+
+	/**
+	 * @DRM_PANTHOR_VM_BIND_OP_TYPE_SYNC_ONLY: No VM operation.
+	 *
+	 * Just serves as a synchronization point on a VM queue.
+	 *
+	 * Only valid if %DRM_PANTHOR_VM_BIND_ASYNC is set in drm_panthor_vm_bind::flags,
+	 * and drm_panthor_vm_bind_op::syncs contains at least one element.
+	 */
+	DRM_PANTHOR_VM_BIND_OP_TYPE_SYNC_ONLY = 2 << 28,
+};
+
+/**
+ * struct drm_panthor_vm_bind_op - VM bind operation
+ */
+struct drm_panthor_vm_bind_op {
+	/** @flags: Combination of drm_panthor_vm_bind_op_flags flags. */
+	__u32 flags;
+
+	/**
+	 * @bo_handle: Handle of the buffer object to map.
+	 * MBZ for unmap or sync-only operations.
+	 */
+	__u32 bo_handle;
+
+	/**
+	 * @bo_offset: Buffer object offset.
+	 * MBZ for unmap or sync-only operations.
+	 */
+	__u64 bo_offset;
+
+	/**
+	 * @va: Virtual address to map/unmap.
+	 * MBZ for sync-only operations.
+	 */
+	__u64 va;
+
+	/**
+	 * @size: Size to map/unmap.
+	 * MBZ for sync-only operations.
+	 */
+	__u64 size;
+
+	/**
+	 * @syncs: Array of struct drm_panthor_sync_op synchronization
+	 * operations.
+	 *
+	 * This array must be empty if %DRM_PANTHOR_VM_BIND_ASYNC is not set on
+	 * the drm_panthor_vm_bind object containing this VM bind operation.
+	 *
+	 * This array shall not be empty for sync-only operations.
+	 */
+	struct drm_panthor_obj_array syncs;
+
+};
+
+/**
+ * enum drm_panthor_vm_bind_flags - VM bind flags
+ */
+enum drm_panthor_vm_bind_flags {
+	/**
+	 * @DRM_PANTHOR_VM_BIND_ASYNC: VM bind operations are queued to the VM
+	 * queue instead of being executed synchronously.
+	 */
+	DRM_PANTHOR_VM_BIND_ASYNC = 1 << 0,
+};
+
+/**
+ * struct drm_panthor_vm_bind - Arguments passed to DRM_IOCTL_PANTHOR_VM_BIND
+ */
+struct drm_panthor_vm_bind {
+	/** @vm_id: VM targeted by the bind request. */
+	__u32 vm_id;
+
+	/** @flags: Combination of drm_panthor_vm_bind_flags flags. */
+	__u32 flags;
+
+	/** @ops: Array of struct drm_panthor_vm_bind_op bind operations. */
+	struct drm_panthor_obj_array ops;
+};
+
+/**
+ * enum drm_panthor_vm_state - VM states.
+ */
+enum drm_panthor_vm_state {
+	/**
+	 * @DRM_PANTHOR_VM_STATE_USABLE: VM is usable.
+	 *
+	 * New VM operations will be accepted on this VM.
+	 */
+	DRM_PANTHOR_VM_STATE_USABLE,
+
+	/**
+	 * @DRM_PANTHOR_VM_STATE_UNUSABLE: VM is unusable.
+	 *
+	 * Something put the VM in an unusable state (like an asynchronous
+	 * VM_BIND request failing for any reason).
+	 *
+	 * Once the VM is in this state, all new MAP operations will be
+	 * rejected, and any GPU job targeting this VM will fail.
+	 * UNMAP operations are still accepted.
+	 *
+	 * The only way to recover from an unusable VM is to create a new
+	 * VM, and destroy the old one.
+	 */
+	DRM_PANTHOR_VM_STATE_UNUSABLE,
+};
+
+/**
+ * struct drm_panthor_vm_get_state - Get VM state.
+ */
+struct drm_panthor_vm_get_state {
+	/** @vm_id: VM targeted by the get_state request. */
+	__u32 vm_id;
+
+	/**
+	 * @state: state returned by the driver.
+	 *
+	 * Must be one of the enum drm_panthor_vm_state values.
+	 */
+	__u32 state;
+};
+
+/**
+ * enum drm_panthor_bo_flags - Buffer object flags, passed at creation time.
+ */
+enum drm_panthor_bo_flags {
+	/** @DRM_PANTHOR_BO_NO_MMAP: The buffer object will never be CPU-mapped in userspace. */
+	DRM_PANTHOR_BO_NO_MMAP = (1 << 0),
+};
+
+/**
+ * struct drm_panthor_bo_create - Arguments passed to DRM_IOCTL_PANTHOR_BO_CREATE.
+ */
+struct drm_panthor_bo_create {
+	/**
+	 * @size: Requested size for the object
+	 *
+	 * The (page-aligned) allocated size for the object will be returned.
+	 */
+	__u64 size;
+
+	/**
+	 * @flags: Flags. Must be a combination of drm_panthor_bo_flags flags.
+	 */
+	__u32 flags;
+
+	/**
+	 * @exclusive_vm_id: Exclusive VM this buffer object will be mapped to.
+	 *
+	 * If not zero, the field must refer to a valid VM ID, and implies that:
+	 *  - the buffer object will only ever be bound to that VM
+	 *  - cannot be exported as a PRIME fd
+	 */
+	__u32 exclusive_vm_id;
+
+	/**
+	 * @handle: Returned handle for the object.
+	 *
+	 * Object handles are nonzero.
+	 */
+	__u32 handle;
+
+	/** @pad: MBZ. */
+	__u32 pad;
+};
+
+/**
+ * struct drm_panthor_bo_mmap_offset - Arguments passed to DRM_IOCTL_PANTHOR_BO_MMAP_OFFSET.
+ */
+struct drm_panthor_bo_mmap_offset {
+	/** @handle: Handle of the object we want an mmap offset for. */
+	__u32 handle;
+
+	/** @pad: MBZ. */
+	__u32 pad;
+
+	/** @offset: The fake offset to use for subsequent mmap calls. */
+	__u64 offset;
+};
+
+/**
+ * struct drm_panthor_queue_create - Queue creation arguments.
+ */
+struct drm_panthor_queue_create {
+	/**
+	 * @priority: Defines the priority of queues inside a group. Goes from 0 to 15,
+	 * 15 being the highest priority.
+	 */
+	__u8 priority;
+
+	/** @pad: Padding fields, MBZ. */
+	__u8 pad[3];
+
+	/** @ringbuf_size: Size of the ring buffer to allocate to this queue. */
+	__u32 ringbuf_size;
+};
+
+/**
+ * enum drm_panthor_group_priority - Scheduling group priority
+ */
+enum drm_panthor_group_priority {
+	/** @PANTHOR_GROUP_PRIORITY_LOW: Low priority group. */
+	PANTHOR_GROUP_PRIORITY_LOW = 0,
+
+	/** @PANTHOR_GROUP_PRIORITY_MEDIUM: Medium priority group. */
+	PANTHOR_GROUP_PRIORITY_MEDIUM,
+
+	/** @PANTHOR_GROUP_PRIORITY_HIGH: High priority group. */
+	PANTHOR_GROUP_PRIORITY_HIGH,
+};
+
+/**
+ * struct drm_panthor_group_create - Arguments passed to DRM_IOCTL_PANTHOR_GROUP_CREATE
+ */
+struct drm_panthor_group_create {
+	/** @queues: Array of drm_panthor_queue_create elements. */
+	struct drm_panthor_obj_array queues;
+
+	/**
+	 * @max_compute_cores: Maximum number of cores that can be used by compute
+	 * jobs across CS queues bound to this group.
+	 *
+	 * Must be less or equal to the number of bits set in @compute_core_mask.
+	 */
+	__u8 max_compute_cores;
+
+	/**
+	 * @max_fragment_cores: Maximum number of cores that can be used by fragment
+	 * jobs across CS queues bound to this group.
+	 *
+	 * Must be less or equal to the number of bits set in @fragment_core_mask.
+	 */
+	__u8 max_fragment_cores;
+
+	/**
+	 * @max_tiler_cores: Maximum number of tilers that can be used by tiler jobs
+	 * across CS queues bound to this group.
+	 *
+	 * Must be less or equal to the number of bits set in @tiler_core_mask.
+	 */
+	__u8 max_tiler_cores;
+
+	/** @priority: Group priority (see enum drm_panthor_group_priority). */
+	__u8 priority;
+
+	/** @pad: Padding field, MBZ. */
+	__u32 pad;
+
+	/**
+	 * @compute_core_mask: Mask encoding cores that can be used for compute jobs.
+	 *
+	 * This field must have at least @max_compute_cores bits set.
+	 *
+	 * The bits set here should also be set in drm_panthor_gpu_info::shader_present.
+	 */
+	__u64 compute_core_mask;
+
+	/**
+	 * @fragment_core_mask: Mask encoding cores that can be used for fragment jobs.
+	 *
+	 * This field must have at least @max_fragment_cores bits set.
+	 *
+	 * The bits set here should also be set in drm_panthor_gpu_info::shader_present.
+	 */
+	__u64 fragment_core_mask;
+
+	/**
+	 * @tiler_core_mask: Mask encoding cores that can be used for tiler jobs.
+	 *
+	 * This field must have at least @max_tiler_cores bits set.
+	 *
+	 * The bits set here should also be set in drm_panthor_gpu_info::tiler_present.
+	 */
+	__u64 tiler_core_mask;
+
+	/**
+	 * @vm_id: VM ID to bind this group to.
+	 *
+	 * All submission to queues bound to this group will use this VM.
+	 */
+	__u32 vm_id;
+
+	/**
+	 * @group_handle: Returned group handle. Passed back when submitting jobs or
+	 * destroying a group.
+	 */
+	__u32 group_handle;
+};
+
+/**
+ * struct drm_panthor_group_destroy - Arguments passed to DRM_IOCTL_PANTHOR_GROUP_DESTROY
+ */
+struct drm_panthor_group_destroy {
+	/** @group_handle: Group to destroy */
+	__u32 group_handle;
+
+	/** @pad: Padding field, MBZ. */
+	__u32 pad;
+};
+
+/**
+ * struct drm_panthor_queue_submit - Job submission arguments.
+ *
+ * This is describing the userspace command stream to call from the kernel
+ * command stream ring-buffer. Queue submission is always part of a group
+ * submission, taking one or more jobs to submit to the underlying queues.
+ */
+struct drm_panthor_queue_submit {
+	/** @queue_index: Index of the queue inside a group. */
+	__u32 queue_index;
+
+	/**
+	 * @stream_size: Size of the command stream to execute.
+	 *
+	 * Must be 64-bit/8-byte aligned (the size of a CS instruction)
+	 *
+	 * Can be zero if stream_addr is zero too.
+	 */
+	__u32 stream_size;
+
+	/**
+	 * @stream_addr: GPU address of the command stream to execute.
+	 *
+	 * Must be aligned on 64-byte.
+	 *
+	 * Can be zero is stream_size is zero too.
+	 */
+	__u64 stream_addr;
+
+	/**
+	 * @latest_flush: FLUSH_ID read at the time the stream was built.
+	 *
+	 * This allows cache flush elimination for the automatic
+	 * flush+invalidate(all) done at submission time, which is needed to
+	 * ensure the GPU doesn't get garbage when reading the indirect command
+	 * stream buffers. If you want the cache flush to happen
+	 * unconditionally, pass a zero here.
+	 */
+	__u32 latest_flush;
+
+	/** @pad: MBZ. */
+	__u32 pad;
+
+	/** @syncs: Array of struct drm_panthor_sync_op sync operations. */
+	struct drm_panthor_obj_array syncs;
+};
+
+/**
+ * struct drm_panthor_group_submit - Arguments passed to DRM_IOCTL_PANTHOR_GROUP_SUBMIT
+ */
+struct drm_panthor_group_submit {
+	/** @group_handle: Handle of the group to queue jobs to. */
+	__u32 group_handle;
+
+	/** @pad: MBZ. */
+	__u32 pad;
+
+	/** @queue_submits: Array of drm_panthor_queue_submit objects. */
+	struct drm_panthor_obj_array queue_submits;
+};
+
+/**
+ * enum drm_panthor_group_state_flags - Group state flags
+ */
+enum drm_panthor_group_state_flags {
+	/**
+	 * @DRM_PANTHOR_GROUP_STATE_TIMEDOUT: Group had unfinished jobs.
+	 *
+	 * When a group ends up with this flag set, no jobs can be submitted to its queues.
+	 */
+	DRM_PANTHOR_GROUP_STATE_TIMEDOUT = 1 << 0,
+
+	/**
+	 * @DRM_PANTHOR_GROUP_STATE_FATAL_FAULT: Group had fatal faults.
+	 *
+	 * When a group ends up with this flag set, no jobs can be submitted to its queues.
+	 */
+	DRM_PANTHOR_GROUP_STATE_FATAL_FAULT = 1 << 1,
+};
+
+/**
+ * struct drm_panthor_group_get_state - Arguments passed to DRM_IOCTL_PANTHOR_GROUP_GET_STATE
+ *
+ * Used to query the state of a group and decide whether a new group should be created to
+ * replace it.
+ */
+struct drm_panthor_group_get_state {
+	/** @group_handle: Handle of the group to query state on */
+	__u32 group_handle;
+
+	/**
+	 * @state: Combination of DRM_PANTHOR_GROUP_STATE_* flags encoding the
+	 * group state.
+	 */
+	__u32 state;
+
+	/** @fatal_queues: Bitmask of queues that faced fatal faults. */
+	__u32 fatal_queues;
+
+	/** @pad: MBZ */
+	__u32 pad;
+};
+
+/**
+ * struct drm_panthor_tiler_heap_create - Arguments passed to DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE
+ */
+struct drm_panthor_tiler_heap_create {
+	/** @vm_id: VM ID the tiler heap should be mapped to */
+	__u32 vm_id;
+
+	/** @initial_chunk_count: Initial number of chunks to allocate. */
+	__u32 initial_chunk_count;
+
+	/** @chunk_size: Chunk size. Must be a power of two at least 256KB large. */
+	__u32 chunk_size;
+
+	/** @max_chunks: Maximum number of chunks that can be allocated. */
+	__u32 max_chunks;
+
+	/**
+	 * @target_in_flight: Maximum number of in-flight render passes.
+	 *
+	 * If the heap has more than tiler jobs in-flight, the FW will wait for render
+	 * passes to finish before queuing new tiler jobs.
+	 */
+	__u32 target_in_flight;
+
+	/** @handle: Returned heap handle. Passed back to DESTROY_TILER_HEAP. */
+	__u32 handle;
+
+	/** @tiler_heap_ctx_gpu_va: Returned heap GPU virtual address returned */
+	__u64 tiler_heap_ctx_gpu_va;
+
+	/**
+	 * @first_heap_chunk_gpu_va: First heap chunk.
+	 *
+	 * The tiler heap is formed of heap chunks forming a single-link list. This
+	 * is the first element in the list.
+	 */
+	__u64 first_heap_chunk_gpu_va;
+};
+
+/**
+ * struct drm_panthor_tiler_heap_destroy - Arguments passed to DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY
+ */
+struct drm_panthor_tiler_heap_destroy {
+	/** @handle: Handle of the tiler heap to destroy */
+	__u32 handle;
+
+	/** @pad: Padding field, MBZ. */
+	__u32 pad;
+};
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* _PANTHOR_DRM_H_ */