From 51d4814a7e857fe520d2bf337bbf86ed0a458013 Mon Sep 17 00:00:00 2001
From: Unity Technologies <@unity>
Date: Mon, 17 Apr 2023 00:00:00 +0000
Subject: [PATCH] com.unity.entities.graphics@1.0.8 ## [1.0.8] - 2023-04-17

### Added

* Support for CPU-based (Burst) masked occlusion culling on Neon processors.
* Explicit usage of Burst's FMA optimization, parallel sort, and missing changes from previous review feedback of boc-neon branch

### Changed

* Greatly improved performance of CPU-based (Burst) masked occlusion culling.
* Greatly improved performance of Depth and Test debug views used with CPU-based (Burst) masked occlusion culling.
* Reduced the amount of memory allocated by allocating based on the maximum number of worker threads the running platform requires rather than defaulting to using a theoretical upper-bound of 128 worker threads.

### Fixed

* Entities Graphics Occlusion shader throws errors when building the project
* Fixed a GraphicsBuffer leak that could occur in cases where Entities Graphics is running without any entities to render.
* enabling/disabling per-view occlusion
---
 .footignore                                   |    1 -
 CHANGELOG.md                                  |   21 +
 Documentation~/TableOfContents.md             |   18 +-
 ...usion-culling-components-occlusion-view.md |    4 +-
 .../burst-occlusion-culling-components.md     |    6 +-
 .../burst-occlusion-culling-debug.md          |    2 +-
 .../burst-occlusion-culling-optimize.md       |    8 +-
 .../burst-occlusion-culling-overview.md       |   16 +-
 .../burst-occlusion-culling-requirements.md   |   14 +-
 .../burst-occlusion-culling-setup.md          |   16 +-
 Documentation~/burst-occlusion-culling.md     |   18 +-
 Documentation~/index.md                       |    4 +
 Documentation~/pre-release.md                 |    3 +
 .../Deformations/DeformationSystemGroup.cs    |    4 +-
 .../DrawCommandGeneration.cs                  |   14 +-
 .../EntitiesGraphicsLightBakingDataSystem.cs  |    7 +-
 .../EntitiesGraphicsSystem.cs                 |   64 +-
 .../LODRequirementsUpdateSystem.cs            |    8 +-
 Unity.Entities.Graphics/LightMaps.cs          |    5 +-
 Unity.Entities.Graphics/MaterialColor.cs      |    5 +-
 .../Occlusion/Masked/BufferGroup.cs           |   47 +-
 .../Occlusion/Masked/Dots/OccluderMesh.cs     |    3 +
 .../Occlusion/Masked/IntrinsicUtils.cs        |  123 +-
 .../Occlusion/Masked/RasterizeJob.cs          | 1808 ++++++++++++-----
 .../Occlusion/Masked/TestJob.cs               |  188 +-
 .../Masked/Visualization/DebugSettings.cs     |   12 +-
 .../Masked/Visualization/DebugView.cs         |   18 +-
 .../Visualization/DecodeMaskedDepthJob.cs     |   95 +-
 Unity.Entities.Graphics/Occlusion/Occluder.cs |   15 +
 .../Occlusion/OcclusionSortJob.cs             |   25 +-
 .../Occlusion/OcclusionView.cs                |   30 +-
 .../Occlusion/UnityOcclusion.cs               |   59 +-
 .../RenderFilterSettings.cs                   |   17 +-
 Unity.Entities.Graphics/RenderMeshArray.cs    |   14 +-
 .../Occlusion/OcclusionDebugOccluders.shader  |    1 -
 Unity.Entities.Graphics/SparseUploader.cs     |   29 +-
 .../Unity.Entities.Graphics.asmdef            |    8 +-
 .../UpdateEntitiesGraphicsChunksStructure.cs  |    8 +-
 ValidationExceptions.json                     |   10 +
 ValidationExceptions.json.meta                |    7 +
 package.json                                  |   12 +-
 41 files changed, 2043 insertions(+), 724 deletions(-)
 delete mode 100644 .footignore
 create mode 100644 Documentation~/pre-release.md
 create mode 100644 ValidationExceptions.json
 create mode 100644 ValidationExceptions.json.meta

diff --git a/.footignore b/.footignore
deleted file mode 100644
index 9cf577b..0000000
--- a/.footignore
+++ /dev/null
@@ -1 +0,0 @@
-ValidationExceptions.json
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 00588ac..16155dd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,26 @@
+
 # Changelog
 
+## [1.0.8] - 2023-04-17
+
+### Added
+
+* Support for CPU-based (Burst) masked occlusion culling on Neon processors.
+* Explicit usage of Burst's FMA optimization, parallel sort, and missing changes from previous review feedback of boc-neon branch
+
+### Changed
+
+* Greatly improved performance of CPU-based (Burst) masked occlusion culling.
+* Greatly improved performance of Depth and Test debug views used with CPU-based (Burst) masked occlusion culling.
+* Reduced the amount of memory allocated by allocating based on the maximum number of worker threads the running platform requires rather than defaulting to using a theoretical upper-bound of 128 worker threads.
+
+### Fixed
+
+* Entities Graphics Occlusion shader throws errors when building the project
+* Fixed a GraphicsBuffer leak that could occur in cases where Entities Graphics is running without any entities to render.
+* enabling/disabling per-view occlusion
+
+
 ## [1.0.0-pre.65] - 2023-03-21
 
 ### Added
diff --git a/Documentation~/TableOfContents.md b/Documentation~/TableOfContents.md
index b3bc6c8..3b62566 100644
--- a/Documentation~/TableOfContents.md
+++ b/Documentation~/TableOfContents.md
@@ -13,15 +13,15 @@
   * [Hybrid Entities](hybrid-entities.md)
   * [The BatchRendererGroup API](batch-renderer-group-api.md)
   * [Mesh deformations](mesh_deformations.md)
-* [Burst occlusion culling](burst-occlusion-culling.md)
-  * [Requirements and compatibility](burst-occlusion-culling-requirements.md)
-  * [Overview](burst-occlusion-culling-overview.md)
-  * [Setup](burst-occlusion-culling-setup.md)
-  * [Optimize occlusion culling](burst-occlusion-culling-optimize.md)
-  * [Rendering Debugger Culling tab reference](burst-occlusion-culling-debug.md)
-  * [Components](burst-occlusion-culling-components.md)
-    * [Occluder component](burst-occlusion-culling-components-occluder.md)
-    * [Occlusion View component](burst-occlusion-culling-components-occlusion-view.md)
+  * [Burst Occlusion Culling](burst-occlusion-culling.md)
+    * [Requirements and compatibility](burst-occlusion-culling-requirements.md)
+    * [Overview](burst-occlusion-culling-overview.md)
+    * [Setup](burst-occlusion-culling-setup.md)
+    * [Optimize occlusion culling](burst-occlusion-culling-optimize.md)
+    * [Rendering Debugger Culling tab reference](burst-occlusion-culling-debug.md)
+    * [Components](burst-occlusion-culling-components.md)
+      * [Occluder component](burst-occlusion-culling-components-occluder.md)
+      * [Occlusion View component](burst-occlusion-culling-components-occlusion-view.md)
 * [Runtime Usage](runtime-usage.md)
   * [Runtime Entity Creation](runtime-entity-creation.md)
 * Sample Content
diff --git a/Documentation~/burst-occlusion-culling-components-occlusion-view.md b/Documentation~/burst-occlusion-culling-components-occlusion-view.md
index 65805ba..b168e1d 100644
--- a/Documentation~/burst-occlusion-culling-components-occlusion-view.md
+++ b/Documentation~/burst-occlusion-culling-components-occlusion-view.md
@@ -1,12 +1,12 @@
 # Occlusion View component
 
-An Occlusion View MonoBehaviour component specifies which cameras, lights, and reflection probes use Burst occlusion culling. It also configures the size of the buffer to use for occlusion culling calculations which affects the resource intensity and precision of the calculations.
+An Occlusion View MonoBehaviour component specifies which cameras, lights, and reflection probes use Burst Occlusion Culling. It also configures the size of the buffer to use for occlusion culling calculations which affects the resource intensity and precision of the calculations.
 
 ## Occlusion View Inspector reference
 
 | **Property**                | **Description**                                              |
 | --------------------------- | ------------------------------------------------------------ |
-| **Occlusion Enabled**       | Controls whether the attached cameras, light, or reflection probes uses Burst occlusion culling. |
+| **Occlusion Enabled**       | Controls whether the attached cameras, light, or reflection probes uses Burst Occlusion Culling. |
 | **Occlusion Buffer Width**  | The width of the buffer to use for occlusion culling calculations. This value should always be a multiple of 16. |
 | **Occlusion Buffer Height** | The height of the buffer to use for occlusion culling calculations. This value should always be a multiple of 16. |
 
diff --git a/Documentation~/burst-occlusion-culling-components.md b/Documentation~/burst-occlusion-culling-components.md
index 9e7dfb8..d3b4437 100644
--- a/Documentation~/burst-occlusion-culling-components.md
+++ b/Documentation~/burst-occlusion-culling-components.md
@@ -1,11 +1,11 @@
-# Burst occlusion culling components
+# Burst Occlusion Culling components
 
-This section contains information on the MonoBehaviour components that configure Burst occlusion culling.
+This section contains information on the MonoBehaviour components that configure Burst Occlusion Culling.
 
 | **Topic**                                                                        | **Description**                                                                                                                          |
 | -------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
 | [Occluder component](burst-occlusion-culling-components-occluder.md)             | Understand the Occluder component, which creates and configures occluders.                                                               |
-| [Occlusion View component](burst-occlusion-culling-components-occlusion-view.md) | Understand the Occlusion View component, which configures Burst occlusion culling for individual cameras, lights, and reflection probes. |
+| [Occlusion View component](burst-occlusion-culling-components-occlusion-view.md) | Understand the Occlusion View component, which configures Burst Occlusion Culling for individual cameras, lights, and reflection probes. |
 
 ## Additional resources
 
diff --git a/Documentation~/burst-occlusion-culling-debug.md b/Documentation~/burst-occlusion-culling-debug.md
index e958e7f..3a136b5 100644
--- a/Documentation~/burst-occlusion-culling-debug.md
+++ b/Documentation~/burst-occlusion-culling-debug.md
@@ -1,6 +1,6 @@
 # Rendering Debugger Culling tab reference
 
-The **Culling** tab in the [Rendering debugger](https://docs.unity3d.com/Packages/com.unity.render-pipelines.high-definition@latest?subfolder=/manual/Render-Pipeline-Debug-Window.html) includes debugging options and visualizations to help you investigate Burst occlusion culling issues.
+The **Culling** tab in the [Rendering debugger](https://docs.unity3d.com/Packages/com.unity.render-pipelines.high-definition@latest?subfolder=/manual/Render-Pipeline-Debug-Window.html) includes debugging options and visualizations to help you investigate Burst Occlusion Culling issues.
 
 <table>
   <thead>
diff --git a/Documentation~/burst-occlusion-culling-optimize.md b/Documentation~/burst-occlusion-culling-optimize.md
index d94e455..960fd98 100644
--- a/Documentation~/burst-occlusion-culling-optimize.md
+++ b/Documentation~/burst-occlusion-culling-optimize.md
@@ -1,12 +1,12 @@
-# Optimize Burst occlusion culling
+# Optimize Burst Occlusion Culling
 
-If Burst occlusion culling is a good fit for a scene (refer to [When to use Burst occlusion culling](burst-occlusion-culling-overview.md#when-to-use-burst-occlusion-culling)), you can configure it to be bespokely optimized for the scene. This page explains the different methods you can use to get the best performance out of Burst occlusion culling for a particular scene.
+If Burst Occlusion Culling is a good fit for a scene (refer to [When to use Burst Occlusion Culling](burst-occlusion-culling-overview.md#when-to-use-burst-occlusion-culling)), you can configure it to be bespokely optimized for the scene. This page explains the different methods you can use to get the best performance out of Burst Occlusion Culling for a particular scene.
 
 ## Optimize occlusion views
 
-The Burst occlusion culling system can use a different buffer resolution for each view it processes. A lower-resolution buffer is less resource-intensive to process but produces a less precise culling result. If a view doesn't require precise occlusion culling results, you can reduce the resolution of its occlusion buffer to increase the performance of the Burst occlusion culling process.
+The Burst Occlusion Culling system can use a different buffer resolution for each view it processes. A lower-resolution buffer is less resource-intensive to process but produces a less precise culling result. If a view doesn't require precise occlusion culling results, you can reduce the resolution of its occlusion buffer to increase the performance of the Burst Occlusion Culling process.
 
-If an occlusion view uses a lower resolution buffer, the Burst occlusion culling system can misidentify some totally hidden objects as being visible. This means that the rendering system must unnecessarily process the objects. If you reduce the resolution of an occlusion view buffer, it's best practice to [profile](xref:Profiler) the scene to make sure that the reduced resolution doesn't degrade overall performance.
+If an occlusion view uses a lower resolution buffer, the Burst Occlusion Culling system can misidentify some totally hidden objects as being visible. This means that the rendering system must unnecessarily process the objects. If you reduce the resolution of an occlusion view buffer, it's best practice to [profile](xref:Profiler) the scene to make sure that the reduced resolution doesn't degrade overall performance.
 
 ## Additional resources
 
diff --git a/Documentation~/burst-occlusion-culling-overview.md b/Documentation~/burst-occlusion-culling-overview.md
index b16ae1b..67fc095 100644
--- a/Documentation~/burst-occlusion-culling-overview.md
+++ b/Documentation~/burst-occlusion-culling-overview.md
@@ -1,22 +1,22 @@
-# Burst occlusion culling overview
+# Burst Occlusion Culling overview
 
-The Burst occlusion culling system disables rendering for entities that are hidden behind other entities. This reduces the amount of data that Unity uploads to the GPU every frame and the amount of unnecessary work that the GPU must do.
+The Burst Occlusion Culling system disables rendering for entities that are hidden behind other entities. This reduces the amount of data that Unity uploads to the GPU every frame and the amount of unnecessary work that the GPU must do.
 
-## How Burst occlusion culling works
+## How Burst Occlusion Culling works
 
-From the point of view of the cameras, lights, and reflection probes you specify, the Burst occlusion culling system determines which entities are completely hidden and don't need to be sent to the GPU for rendering. To do this, the system splits entities into occluders and occludees. The system gets all occluders and calculates which occludees are hidden by them.
+From the point of view of the cameras, lights, and reflection probes you specify, the Burst Occlusion Culling system determines which entities are completely hidden and don't need to be sent to the GPU for rendering. To do this, the system splits entities into occluders and occludees. The system gets all occluders and calculates which occludees are hidden by them.
 
 For performance reasons, the culling system doesn't use the same meshes in its culling calculations that the rendering system uses to draw entities. Instead, each occluder entity needs an additional lower-resolution mesh for the culling system to use instead. This occlusion mesh must be completely inscribed within the original mesh to avoid artifacts such as visible popping which is where objects appear and disappear visibly on-screen.
 
 Entities that use the Mesh Renderer component with **Dynamic Occlusion** set at [author time](https://docs.unity3d.com/Packages/com.unity.entities@latest?subfolder=/manual/editor-authoring-runtime.html) will be occludees. It's your responsibility to specify which entities are occluders. For help on how to decide which entities should be occluders, refer to [How to choose occluders](#how-to-choose-occluders).
 
-## When to use Burst occlusion culling
+## When to use Burst Occlusion Culling
 
-Burst occlusion culling isn't appropriate for every application or scene. Scenes with many unique objects (with unique meshes or materials) that produce a lot of overdraw are perfect for Burst occlusion culling. Examples of this type of scene include large open worlds, dense cities, or interiors with separate rooms.
+Burst Occlusion Culling isn't appropriate for every application or scene. Scenes with many unique objects (with unique meshes or materials) that produce a lot of overdraw are perfect for Burst Occlusion Culling. Examples of this type of scene include large open worlds, dense cities, or interiors with separate rooms.
 
 Entities graphics can render instanced objects very quickly so it's often not beneficial to calculate which instanced objects are or aren't occluded and instead pass them all to the GPU to render. This is because the overhead of the occlusion culling calculations can exceed the overhead saved by reducing the number of instanced objects to draw.
 
-If there is a mix of unique and instanced objects in a scene, you can enable Burst occlusion culling for the scene, but make the instanced objects not occludees (disable **Dynamic Occlusion** on their Mesh Renderer component). This makes Burst occlusion culling optimize the draw submission for unique objects without wasting resources processing the instanced objects.
+If there is a mix of unique and instanced objects in a scene, you can enable Burst Occlusion Culling for the scene, but make the instanced objects not occludees (disable **Dynamic Occlusion** on their Mesh Renderer component). This makes Burst Occlusion Culling optimize the draw submission for unique objects without wasting resources processing the instanced objects.
 
 ## How to choose occluders
 
@@ -36,4 +36,4 @@ Entities likely to be unsuitable occluders are:
 
 ## Additional resources
 
-- [Set up Burst occlusion culling](burst-occlusion-culling-setup.md)
+- [Set up Burst Occlusion Culling](burst-occlusion-culling-setup.md)
diff --git a/Documentation~/burst-occlusion-culling-requirements.md b/Documentation~/burst-occlusion-culling-requirements.md
index 31703b6..8625310 100644
--- a/Documentation~/burst-occlusion-culling-requirements.md
+++ b/Documentation~/burst-occlusion-culling-requirements.md
@@ -1,14 +1,14 @@
-# Burst occlusion culling requirements and compatibility
+# Burst Occlusion Culling requirements and compatibility
 
-This page contains information on requirements and feature compatibility of Burst occlusion culling. Burst occlusion culling currently only supports [Entities](https://docs.unity3d.com/Packages/com.unity.entities@latest/index.html)-based applications.
+This page contains information on requirements and feature compatibility of Burst Occlusion Culling. Burst Occlusion Culling currently only supports [Entities](https://docs.unity3d.com/Packages/com.unity.entities@latest/index.html)-based applications.
 
 ## Hardware requirements
 
-Burst occlusion culling requires the target CPU to support SSE4 or Neon instructions. Burst doesn't support 32-bit intrinsics for Neon so, to build for ARM, you must use a 64-bit build target.
+Burst Occlusion Culling requires the target CPU to support SSE4 or Neon instructions. Burst doesn't support 32-bit intrinsics for Neon so, to build for ARM, you must use a 64-bit build target.
 
 ## Renderer compatibility
 
-The following table shows which renderers Burst occlusion culling supports.
+The following table shows which renderers Burst Occlusion Culling supports.
 
 | Renderer                 | Occludee support | Occluder support |
 | ------------------------ | ---------------- | ---------------- |
@@ -23,7 +23,7 @@ The following table shows which renderers Burst occlusion culling supports.
 
 ## Occlusion view compatibility
 
-The following table shows which components Burst occlusion culling supports as views:
+The following table shows which components Burst Occlusion Culling supports as views:
 
 | Component                | View support |
 | ------------------------ | ------------ |
@@ -35,11 +35,11 @@ The following table shows which components Burst occlusion culling supports as v
 
 ## Feature compatibility
 
-Burst occlusion culling doesn't support the following:
+Burst Occlusion Culling doesn't support the following:
 
 * [Mesh deformations](mesh_deformations.md).
 * Concurrent usage with Unity's [built-in occlusion culling system](xref:OcclusionCulling).
 
 ## Additional resources
 
-* [Burst occlusion culling overview](burst-occlusion-culling-overview.md)
\ No newline at end of file
+* [Burst Occlusion Culling overview](burst-occlusion-culling-overview.md)
\ No newline at end of file
diff --git a/Documentation~/burst-occlusion-culling-setup.md b/Documentation~/burst-occlusion-culling-setup.md
index a104fca..0013abd 100644
--- a/Documentation~/burst-occlusion-culling-setup.md
+++ b/Documentation~/burst-occlusion-culling-setup.md
@@ -1,23 +1,23 @@
-# Set up Burst occlusion culling
+# Set up Burst Occlusion Culling
 
-To set up Burst occlusion culling in your Unity project:
+To set up Burst Occlusion Culling in your Unity project:
 
 1. Enable the feature.
-2. Enable and configure Burst occlusion culling for individual cameras, lights, and reflection probes.
+2. Enable and configure Burst Occlusion Culling for individual cameras, lights, and reflection probes.
 3. Configure some entities to be occluders.
 
-## Enable Burst occlusion culling
+## Enable Burst Occlusion Culling
 
-The first step to set up Burst occlusion culling is to enable the feature for your project. To do this:
+The first step to set up Burst Occlusion Culling is to enable the feature for your project. To do this:
 
 1. Set the `ENABLE_UNITY_OCCLUSION` custom scripting symbol. For information on how to do this, refer to [Custom scripting symbols](xref:CustomScriptingSymbols).
 2. Ensure that [Burst](https://docs.unity3d.com/Packages/com.unity.burst@latest/index.html) is enabled. To do this, select **Jobs** > **Burst** > **Enable Compilation**.
 3. Select **Occlusion** > **Enable**.
-4. Burst occlusion culling requires the target CPU to support SSE4 instructions. To be able to build a Unity Player, go to **Edit** > **Project Settings** > **Burst AOT Settings** and set **Target CPU architectures** to **SSE4**.
+4. Burst Occlusion Culling requires the target CPU to support SSE4 instructions. To be able to build a Unity Player, go to **Edit** > **Project Settings** > **Burst AOT Settings** and set **Target CPU architectures** to **SSE4**.
 
 ## Configure per-view occlusion culling
 
-You can enable and configure Burst occlusion culling on a per-camera, per-light, and per reflection probe basis. By default, only the [main camera](xref:UnityEngine.Camera.main) uses Burst occlusion culling. To enable Burst occlusion culling for a camera, light, and reflection probe, add the **Occlusion View** component and enable the **Occlusion Enable** property. The Occlusion View component also controls the resolution of the occlusion buffer for the camera, light, or reflection probe. The occlusion buffer resolution affects the resource intensity of the occlusion culling calculations. For more information about configuration options and performance, refer to [Optimize Burst occlusion culling](burst-occlusion-culling-optimize.md)
+You can enable and configure Burst Occlusion Culling on a per-camera, per-light, and per reflection probe basis. By default, only the [main camera](xref:UnityEngine.Camera.main) uses Burst Occlusion Culling. To enable Burst Occlusion Culling for a camera, light, and reflection probe, add the **Occlusion View** component and enable the **Occlusion Enable** property. The Occlusion View component also controls the resolution of the occlusion buffer for the camera, light, or reflection probe. The occlusion buffer resolution affects the resource intensity of the occlusion culling calculations. For more information about configuration options and performance, refer to [Optimize Burst Occlusion Culling](burst-occlusion-culling-optimize.md)
 
 ## Create occluders
 
@@ -34,4 +34,4 @@ To set up an entity as an occluder:
 
 ## Additional resources
 
-- [Optimize Burst occlusion culling](burst-occlusion-culling-optimize.md)
+- [Optimize Burst Occlusion Culling](burst-occlusion-culling-optimize.md)
diff --git a/Documentation~/burst-occlusion-culling.md b/Documentation~/burst-occlusion-culling.md
index d241b55..6396471 100644
--- a/Documentation~/burst-occlusion-culling.md
+++ b/Documentation~/burst-occlusion-culling.md
@@ -1,18 +1,18 @@
-# Burst occlusion culling
+# Burst Occlusion Culling
 
-Burst occlusion culling is a Burst-optimized occlusion culling system available in the Entities Graphics package. It provides a cross-platform occlusion system that disables the rendering of objects when they're occluded by other objects, and hence are not seen by the camera.
+Burst Occlusion Culling is a Burst-optimized occlusion culling system available in the Entities Graphics package. It provides a cross-platform occlusion system that disables the rendering of objects when they're occluded by other objects, and hence are not seen by the camera.
 
 > [!IMPORTANT]
-> This version of Burst occlusion culling is experimental. This means that it isn't yet ready to use for production and parts of the implementation and API will change.
+> This version of Burst Occlusion Culling is experimental. This means that it isn't yet ready to use for production and parts of the implementation and API will change.
 
 | **Topic**                                                                                         | **Description**                                                                                                                   |
 | ------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------- |
-| [Requirements and compatibility](burst-occlusion-culling-requirements.md) | Learn about the system requirements and feature compatibility of Burst occlusion culling.                                         |
-| [Overview](burst-occlusion-culling-overview.md)                           | Learn how the Burst occlusion culling system works and whether it's suitable for your project.                                    |
-| [Setup](burst-occlusion-culling-setup.md)                                | Enable Burst occlusion culling and create your first occluders.                                                                   |
-| [Optimize occlusion culling](burst-occlusion-culling-optimize.md)                           | Optimize Burst occlusion culling for your project.                                                                                |
-| [Rendering Debugger Culling tab reference](burst-occlusion-culling-debug.md)                      | Learn about the debugging options and visualizations available to help you investigate Burst occlusion culling issues.            |
-| [Components](burst-occlusion-culling-components.md)                       | Learn about the MonoBehaviour components that control which views use Burst occlusion culling and which objects act as occluders. |
+| [Requirements and compatibility](burst-occlusion-culling-requirements.md) | Learn about the system requirements and feature compatibility of Burst Occlusion Culling.                                         |
+| [Overview](burst-occlusion-culling-overview.md)                           | Learn how the Burst Occlusion Culling system works and whether it's suitable for your project.                                    |
+| [Setup](burst-occlusion-culling-setup.md)                                | Enable Burst Occlusion Culling and create your first occluders.                                                                   |
+| [Optimize occlusion culling](burst-occlusion-culling-optimize.md)                           | Optimize Burst Occlusion Culling for your project.                                                                                |
+| [Rendering Debugger Culling tab reference](burst-occlusion-culling-debug.md)                      | Learn about the debugging options and visualizations available to help you investigate Burst Occlusion Culling issues.            |
+| [Components](burst-occlusion-culling-components.md)                       | Learn about the MonoBehaviour components that control which views use Burst Occlusion Culling and which objects act as occluders. |
 
 ## Additional resources
 
diff --git a/Documentation~/index.md b/Documentation~/index.md
index fa71cac..8077990 100644
--- a/Documentation~/index.md
+++ b/Documentation~/index.md
@@ -16,3 +16,7 @@ For information about Entities Graphics's requirements, see [Requirements and co
 ## Getting started with Entities Graphics
 
 For information on getting started with Entities Graphics, see the [Getting started](getting-started.md) section.
+
+## Additional resources
+
+* [Prerelease (1.0.0-pre.65) documentation](pre-release.md)
diff --git a/Documentation~/pre-release.md b/Documentation~/pre-release.md
new file mode 100644
index 0000000..d981f58
--- /dev/null
+++ b/Documentation~/pre-release.md
@@ -0,0 +1,3 @@
+# Documentation for pre-release versions of com.unity.entities.graphics
+
+Because of idiosyncrasies with which Unity packages are published, it may be possible to see the documentation for the 1.0.0 version of `com.unity.entities.graphics` before the package itself is available.  Should you find yourself in that situation, we've made the [pre-release documentation](images/com.unity.entities.graphics@1.0.zip) available.  This offline archive contains a snapshot of documentation for the 1.0.0-pre.65 version.
diff --git a/Unity.Entities.Graphics/Deformations/DeformationSystemGroup.cs b/Unity.Entities.Graphics/Deformations/DeformationSystemGroup.cs
index 69581c8..332fcb4 100644
--- a/Unity.Entities.Graphics/Deformations/DeformationSystemGroup.cs
+++ b/Unity.Entities.Graphics/Deformations/DeformationSystemGroup.cs
@@ -10,7 +10,9 @@ namespace Unity.Rendering
     [UpdateInGroup(typeof(PresentationSystemGroup)), UpdateAfter(typeof(RegisterMaterialsAndMeshesSystem)), UpdateBefore(typeof(EntitiesGraphicsSystem))]
     public sealed partial class DeformationsInPresentation : ComponentSystemGroup
     {
-        /// <inheritdoc/>
+        /// <summary>
+        /// Called when this system is created.
+        /// </summary>
         protected override void OnCreate()
         {
             if (UnityEngine.SystemInfo.graphicsDeviceType == GraphicsDeviceType.Null)
diff --git a/Unity.Entities.Graphics/DrawCommandGeneration.cs b/Unity.Entities.Graphics/DrawCommandGeneration.cs
index 367f780..0303257 100644
--- a/Unity.Entities.Graphics/DrawCommandGeneration.cs
+++ b/Unity.Entities.Graphics/DrawCommandGeneration.cs
@@ -120,7 +120,7 @@ internal unsafe struct ThreadLocalAllocator
     {
         public const int kInitialSize = 1024 * 1024;
         public const Allocator kAllocator = Allocator.Persistent;
-        public const int NumThreads = ChunkDrawCommandOutput.NumThreads;
+        public static readonly int NumThreads = ChunkDrawCommandOutput.NumThreads;
 
         [StructLayout(LayoutKind.Explicit, Size = JobsUtility.CacheLineSize)]
         public unsafe struct PaddedAllocator
@@ -568,7 +568,7 @@ public bool EmitDepthSorted(
     internal unsafe struct ThreadLocalCollectBuffer
     {
         public const Allocator kAllocator = Allocator.TempJob;
-        public const int kCollectBufferSize = ChunkDrawCommandOutput.NumThreads;
+        public static readonly int kCollectBufferSize = ChunkDrawCommandOutput.NumThreads;
 
         public UnsafeList<DrawCommandWorkItem> WorkItems;
         private fixed int m_CacheLinePadding[12]; // The padding here assumes some internal sizes
@@ -607,7 +607,7 @@ public void Dispose()
     internal unsafe struct DrawBinCollector
     {
         public const Allocator kAllocator = Allocator.TempJob;
-        public const int NumThreads = ChunkDrawCommandOutput.NumThreads;
+        public static readonly int NumThreads = ChunkDrawCommandOutput.NumThreads;
 
         public IndirectList<DrawCommandSettings> Bins;
         private UnsafeParallelHashSet<DrawCommandSettings> m_BinSet;
@@ -795,10 +795,14 @@ internal unsafe struct ChunkDrawCommandOutput
     {
         public const Allocator kAllocator = Allocator.TempJob;
 
-        public const int NumThreads = JobsUtility.MaxJobThreadCount;
+#if UNITY_2022_2_14F1_OR_NEWER
+        public static readonly int NumThreads = JobsUtility.ThreadIndexCount;
+#else
+        public static readonly int NumThreads = JobsUtility.MaxJobThreadCount;
+#endif
 
+        public static readonly int kNumThreadsBitfieldLength = (NumThreads + 63) / 64;
         public const int kNumReleaseThreads = 4;
-        public const int kNumThreadsBitfieldLength = (NumThreads + 63) / 64;
         public const int kBinPresentFilterSize = 1 << 10;
 
         public UnsafeList<ThreadLocalDrawCommands> ThreadLocalDrawCommands;
diff --git a/Unity.Entities.Graphics/EntitiesGraphicsLightBakingDataSystem.cs b/Unity.Entities.Graphics/EntitiesGraphicsLightBakingDataSystem.cs
index 78acd04..3eae178 100644
--- a/Unity.Entities.Graphics/EntitiesGraphicsLightBakingDataSystem.cs
+++ b/Unity.Entities.Graphics/EntitiesGraphicsLightBakingDataSystem.cs
@@ -28,6 +28,9 @@ public partial class HybridLightBakingDataSystem : SystemBase
     {
         private EntityQuery m_LightBakingQuery;
 
+        /// <summary>
+        /// Called when this system is created.
+        /// </summary>
         protected override void OnCreate()
         {
             m_LightBakingQuery = SystemAPI.QueryBuilder()
@@ -36,7 +39,9 @@ protected override void OnCreate()
             m_LightBakingQuery.SetChangedVersionFilter(ComponentType.ReadOnly<Light>());
         }
 
-        /// <inheritdoc/>
+        /// <summary>
+        /// Called when this system is updated.
+        /// </summary>
         protected override void OnUpdate()
         {
             var entities = m_LightBakingQuery.ToEntityArray(Allocator.Temp);
diff --git a/Unity.Entities.Graphics/EntitiesGraphicsSystem.cs b/Unity.Entities.Graphics/EntitiesGraphicsSystem.cs
index ee1115b..e36597c 100644
--- a/Unity.Entities.Graphics/EntitiesGraphicsSystem.cs
+++ b/Unity.Entities.Graphics/EntitiesGraphicsSystem.cs
@@ -291,7 +291,9 @@ partial class RegisterMaterialsAndMeshesSystem : SystemBase
         private JobHandle m_BoundsCheckHandle = default;
 #endif
 
-        /// <inheritdoc/>
+        /// <summary>
+        /// Called when this system is created.
+        /// </summary>
         protected override void OnCreate()
         {
             if (!EntitiesGraphicsSystem.EntitiesGraphicsEnabled)
@@ -317,7 +319,9 @@ protected override void OnCreate()
 #endif
         }
 
-        /// <inheritdoc/>
+        /// <summary>
+        /// Called when this system is updated.
+        /// </summary>
         protected override void OnUpdate()
         {
             Profiler.BeginSample("RegisterMaterialsAndMeshes");
@@ -325,7 +329,9 @@ protected override void OnUpdate()
             Profiler.EndSample();
         }
 
-        /// <inheritdoc/>
+        /// <summary>
+        /// Called when this system is destroyed.
+        /// </summary>
         protected override void OnDestroy()
         {
             if (!EntitiesGraphicsSystem.EntitiesGraphicsEnabled) return;
@@ -697,8 +703,14 @@ private void ComputeStats()
         {
             Profiler.BeginSample("ComputeStats");
 
+#if UNITY_2022_2_14F1_OR_NEWER
+            int maxThreadCount = JobsUtility.ThreadIndexCount;
+#else
+            int maxThreadCount = JobsUtility.MaxJobThreadCount;
+#endif
+
             var result = default(EntitiesGraphicsStats);
-            for (int i = 0; i < JobsUtility.MaxJobThreadCount; ++i)
+            for (int i = 0; i < maxThreadCount; ++i)
             {
                 ref var s = ref m_PerThreadStats[i];
 
@@ -801,7 +813,9 @@ private void ValidateUsingURPForwardPlus()
 
         private ThreadLocalAllocator m_ThreadLocalAllocators;
 
-        /// <inheritdoc/>
+        /// <summary>
+        /// Called when this system is created.
+        /// </summary>
         protected override void OnCreate()
         {
             // If -nographics is enabled, or if there is no compute shader support, disable HR.
@@ -907,7 +921,13 @@ protected override void OnCreate()
                 });
 
 #if UNITY_EDITOR
-            m_PerThreadStats = (EntitiesGraphicsPerThreadStats*)Memory.Unmanaged.Allocate(JobsUtility.MaxJobThreadCount * sizeof(EntitiesGraphicsPerThreadStats),
+#if UNITY_2022_2_14F1_OR_NEWER
+            int maxThreadCount = JobsUtility.ThreadIndexCount;
+#else
+            int maxThreadCount = JobsUtility.MaxJobThreadCount;
+#endif
+
+            m_PerThreadStats = (EntitiesGraphicsPerThreadStats*)Memory.Unmanaged.Allocate(maxThreadCount * sizeof(EntitiesGraphicsPerThreadStats),
                 64, Allocator.Persistent);
 #endif
 
@@ -1092,7 +1112,9 @@ private void InitializeMaterialProperties()
             }
         }
 
-        /// <inheritdoc/>
+        /// <summary>
+        /// Called when this system is destroyed.
+        /// </summary>
         protected override void OnDestroy()
         {
             if (!EntitiesGraphicsEnabled) return;
@@ -1163,7 +1185,9 @@ private static BatchFilterSettings MakeFilterSettings(RenderFilterSettings filte
             };
         }
 
-        /// <inheritdoc/>
+        /// <summary>
+        /// Called when this system is updated.
+        /// </summary>
         protected override void OnUpdate()
         {
             JobHandle inputDeps = Dependency;
@@ -1424,7 +1448,12 @@ private JobHandle OnPerformCulling(BatchRendererGroup rendererGroup, BatchCullin
                 m_PrevCameraPos = lodParams.cameraPos;
                 m_ResetLod = false;
 #if UNITY_EDITOR
-                UnsafeUtility.MemClear(m_PerThreadStats, sizeof(EntitiesGraphicsPerThreadStats) * JobsUtility.MaxJobThreadCount);
+#if UNITY_2022_2_14F1_OR_NEWER
+                int maxThreadCount = JobsUtility.ThreadIndexCount;
+#else
+                int maxThreadCount = JobsUtility.MaxJobThreadCount;
+#endif
+                UnsafeUtility.MemClear(m_PerThreadStats, sizeof(EntitiesGraphicsPerThreadStats) * maxThreadCount);
 #endif
             }
             else
@@ -1666,9 +1695,15 @@ private void DebugDrawCommands(JobHandle drawCommandsDependency, BatchCullingOut
         private JobHandle UpdateAllBatches(JobHandle inputDependencies)
         {
             Profiler.BeginSample("GetComponentTypes");
+#if UNITY_2022_2_14F1_OR_NEWER
+            int maxThreadCount = JobsUtility.ThreadIndexCount;
+#else
+            int maxThreadCount = JobsUtility.MaxJobThreadCount;
+#endif
+
 
             var threadLocalAABBs = new NativeArray<ThreadLocalAABB>(
-                JobsUtility.MaxJobThreadCount,
+                maxThreadCount,
                 Allocator.TempJob,
                 NativeArrayOptions.UninitializedMemory);
             var zeroAABBJob = new ZeroThreadLocalAABBJob
@@ -2388,7 +2423,14 @@ private void UpdateBatchBufferHandles()
 
         private void EndUpdate()
         {
-            m_GPUUploader.EndAndCommit(m_ThreadedGPUUploader);
+            if (m_ThreadedGPUUploader.IsValid)
+                m_GPUUploader.EndAndCommit(m_ThreadedGPUUploader);
+
+            // Set the uploader struct to null to ensure that any calls
+            // to EndAndCommit are made with a struct returned from Begin()
+            // on the same frame. This is important in case Begin() is skipped
+            // on a frame.
+            m_ThreadedGPUUploader = default;
 
 #if DEBUG_LOG_MEMORY_USAGE
             if (m_GPUPersistentAllocator.UsedSpace != PrevUsedSpace)
diff --git a/Unity.Entities.Graphics/LODRequirementsUpdateSystem.cs b/Unity.Entities.Graphics/LODRequirementsUpdateSystem.cs
index 2305f36..b4962a7 100644
--- a/Unity.Entities.Graphics/LODRequirementsUpdateSystem.cs
+++ b/Unity.Entities.Graphics/LODRequirementsUpdateSystem.cs
@@ -95,7 +95,9 @@ internal partial class AddLODRequirementComponents : SystemBase
         EntityQuery m_MissingLODWorldReferencePoint;
         EntityQuery m_MissingLODGroupWorldReferencePoint;
 
-        /// <inheritdoc/>
+        /// <summary>
+        /// Called when this system is created.
+        /// </summary>
         protected override void OnCreate()
         {
             m_MissingRootLODRange = GetEntityQuery(new EntityQueryDesc
@@ -134,7 +136,9 @@ protected override void OnCreate()
             });
         }
 
-        /// <inheritdoc/>
+        /// <summary>
+        /// Called when this system is updated.
+        /// </summary>
         protected override void  OnUpdate()
         {
             EntityManager.AddComponent(m_MissingRootLODRange, typeof(RootLODRange));
diff --git a/Unity.Entities.Graphics/LightMaps.cs b/Unity.Entities.Graphics/LightMaps.cs
index 9e2ae74..248b074 100644
--- a/Unity.Entities.Graphics/LightMaps.cs
+++ b/Unity.Entities.Graphics/LightMaps.cs
@@ -51,7 +51,10 @@ public bool Equals(LightMaps other)
                 shadowMasks == other.shadowMasks;
         }
 
-        /// <inheritdoc/>
+        /// <summary>
+        /// Calculates the hash code for this object.
+        /// </summary>
+        /// <returns>The hash code.</returns>
         public override int GetHashCode()
         {
             int hash = 0;
diff --git a/Unity.Entities.Graphics/MaterialColor.cs b/Unity.Entities.Graphics/MaterialColor.cs
index d1dbfcd..afcea3e 100644
--- a/Unity.Entities.Graphics/MaterialColor.cs
+++ b/Unity.Entities.Graphics/MaterialColor.cs
@@ -40,7 +40,10 @@ public class MaterialColor : MonoBehaviour
         /// </summary>
         public class MaterialColorBaker : Baker<MaterialColor>
         {
-            /// <inheritdoc/>
+            /// <summary>
+            /// Called during the baking process to bake the authoring component.
+            /// </summary>
+            /// <param name="authoring">The authoring component to bake.</param>
             public override void Bake(MaterialColor authoring)
             {
                 Color linearCol = authoring.color.linear;
diff --git a/Unity.Entities.Graphics/Occlusion/Masked/BufferGroup.cs b/Unity.Entities.Graphics/Occlusion/Masked/BufferGroup.cs
index dbd7457..cc05bb7 100644
--- a/Unity.Entities.Graphics/Occlusion/Masked/BufferGroup.cs
+++ b/Unity.Entities.Graphics/Occlusion/Masked/BufferGroup.cs
@@ -61,12 +61,15 @@ class BufferGroup
         // Visualization
         DebugView m_DebugView;
 
+        public bool Enabled;
+
         public BufferGroup(BatchCullingViewType viewType)
         {
             ViewType = viewType;
             NumBuffers = math.clamp(Jobs.LowLevel.Unsafe.JobsUtility.JobWorkerMaximumCount, 1, 10);
             NearClip = float.MaxValue;
             FrustumPlanes = new NativeArray<v128>(5, Allocator.Persistent);
+            Enabled = true;
         }
 
         public void Dispose()
@@ -93,13 +96,13 @@ public void SetResolutionAndClip(int numPixelsX, int numPixelsY, BatchCullingPro
                 float h = numPixelsY; //
                 float hw = w * 0.5f;
                 float hh = h * 0.5f;
-                PixelCenterX = X86.Sse.set1_ps(hw);
-                PixelCenterY = X86.Sse.set1_ps(hh);
-                PixelCenter = X86.Sse.setr_ps(hw, hw, hh, hh);
-                HalfWidth = X86.Sse.set1_ps(hw);
-                HalfHeight = X86.Sse.set1_ps(-hh);
-                HalfSize = X86.Sse.setr_ps(hw, hw, -hh, -hh);
-                ScreenSize = X86.Sse2.setr_epi32(numPixelsX - 1, numPixelsX - 1, numPixelsY - 1, numPixelsY - 1);
+                PixelCenterX = new v128(hw);
+                PixelCenterY = new v128(hh);
+                PixelCenter = new v128(hw, hw, hh, hh);
+                HalfWidth = new v128(hw);
+                HalfHeight = new v128(-hh);
+                HalfSize = new v128(hw, hw, -hh, -hh);
+                ScreenSize = new v128(numPixelsX - 1, numPixelsX - 1, numPixelsY - 1, numPixelsY - 1);
                 // TODO: Delete this after full implementation. This isn't needed because min values are zero, and
                 // so there is opportunity for optimization.
                 // Setup a full screen scissor rectangle
@@ -124,17 +127,17 @@ public void SetResolutionAndClip(int numPixelsX, int numPixelsY, BatchCullingPro
 
                     if (projectionType == BatchCullingProjectionType.Orthographic)
                     {
-                        FrustumPlanes[1] = X86.Sse.setr_ps(1f - guardBandWidth, 0f, 0f, 1f);
-                        FrustumPlanes[2] = X86.Sse.setr_ps(-1f + guardBandWidth, 0f, 0f, 1f);
-                        FrustumPlanes[3] = X86.Sse.setr_ps(0f, 1f - guardBandHeight, 0f, 1f);
-                        FrustumPlanes[4] = X86.Sse.setr_ps(0f, -1f + guardBandHeight, 0f, 1f);
+                        FrustumPlanes[1] = new v128(1f - guardBandWidth, 0f, 0f, 1f);
+                        FrustumPlanes[2] = new v128(-1f + guardBandWidth, 0f, 0f, 1f);
+                        FrustumPlanes[3] = new v128(0f, 1f - guardBandHeight, 0f, 1f);
+                        FrustumPlanes[4] = new v128(0f, -1f + guardBandHeight, 0f, 1f);
                     }
                     else
                     {
-                        FrustumPlanes[1] = X86.Sse.setr_ps(1f - guardBandWidth, 0f, 1f, 0f);
-                        FrustumPlanes[2] = X86.Sse.setr_ps(-1f + guardBandWidth, 0f, 1f, 0f);
-                        FrustumPlanes[3] = X86.Sse.setr_ps(0f, 1f - guardBandHeight, 1f, 0f);
-                        FrustumPlanes[4] = X86.Sse.setr_ps(0f, -1f + guardBandHeight, 1f, 0f);
+                        FrustumPlanes[1] = new v128(1f - guardBandWidth, 0f, 1f, 0f);
+                        FrustumPlanes[2] = new v128(-1f + guardBandWidth, 0f, 1f, 0f);
+                        FrustumPlanes[3] = new v128(0f, 1f - guardBandHeight, 1f, 0f);
+                        FrustumPlanes[4] = new v128(0f, -1f + guardBandHeight, 1f, 0f);
                     }
                 }
             }
@@ -143,7 +146,7 @@ public void SetResolutionAndClip(int numPixelsX, int numPixelsY, BatchCullingPro
             {
                 // Set near clip
                 NearClip = nearClip;
-                FrustumPlanes[0] = X86.Sse.setr_ps(0f, 0f, 1f, -nearClip);
+                FrustumPlanes[0] = new v128(0f, 0f, 1f, -nearClip);
             }
         }
 
@@ -159,8 +162,11 @@ public Texture2D GetVisualizationTexture()
 
         public void RenderToTextures(EntityQuery testQuery, EntityQuery meshQuery, JobHandle dependency, DebugRenderMode mode)
         {
+            if (mode == DebugRenderMode.None
 #if UNITY_EDITOR
-            if (mode == DebugRenderMode.None && !OcclusionBrowseWindow.IsVisible)
+                && !OcclusionBrowseWindow.IsVisible
+#endif
+                )
             {
                 return;
             }
@@ -176,9 +182,14 @@ public void RenderToTextures(EntityQuery testQuery, EntityQuery meshQuery, JobHa
             m_DebugView.ReallocateIfNeeded(NumPixelsX, NumPixelsY);
 
             Profiler.BeginSample("Occlusion.Debug.RenderView");
-            m_DebugView.RenderToTextures(testQuery, meshQuery, this, mode, OcclusionBrowseWindow.IsVisible);
+            m_DebugView.RenderToTextures(testQuery, meshQuery, this, mode
+#if UNITY_EDITOR
+                , OcclusionBrowseWindow.IsVisible
+#endif
+                );
             Profiler.EndSample();
 
+#if UNITY_EDITOR
             if (refresh)
             {
                 OcclusionBrowseWindow.Refresh();
diff --git a/Unity.Entities.Graphics/Occlusion/Masked/Dots/OccluderMesh.cs b/Unity.Entities.Graphics/Occlusion/Masked/Dots/OccluderMesh.cs
index 64b1089..c582931 100644
--- a/Unity.Entities.Graphics/Occlusion/Masked/Dots/OccluderMesh.cs
+++ b/Unity.Entities.Graphics/Occlusion/Masked/Dots/OccluderMesh.cs
@@ -240,6 +240,9 @@ public unsafe void Transform(float4x4 mvp, BatchCullingProjectionType projection
                     // triangle to be rendered as the base.If all of the 2D w coordinates
                     // are 1, the determinant is also exactly twice the signed screenspace aren of the triangle.
                     // If the determinant is zero, either the triangle is degenerate or the view is edge - on.
+                    // Furthermore, for vertices defined by the right-hand rule, the determinant is positive if the triangle
+                    // is front-facing and negative if the triangle is back-facing. 
+
                     float area = (homogeneousVertices.c1.x - homogeneousVertices.c2.x) * (homogeneousVertices.c0.y - homogeneousVertices.c2.y)
                                - (homogeneousVertices.c2.x - homogeneousVertices.c0.x) * (homogeneousVertices.c2.y - homogeneousVertices.c1.y);
 
diff --git a/Unity.Entities.Graphics/Occlusion/Masked/IntrinsicUtils.cs b/Unity.Entities.Graphics/Occlusion/Masked/IntrinsicUtils.cs
index 4585848..9a145c9 100644
--- a/Unity.Entities.Graphics/Occlusion/Masked/IntrinsicUtils.cs
+++ b/Unity.Entities.Graphics/Occlusion/Masked/IntrinsicUtils.cs
@@ -9,11 +9,67 @@ namespace Unity.Rendering.Occlusion.Masked
 {
     static class IntrinsicUtils
     {
-        // naive approach, works with C# reference implementation
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static int _vmovemask_f32(v128 a)
+        {
+            if (Arm.Neon.IsNeonSupported)
+            {
+                //https://github.com/jratcliff63367/sse2neon/blob/master/SSE2NEON.h#L518
+                // TODO: this version should work but need to revisit the callsites and see if we can get rid of it altogether
+                v128 movemask = new v128(1u, 2u, 4u, 8u);
+                v128 highbit = new v128(0x80000000u);
+
+                v128 t0 = Arm.Neon.vtstq_u32(a, highbit);
+                v128 t1 = Arm.Neon.vandq_u32(t0, movemask);
+                return Arm.Neon.vaddvq_s32(t1);
+            }
+            return 0;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static v128 _vtranspose_s8(v128 a)
+        {
+            if (Arm.Neon.IsNeonSupported)
+            {
+                v128 v0 = Arm.Neon.vcopyq_laneq_u32(new v128(0), 0, a, 0);
+                v128 v1 = Arm.Neon.vcopyq_laneq_u32(new v128(0), 0, a, 1);
+                v128 v2 = Arm.Neon.vcopyq_laneq_u32(new v128(0), 0, a, 2);
+                v128 v3 = Arm.Neon.vcopyq_laneq_u32(new v128(0), 0, a, 3);
+
+                v128 v4 = Arm.Neon.vzip1q_s8(v0, v1);
+                v128 v5 = Arm.Neon.vzip1q_s8(v2, v3);
+                return Arm.Neon.vzip1q_u16(v4, v5);
+            }
+            return new v128();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static v128 _vsllv_ones(v128 ishift)
+        {
+            if (Arm.Neon.IsNeonSupported)
+            {
+                v128 shift = Arm.Neon.vminq_s32(ishift, new v128(32));
+                return Arm.Neon.vshlq_s32(new v128(~0), shift);
+            }
+            return new v128();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static v128 _vblendq_f32(v128 mask, v128 a, v128 b)
+        {
+            if (Arm.Neon.IsNeonSupported)
+            {
+                // set 32-bit element according to the sign bit
+                // to emulate intel blendv behavior
+                v128 swapMask = Arm.Neon.vcgezq_s32(mask);
+                return Arm.Neon.vbslq_s8(swapMask, a, b);
+            }
+            return new v128();
+        }
 
         // read access
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static int getIntLane(v128 vector, uint laneIdx)
+        internal static int getIntLane(v128 vector, uint laneIdx)
         {
             //Debug.Assert(laneIdx >= 0 && laneIdx < 4);
 
@@ -28,9 +84,38 @@ public static int getIntLane(v128 vector, uint laneIdx)
             }
         }
 
+        // read access
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static byte getByteLane(v128 vector, uint laneIdx)
+        {
+            //Debug.Assert(laneIdx >= 0 && laneIdx < 4);
+
+            // eat the modulo cost to not let it overflow
+            switch (laneIdx % 16)
+            {
+                default:    // DS: incorrect, but works with modulo and silences compiler (CS0161)
+                case 0: { return vector.Byte0; }
+                case 1: { return vector.Byte1; }
+                case 2: { return vector.Byte2; }
+                case 3: { return vector.Byte3; }
+                case 4: { return vector.Byte4; }
+                case 5: { return vector.Byte5; }
+                case 6: { return vector.Byte6; }
+                case 7: { return vector.Byte7; }
+                case 8: { return vector.Byte8; }
+                case 9: { return vector.Byte9; }
+                case 10: { return vector.Byte10; }
+                case 11: { return vector.Byte11; }
+                case 12: { return vector.Byte12; }
+                case 13: { return vector.Byte13; }
+                case 14: { return vector.Byte14; }
+                case 15: { return vector.Byte15; }
+            }
+        }
+
         // used for "write" access (returns copy, requires assignment afterwards)
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static v128 getCopyWithIntLane(v128 vector, uint laneIdx, int laneVal)
+        internal static v128 getCopyWithIntLane(v128 vector, uint laneIdx, int laneVal)
         {
             //Debug.Assert(laneIdx >= 0 && laneIdx < 4);
 
@@ -49,7 +134,7 @@ public static v128 getCopyWithIntLane(v128 vector, uint laneIdx, int laneVal)
 
         // read access
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static float getFloatLane(v128 vector, uint laneIdx)
+        internal static float getFloatLane(v128 vector, uint laneIdx)
         {
             //Debug.Assert(laneIdx >= 0 && laneIdx < 4);
 
@@ -65,37 +150,38 @@ public static float getFloatLane(v128 vector, uint laneIdx)
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static v128 _mmw_fmadd_ps(v128 a, v128 b, v128 c)
+        internal static v128 _mmw_fmadd_ps(v128 a, v128 b, v128 c)
         {
-            if (X86.Sse.IsSseSupported)
+            if (X86.Fma.IsFmaSupported)
+                return X86.Fma.fmadd_ps(a, b, c);
+            else if (X86.Sse.IsSseSupported)
                 return X86.Sse.add_ps(X86.Sse.mul_ps(a, b), c);
-            else
-                throw new System.NotImplementedException();
+            return new v128();
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static v128 _mmw_fmsub_ps(v128 a, v128 b, v128 c)
+        internal static v128 _mmw_fmsub_ps(v128 a, v128 b, v128 c)
         {
-            if (X86.Sse.IsSseSupported)
+            if (X86.Fma.IsFmaSupported)
+                return X86.Fma.fmsub_ps(a, b, c);
+            else if (X86.Sse.IsSseSupported)
                 return X86.Sse.sub_ps(X86.Sse.mul_ps(a, b), c);
-            else
-                throw new System.NotImplementedException();
+            return new v128();
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static v128 _mmw_transpose_epi8(v128 a)
+        internal static v128 _mmw_transpose_epi8(v128 a)
         {
             if (X86.Ssse3.IsSsse3Supported)
             {
                 v128 shuff = X86.Sse2.setr_epi8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF);
                 return X86.Ssse3.shuffle_epi8(a, shuff);
             }
-            else
-                throw new System.NotImplementedException();
+            return new v128();
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static v128 _mmw_sllv_ones(v128 ishift)
+        internal static v128 _mmw_sllv_ones(v128 ishift)
         {
             if (X86.Sse4_1.IsSse41Supported)
             {
@@ -120,12 +206,11 @@ public static v128 _mmw_sllv_ones(v128 ishift)
 
                 return retMask;
             }
-            else
-                throw new System.NotImplementedException();
+            return new v128();
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static ulong find_clear_lsb(ref uint mask)
+        internal static ulong find_clear_lsb(ref uint mask)
         {
             ulong idx = (ulong)math.tzcnt(mask);
             mask &= mask - 1;
diff --git a/Unity.Entities.Graphics/Occlusion/Masked/RasterizeJob.cs b/Unity.Entities.Graphics/Occlusion/Masked/RasterizeJob.cs
index b4dc54b..54a137b 100644
--- a/Unity.Entities.Graphics/Occlusion/Masked/RasterizeJob.cs
+++ b/Unity.Entities.Graphics/Occlusion/Masked/RasterizeJob.cs
@@ -1,5 +1,6 @@
 #if ENABLE_UNITY_OCCLUSION && (HDRP_10_0_0_OR_NEWER || URP_10_0_0_OR_NEWER)
 
+using System.Runtime.CompilerServices;
 using Unity.Burst;
 using Unity.Burst.Intrinsics;
 using Unity.Collections;
@@ -27,6 +28,7 @@ unsafe struct RasterizeJob : IJobFor
         [ReadOnly] public v128 PixelCenter;
         [ReadOnly] public v128 HalfSize;
         [ReadOnly] public v128 ScreenSize;
+        [ReadOnly] public int BinSize;
         [ReadOnly] public int NumPixelsX;
         [ReadOnly] public int NumPixelsY;
         [ReadOnly] public int NumTilesX;
@@ -35,201 +37,1063 @@ unsafe struct RasterizeJob : IJobFor
         [ReadOnly, NativeDisableUnsafePtrRestriction] public v128* FrustumPlanes;
         [ReadOnly] public ScissorRect FullScreenScissor;
 
+        [NativeSetThreadIndex] private int WorkerIndex;
+
         // A bin is a screen area formed by X * Y tiles, a tile is the minimum pixels that we
         // process in the system
         [ReadOnly] public int TilesPerBinX;
         [ReadOnly] public int TilesPerBinY;
         // A buffer group contains a bunch of contiguous tile-buffers. This pointer points to the base of the one we're
         // rendering to.
-        [NativeDisableUnsafePtrRestriction] public Tile* TilesBasePtr;
+        [NativeDisableUnsafePtrRestriction] public Tile*    TilesBasePtr;
+        [NativeDisableUnsafePtrRestriction] public float*   BinTriangleXBasePtr;
+        [NativeDisableUnsafePtrRestriction] public float*   BinTriangleYBasePtr;
+        [NativeDisableUnsafePtrRestriction] public float*   BinTriangleWBasePtr;
 
         const int MAX_CLIPPED = 32;
         const int SIMD_LANES = 4;
         const int SIMD_ALL_LANES_MASK = (1 << SIMD_LANES) - 1;
         const int BIG_TRIANGLE = 3;
 
-        public void Execute(int i)
+        #region SSE
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void TraverseScanlineSSE(Tile* tiles, int numRight, int numLeft, int leftOffset, int rightOffset, int tileIdx, int rightEvent, int leftEvent, v128* events, v128 zTriMin, v128 zTriMax, v128 iz0, float zx)
         {
-            var tiles = &TilesBasePtr[0];
-            ScissorRect scissorRect = new ScissorRect();
+            if (X86.Sse4_1.IsSse41Supported)
+            {
+                v128* right = stackalloc v128[numRight];
+                v128* left = stackalloc v128[numLeft];
 
-            int2 pixelsPerTile = new int2(NumPixelsX / NumTilesX, NumPixelsY / NumTilesY);
+                // Floor edge events to integer pixel coordinates (shift out fixed point bits)
+                v128 eventOffset = new v128(leftOffset << BufferGroup.TileWidthShift);
+                v128 emptyBitMask = new v128(0);
+                v128 fullBitMask = new v128(~0);
+                v128 simdTileWidth = X86.Sse2.set1_epi32(BufferGroup.TileWidth);
 
-            const int binSize = 1024*3;
-            float* temp_stack_x = stackalloc float[12];
-            float* temp_stack_y = stackalloc float[12];
-            float* temp_stack_w = stackalloc float[12];
-            int tempStackSize = 0;
-            NativeArray<float> binTriangleX = new NativeArray<float>(binSize, Allocator.Temp);
-            NativeArray<float> binTriangleY = new NativeArray<float>(binSize, Allocator.Temp);
-            NativeArray<float> binTriangleW = new NativeArray<float>(binSize, Allocator.Temp);
+                for (int i = 0; i < numRight; ++i)
+                {
+                    right[i] = X86.Sse4_1.max_epi32(X86.Sse2.sub_epi32(X86.Sse2.srai_epi32(events[rightEvent + i], BufferGroup.FpBits), eventOffset), emptyBitMask);
+                }
 
-            int countOfTilesX = NumTilesX / TilesPerBinX;
-            scissorRect.mMinX = (i % countOfTilesX) * pixelsPerTile.x * TilesPerBinX;
-            scissorRect.mMaxX = scissorRect.mMinX + pixelsPerTile.x * TilesPerBinX;
-            scissorRect.mMinY = (i / countOfTilesX) * pixelsPerTile.y * TilesPerBinY;
-            scissorRect.mMaxY = scissorRect.mMinY + pixelsPerTile.y * TilesPerBinY;
+                for (int i = 0; i < numLeft; ++i)
+                {
+                    left[i] = X86.Sse4_1.max_epi32(X86.Sse2.sub_epi32(X86.Sse2.srai_epi32(events[leftEvent - i], BufferGroup.FpBits), eventOffset), emptyBitMask);
+                }
 
-            float4 clipRect = new float4(scissorRect.mMinX, scissorRect.mMinY, scissorRect.mMaxX, scissorRect.mMaxY);
-            clipRect = (2 * clipRect.xyzw / (new float2(NumPixelsX, NumPixelsY).xyxy) - 1);
+                v128 z0 = X86.Sse.add_ps(iz0, X86.Sse.set1_ps(zx * leftOffset));
+                int tileIdxEnd = tileIdx + rightOffset;
+                tileIdx += leftOffset;
 
-            // For each mesh
-            // if the mesh aabb is inside the bin aabb
-            // check all each triangle and test against the bin aabb
-            // if inside the bin, add in, once the bin is full render it
-            // once the loop finish, render the remaining triangles in the bin
-            int internalBinSize = 0;
-            for (int m = 0; m < ClippedOccluders.Length; m += 1)
-            {
-                float2 max = ClippedOccluders[m].screenMax.xy;
-                float2 min = ClippedOccluders[m].screenMin.xy;
+                for (; ; )
+                {
+                    // Compute zMin for the overlapped layers
+                    v128 mask = tiles[tileIdx].mask;
+                    v128 zMin0 = X86.Sse4_1.blendv_ps(tiles[tileIdx].zMin0, tiles[tileIdx].zMin1, X86.Sse2.cmpeq_epi32(mask, fullBitMask));
+                    v128 zMin1 = X86.Sse4_1.blendv_ps(tiles[tileIdx].zMin1, tiles[tileIdx].zMin0, X86.Sse2.cmpeq_epi32(mask, emptyBitMask));
+                    v128 zMinBuf = X86.Sse.min_ps(zMin0, zMin1);
+                    v128 dist0 = X86.Sse.sub_ps(zTriMax, zMinBuf);
 
-                if (math.any(min > clipRect.zw) || math.any(max < clipRect.xy))
-                    continue;
+                    if (X86.Sse.movemask_ps(dist0) != SIMD_ALL_LANES_MASK)
+                    {
+                        // Compute coverage mask for entire 32xN using shift operations
+                        v128 accumulatedMask = IntrinsicUtils._mmw_sllv_ones(left[0]);
 
-                ClippedOccluder clipped = ClippedOccluders[m];
-                
-                int k = 0;
-                for (int j = 0; j < clipped.expandedVertexSize; j += 3, ++k)
-                {
-                    float4 triExtents = ClippedTriExtents[clipped.sourceIndexOffset * 2 + k];
-                    min = triExtents.xy;
-                    max = triExtents.zw;
-                    
-                    if (math.any(min > clipRect.zw) || math.any(max < clipRect.xy))
-                        continue;
+                        for (int i = 1; i < numLeft; ++i)
+                        {
+                            accumulatedMask = X86.Sse2.and_si128(accumulatedMask, IntrinsicUtils._mmw_sllv_ones(left[i]));
+                        }
 
-                    for (int n = 0; n < 3; ++n)
+                        for (int i = 0; i < numRight; ++i)
+                        {
+                            accumulatedMask = X86.Sse2.andnot_si128(IntrinsicUtils._mmw_sllv_ones(right[i]), accumulatedMask);
+                        }
+
+                        // Compute interpolated min for each 8x4 subtile and update the masked hierarchical z buffer entry
+                        v128 zSubTileMin = X86.Sse.max_ps(z0, zTriMin);
+                        UpdateTileAccurateSSE(tiles, tileIdx, IntrinsicUtils._mmw_transpose_epi8(accumulatedMask), zSubTileMin);
+                    }
+
+                    // Update buffer address, interpolate z and edge events
+                    tileIdx++;
+
+                    if (tileIdx >= tileIdxEnd)
                     {
-                        float3 vert = ClippedVerts[clipped.sourceIndexOffset * 6 + j + n];
-                        temp_stack_x[tempStackSize] = vert.x;
-                        temp_stack_y[tempStackSize] = vert.y;
-                        temp_stack_w[tempStackSize] = vert.z;
-                        tempStackSize++;
+                        break;
                     }
 
-                    if(tempStackSize == 12)
+                    z0 = X86.Sse.add_ps(z0, X86.Sse.set1_ps(zx));
+
+                    for (int i = 0; i < numRight; ++i)
                     {
-                        for(int n = 0; n < 3; ++n)
-                        {
-                            for(int p = 0; p < 4; ++p)
-                            {
-                                binTriangleX[internalBinSize + p + n * 4] = temp_stack_x[n + p * 3];
-                                binTriangleY[internalBinSize + p + n * 4] = temp_stack_y[n + p * 3];
-                                binTriangleW[internalBinSize + p + n * 4] = temp_stack_w[n + p * 3];
-                            }
-                        }
-                        internalBinSize += 12;
-                        tempStackSize = 0;
+                        right[i] = X86.Sse2.subs_epu16(right[i], simdTileWidth);  // Trick, use sub saturated to avoid checking against < 0 for shift (values should fit in 16 bits)
+                    }
+
+                    for (int i = 0; i < numLeft; ++i)
+                    {
+                        left[i] = X86.Sse2.subs_epu16(left[i], simdTileWidth);
+                    }
+                }
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        void UpdateTileAccurateSSE(Tile* tiles, int tileIdx, v128 coverage, v128 zTriv)
+        {
+            if (X86.Sse4_1.IsSse41Supported)
+            {
+                v128 zMin0 = tiles[tileIdx].zMin0;
+                v128 zMin1 = tiles[tileIdx].zMin1;
+                v128 mask = tiles[tileIdx].mask;
+
+                // Swizzle coverage mask to 8x4 subtiles
+                v128 rastMask = coverage;
+
+                // Perform individual depth tests with layer 0 & 1 and mask out all failing pixels
+                v128 sdist0 = X86.Sse.sub_ps(zMin0, zTriv);
+                v128 sdist1 = X86.Sse.sub_ps(zMin1, zTriv);
+                v128 sign0 = X86.Sse2.srai_epi32(sdist0, 31);
+                v128 sign1 = X86.Sse2.srai_epi32(sdist1, 31);
+                v128 triMask = X86.Sse2.and_si128(rastMask, X86.Sse2.or_si128(X86.Sse2.andnot_si128(mask, sign0), X86.Sse2.and_si128(mask, sign1)));
+
+                // Early out if no pixels survived the depth test (this test is more accurate than
+                // the early culling test in TraverseScanline())
+                v128 t0 = X86.Sse2.cmpeq_epi32(triMask, X86.Sse2.setzero_si128());
+                v128 t0inv = /*not_epi32*/ X86.Sse2.xor_si128(t0, X86.Sse2.set1_epi32(~0));
+
+                if (X86.Sse4_1.testz_si128(t0inv, t0inv) != 0)
+                {
+                    return;
+                }
+
+#if MOC_ENABLE_STATS
+                STATS_ADD(ref mStats.mOccluders.mNumTilesUpdated, 1);
+#endif
+
+                v128 zTri = X86.Sse4_1.blendv_ps(zTriv, zMin0, t0);
+
+                // Test if incoming triangle completely overwrites layer 0 or 1
+                v128 layerMask0 = X86.Sse2.andnot_si128(triMask, /*not_epi32*/ X86.Sse2.xor_si128(mask, X86.Sse2.set1_epi32(~0)));
+                v128 layerMask1 = X86.Sse2.andnot_si128(triMask, mask);
+                v128 lm0 = X86.Sse2.cmpeq_epi32(layerMask0, X86.Sse2.setzero_si128());
+                v128 lm1 = X86.Sse2.cmpeq_epi32(layerMask1, X86.Sse2.setzero_si128());
+                v128 z0 = X86.Sse4_1.blendv_ps(zMin0, zTri, lm0);
+                v128 z1 = X86.Sse4_1.blendv_ps(zMin1, zTri, lm1);
+
+                // Compute distances used for merging heuristic
+                v128 d0 = /*abs_ps*/ X86.Sse.and_ps(sdist0, X86.Sse2.set1_epi32(0x7FFFFFFF));
+                v128 d1 = /*abs_ps*/ X86.Sse.and_ps(sdist1, X86.Sse2.set1_epi32(0x7FFFFFFF));
+                v128 d2 = /*abs_ps*/ X86.Sse.and_ps(X86.Sse.sub_ps(z0, z1), X86.Sse2.set1_epi32(0x7FFFFFFF));
+
+                // Find minimum distance
+                v128 c01 = X86.Sse.sub_ps(d0, d1);
+                v128 c02 = X86.Sse.sub_ps(d0, d2);
+                v128 c12 = X86.Sse.sub_ps(d1, d2);
+                // Two tests indicating which layer the incoming triangle will merge with or
+                // overwrite. d0min indicates that the triangle will overwrite layer 0, and
+                // d1min flags that the triangle will overwrite layer 1.
+                v128 d0min = X86.Sse2.or_si128(X86.Sse2.and_si128(c01, c02), X86.Sse2.or_si128(lm0, t0));
+                v128 d1min = X86.Sse2.andnot_si128(d0min, X86.Sse2.or_si128(c12, lm1));
+
+                /* Update depth buffer entry. NOTE: we always merge into layer 0, so if the
+                   triangle should be merged with layer 1, we first swap layer 0 & 1 and then
+                   merge into layer 0. */
+
+                // Update mask based on which layer the triangle overwrites or was merged into
+                v128 inner = X86.Sse4_1.blendv_ps(triMask, layerMask1, d0min);
+
+                // Update the zMin[0] value. There are four outcomes: overwrite with layer 1,
+                // merge with layer 1, merge with zTri or overwrite with layer 1 and then merge
+                // with zTri.
+                v128 e0 = X86.Sse4_1.blendv_ps(z0, z1, d1min);
+                v128 e1 = X86.Sse4_1.blendv_ps(z1, zTri, X86.Sse2.or_si128(d1min, d0min));
+
+                // Update the zMin[1] value. There are three outcomes: keep current value,
+                // overwrite with zTri, or overwrite with z1
+                v128 z1t = X86.Sse4_1.blendv_ps(zTri, z1, d0min);
+
+                tiles[tileIdx].zMin0 = X86.Sse.min_ps(e0, e1);
+                tiles[tileIdx].zMin1 = X86.Sse4_1.blendv_ps(z1t, z0, d1min);
+                tiles[tileIdx].mask = X86.Sse4_1.blendv_ps(inner, layerMask0, d1min);
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void UpdateTileEventsYSSE(v128* triEventRemainder, v128* triSlopeTileRemainder, v128* triEdgeY, v128* triEvent, v128* triSlopeTileDelta, v128* triSlopeSign, int i)
+        {
+            if (X86.Sse2.IsSse2Supported)
+            {
+                triEventRemainder[i] = X86.Sse2.sub_epi32(triEventRemainder[i], triSlopeTileRemainder[i]);
+                v128 overflow = X86.Sse2.srai_epi32(triEventRemainder[i], 31);
+                triEventRemainder[i] = X86.Sse2.add_epi32(triEventRemainder[i], X86.Sse2.and_si128(overflow, triEdgeY[i]));
+                triEvent[i] = X86.Sse2.add_epi32(triEvent[i], X86.Sse2.add_epi32(triSlopeTileDelta[i], X86.Sse2.and_si128(overflow, triSlopeSign[i])));
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void SortVerticesSSE(v128* vX, v128* vY)
+        {
+            if (X86.Sse4_1.IsSse41Supported)
+            {
+                // Rotate the triangle in the winding order until v0 is the vertex with lowest Y value
+                for (int i = 0; i < 2; i++)
+                {
+                    v128 ey1 = X86.Sse2.sub_epi32(vY[1], vY[0]);
+                    v128 ey2 = X86.Sse2.sub_epi32(vY[2], vY[0]);
+                    v128 swapMask = X86.Sse2.or_si128(X86.Sse2.or_si128(ey1, ey2), X86.Sse2.cmpeq_epi32(ey2, X86.Sse2.setzero_si128()));
+
+                    v128 sX = X86.Sse4_1.blendv_ps(vX[2], vX[0], swapMask);
+                    vX[0] = X86.Sse4_1.blendv_ps(vX[0], vX[1], swapMask);
+                    vX[1] = X86.Sse4_1.blendv_ps(vX[1], vX[2], swapMask);
+                    vX[2] = sX;
+
+                    v128 sY = X86.Sse4_1.blendv_ps(vY[2], vY[0], swapMask);
+                    vY[0] = X86.Sse4_1.blendv_ps(vY[0], vY[1], swapMask);
+                    vY[1] = X86.Sse4_1.blendv_ps(vY[1], vY[2], swapMask);
+                    vY[2] = sY;
+                }
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ComputeDepthPlaneSSE(v128* pVtxX, v128* pVtxY, v128* pVtxZ, out v128 zPixelDx, out v128 zPixelDy)
+        {
+            if (X86.Sse.IsSseSupported)
+            {
+                // Setup z(x,y) = z0 + dx*x + dy*y screen space depth plane equation
+                v128 x2 = X86.Sse.sub_ps(pVtxX[2], pVtxX[0]);
+                v128 x1 = X86.Sse.sub_ps(pVtxX[1], pVtxX[0]);
+                v128 y1 = X86.Sse.sub_ps(pVtxY[1], pVtxY[0]);
+                v128 y2 = X86.Sse.sub_ps(pVtxY[2], pVtxY[0]);
+                v128 z1 = X86.Sse.sub_ps(pVtxZ[1], pVtxZ[0]);
+                v128 z2 = X86.Sse.sub_ps(pVtxZ[2], pVtxZ[0]);
+                v128 d = X86.Sse.div_ps(X86.Sse.set1_ps(1.0f), IntrinsicUtils._mmw_fmsub_ps(x1, y2, X86.Sse.mul_ps(y1, x2)));
+                zPixelDx = X86.Sse.mul_ps(IntrinsicUtils._mmw_fmsub_ps(z1, y2, X86.Sse.mul_ps(y1, z2)), d);
+                zPixelDy = X86.Sse.mul_ps(IntrinsicUtils._mmw_fmsub_ps(x1, z2, X86.Sse.mul_ps(z1, x2)), d);
+            }
+            else
+            {
+                zPixelDx = new v128();
+                zPixelDy = new v128();
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ComputeBoundingBoxSSE(v128* vX, v128* vY, ref ScissorRect scissor, out v128 bbminX, out v128 bbminY, out v128 bbmaxX, out v128 bbmaxY)
+        {
+            if (X86.Sse4_1.IsSse41Supported)
+            {
+                // Find Min/Max vertices
+                bbminX = X86.Sse2.cvttps_epi32(X86.Sse.min_ps(vX[0], X86.Sse.min_ps(vX[1], vX[2])));
+                bbminY = X86.Sse2.cvttps_epi32(X86.Sse.min_ps(vY[0], X86.Sse.min_ps(vY[1], vY[2])));
+                bbmaxX = X86.Sse2.cvttps_epi32(X86.Sse.max_ps(vX[0], X86.Sse.max_ps(vX[1], vX[2])));
+                bbmaxY = X86.Sse2.cvttps_epi32(X86.Sse.max_ps(vY[0], X86.Sse.max_ps(vY[1], vY[2])));
+
+                // Clamp to tile boundaries
+                v128 SimdPadWMask = X86.Sse2.set1_epi32(~(BufferGroup.TileWidth - 1));
+                v128 SimdPadHMask = X86.Sse2.set1_epi32(~(BufferGroup.TileHeight - 1));
+                bbminX = X86.Sse2.and_si128(bbminX, SimdPadWMask);
+                bbmaxX = X86.Sse2.and_si128(X86.Sse2.add_epi32(bbmaxX, X86.Sse2.set1_epi32(BufferGroup.TileWidth)), SimdPadWMask);
+                bbminY = X86.Sse2.and_si128(bbminY, SimdPadHMask);
+                bbmaxY = X86.Sse2.and_si128(X86.Sse2.add_epi32(bbmaxY, X86.Sse2.set1_epi32(BufferGroup.TileHeight)), SimdPadHMask);
+
+                // Clip to scissor
+                bbminX = X86.Sse4_1.max_epi32(bbminX, X86.Sse2.set1_epi32(scissor.mMinX));
+                bbmaxX = X86.Sse4_1.min_epi32(bbmaxX, X86.Sse2.set1_epi32(scissor.mMaxX));
+                bbminY = X86.Sse4_1.max_epi32(bbminY, X86.Sse2.set1_epi32(scissor.mMinY));
+                bbmaxY = X86.Sse4_1.min_epi32(bbmaxY, X86.Sse2.set1_epi32(scissor.mMaxY));
+            }
+            else
+            {
+                bbminX = new v128();
+                bbminY = new v128();
+                bbmaxX = new v128();
+                bbmaxY = new v128();
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        void ProjectVerticesSSE(v128* ipVtxX, v128* ipVtxY, v128* pVtxX, v128* pVtxY, v128* pVtxZ, v128* vtxX, v128* vtxY, v128* vtxW)
+        {
+            if (X86.Sse2.IsSse2Supported)
+            {
+                const float FP_INV = 1f / (1 << BufferGroup.FpBits);
+                // Project vertices and transform to screen space. Snap to sub-pixel coordinates with BufferGroup.FpBits precision.
+                for (int i = 0; i < 3; i++)
+                {
+                    int idx = 2 - i;
+                    v128 rcpW;
+
+                    if (ProjectionType == BatchCullingProjectionType.Orthographic)
+                    {
+                        rcpW = IntrinsicUtils._mmw_fmadd_ps(X86.Sse.set1_ps(-1.0f), vtxW[i], X86.Sse.set1_ps(1.0f));
+
+                        v128 screenX = IntrinsicUtils._mmw_fmadd_ps(vtxX[i], HalfWidth, PixelCenterX);
+                        v128 screenY = IntrinsicUtils._mmw_fmadd_ps(vtxY[i], HalfHeight, PixelCenterY);
+                        ipVtxX[idx] = X86.Sse2.cvtps_epi32(X86.Sse.mul_ps(screenX, X86.Sse.set1_ps((float)(1 << BufferGroup.FpBits))));
+                        ipVtxY[idx] = X86.Sse2.cvtps_epi32(X86.Sse.mul_ps(screenY, X86.Sse.set1_ps((float)(1 << BufferGroup.FpBits))));
+                    }
+                    else
+                    {
+                        rcpW = X86.Sse.div_ps(X86.Sse.set1_ps(1f), vtxW[i]);
+
+                        v128 screenX = IntrinsicUtils._mmw_fmadd_ps(X86.Sse.mul_ps(vtxX[i], HalfWidth), rcpW, PixelCenterX);
+                        v128 screenY = IntrinsicUtils._mmw_fmadd_ps(X86.Sse.mul_ps(vtxY[i], HalfHeight), rcpW, PixelCenterY);
+
+                        ipVtxX[idx] = X86.Sse2.cvtps_epi32(X86.Sse.mul_ps(screenX, X86.Sse.set1_ps((float)(1 << BufferGroup.FpBits))));
+                        ipVtxY[idx] = X86.Sse2.cvtps_epi32(X86.Sse.mul_ps(screenY, X86.Sse.set1_ps((float)(1 << BufferGroup.FpBits))));
+                    }
+
+                    pVtxX[idx] = X86.Sse.mul_ps(X86.Sse2.cvtepi32_ps(ipVtxX[idx]), X86.Sse.set1_ps(FP_INV));
+                    pVtxY[idx] = X86.Sse.mul_ps(X86.Sse2.cvtepi32_ps(ipVtxY[idx]), X86.Sse.set1_ps(FP_INV));
+                    pVtxZ[idx] = rcpW;
+                }
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void RasterizeTriangleBatchSSE(Tile* tiles, v128* ipVtxX, v128* ipVtxY, v128* pVtxX, v128* pVtxY, v128* pVtxZ, uint triMask, ScissorRect scissor)
+        {
+            if (X86.Sse4_1.IsSse41Supported)
+            {
+                //we are computing the bounding box again when we used it before but there are some use cases after, this check cannot be removed atm
+
+                // Compute bounding box and clamp to tile coordinates
+                ComputeBoundingBoxSSE(pVtxX, pVtxY, ref scissor, out var bbPixelMinX, out var bbPixelMinY, out var bbPixelMaxX, out var bbPixelMaxY);
+
+                // Clamp bounding box to tiles (it's already padded in computeBoundingBox)
+                v128 bbTileMinX = X86.Sse2.srai_epi32(bbPixelMinX, BufferGroup.TileWidthShift);
+                v128 bbTileMinY = X86.Sse2.srai_epi32(bbPixelMinY, BufferGroup.TileHeightShift);
+                v128 bbTileMaxX = X86.Sse2.srai_epi32(bbPixelMaxX, BufferGroup.TileWidthShift);
+                v128 bbTileMaxY = X86.Sse2.srai_epi32(bbPixelMaxY, BufferGroup.TileHeightShift);
+                v128 bbTileSizeX = X86.Sse2.sub_epi32(bbTileMaxX, bbTileMinX);
+                v128 bbTileSizeY = X86.Sse2.sub_epi32(bbTileMaxY, bbTileMinY);
+
+                // Cull triangles with zero bounding box
+                v128 bboxSign = X86.Sse2.or_si128(X86.Sse2.sub_epi32(bbTileSizeX, X86.Sse2.set1_epi32(1)), X86.Sse2.sub_epi32(bbTileSizeY, X86.Sse2.set1_epi32(1)));
+                triMask &= (uint)((~X86.Sse.movemask_ps(bboxSign)) & SIMD_ALL_LANES_MASK);
+
+                if (triMask == 0x0)
+                {
+                    return; // View-culled
+                }
+
+                // Set up screen space depth plane
+                ComputeDepthPlaneSSE(pVtxX, pVtxY, pVtxZ, out var zPixelDx, out var zPixelDy);
+
+                // Compute z value at min corner of bounding box. Offset to make sure z is conservative for all 8x4 subtiles
+                v128 bbMinXV0 = X86.Sse.sub_ps(X86.Sse2.cvtepi32_ps(bbPixelMinX), pVtxX[0]);
+                v128 bbMinYV0 = X86.Sse.sub_ps(X86.Sse2.cvtepi32_ps(bbPixelMinY), pVtxY[0]);
+                v128 zPlaneOffset = IntrinsicUtils._mmw_fmadd_ps(zPixelDx, bbMinXV0, IntrinsicUtils._mmw_fmadd_ps(zPixelDy, bbMinYV0, pVtxZ[0]));
+                v128 zTileDx = X86.Sse.mul_ps(zPixelDx, X86.Sse.set1_ps(BufferGroup.TileWidth));
+                v128 zTileDy = X86.Sse.mul_ps(zPixelDy, X86.Sse.set1_ps(BufferGroup.TileHeight));
+
+                zPlaneOffset = X86.Sse.add_ps(zPlaneOffset, X86.Sse.min_ps(X86.Sse2.setzero_si128(), X86.Sse.mul_ps(zPixelDx, X86.Sse.set1_ps(BufferGroup.SubTileWidth))));
+                zPlaneOffset = X86.Sse.add_ps(zPlaneOffset, X86.Sse.min_ps(X86.Sse2.setzero_si128(), X86.Sse.mul_ps(zPixelDy, X86.Sse.set1_ps(BufferGroup.SubTileHeight))));
+
+                // Compute Zmin and Zmax for the triangle (used to narrow the range for difficult tiles)
+                v128 zMin = X86.Sse.min_ps(pVtxZ[0], X86.Sse.min_ps(pVtxZ[1], pVtxZ[2]));
+                v128 zMax = X86.Sse.max_ps(pVtxZ[0], X86.Sse.max_ps(pVtxZ[1], pVtxZ[2]));
+
+                /* Sort vertices (v0 has lowest Y, and the rest is in winding order) and compute edges. Also find the middle
+                    vertex and compute tile */
+
+                // Rotate the triangle in the winding order until v0 is the vertex with lowest Y value
+                SortVerticesSSE(ipVtxX, ipVtxY);
+
+                // Compute edges
+                v128* edgeX = stackalloc v128[3];
+                edgeX[0] = X86.Sse2.sub_epi32(ipVtxX[1], ipVtxX[0]);
+                edgeX[1] = X86.Sse2.sub_epi32(ipVtxX[2], ipVtxX[1]);
+                edgeX[2] = X86.Sse2.sub_epi32(ipVtxX[2], ipVtxX[0]);
+
+                v128* edgeY = stackalloc v128[3];
+                edgeY[0] = X86.Sse2.sub_epi32(ipVtxY[1], ipVtxY[0]);
+                edgeY[1] = X86.Sse2.sub_epi32(ipVtxY[2], ipVtxY[1]);
+                edgeY[2] = X86.Sse2.sub_epi32(ipVtxY[2], ipVtxY[0]);
+
+                // Classify if the middle vertex is on the left or right and compute its position
+                int midVtxRight = ~X86.Sse.movemask_ps(edgeY[1]);
+                v128 midPixelX = X86.Sse4_1.blendv_ps(ipVtxX[1], ipVtxX[2], edgeY[1]);
+                v128 midPixelY = X86.Sse4_1.blendv_ps(ipVtxY[1], ipVtxY[2], edgeY[1]);
+                v128 midTileY = X86.Sse2.srai_epi32(X86.Sse4_1.max_epi32(midPixelY, X86.Sse2.setzero_si128()), BufferGroup.TileHeightShift + BufferGroup.FpBits);
+                v128 bbMidTileY = X86.Sse4_1.max_epi32(bbTileMinY, X86.Sse4_1.min_epi32(bbTileMaxY, midTileY));
+
+                // Compute edge events for the bottom of the bounding box, or for the middle tile in case of
+                // the edge originating from the middle vertex.
+                v128* xDiffi = stackalloc v128[2];
+                xDiffi[0] = X86.Sse2.sub_epi32(ipVtxX[0], X86.Sse2.slli_epi32(bbPixelMinX, BufferGroup.FpBits));
+                xDiffi[1] = X86.Sse2.sub_epi32(midPixelX, X86.Sse2.slli_epi32(bbPixelMinX, BufferGroup.FpBits));
+
+                v128* yDiffi = stackalloc v128[2];
+                yDiffi[0] = X86.Sse2.sub_epi32(ipVtxY[0], X86.Sse2.slli_epi32(bbPixelMinY, BufferGroup.FpBits));
+                yDiffi[1] = X86.Sse2.sub_epi32(midPixelY, X86.Sse2.slli_epi32(bbMidTileY, BufferGroup.FpBits + BufferGroup.TileHeightShift));
+
+                /* Edge slope setup - Note we do not conform to DX/GL rasterization rules */
+
+                // Potentially flip edge to ensure that all edges have positive Y slope.
+                edgeX[1] = X86.Sse4_1.blendv_ps(edgeX[1], /*neg_epi32*/ X86.Sse2.sub_epi32(X86.Sse2.set1_epi32(0), edgeX[1]), edgeY[1]);
+                edgeY[1] = X86.Ssse3.abs_epi32(edgeY[1]);
+
+                // Compute floating point slopes
+                v128* slope = stackalloc v128[3];
+                slope[0] = X86.Sse.div_ps(X86.Sse2.cvtepi32_ps(edgeX[0]), X86.Sse2.cvtepi32_ps(edgeY[0]));
+                slope[1] = X86.Sse.div_ps(X86.Sse2.cvtepi32_ps(edgeX[1]), X86.Sse2.cvtepi32_ps(edgeY[1]));
+                slope[2] = X86.Sse.div_ps(X86.Sse2.cvtepi32_ps(edgeX[2]), X86.Sse2.cvtepi32_ps(edgeY[2]));
+
+                // Modify slope of horizontal edges to make sure they mask out pixels above/below the edge. The slope is set to screen
+                // width to mask out all pixels above or below the horizontal edge. We must also add a small bias to acount for that
+                // vertices may end up off screen due to clipping. We're assuming that the round off error is no bigger than 1.0
+                v128 horizontalSlopeDelta = X86.Sse.set1_ps(2f * (NumPixelsX + 2f * (BufferGroup.GuardBandPixelSize + 1.0f)));
+                v128 horizontalSlope0 = X86.Sse2.cmpeq_epi32(edgeY[0], X86.Sse2.setzero_si128());
+                v128 horizontalSlope1 = X86.Sse2.cmpeq_epi32(edgeY[1], X86.Sse2.setzero_si128());
+                slope[0] = X86.Sse4_1.blendv_ps(slope[0], horizontalSlopeDelta, horizontalSlope0);
+                slope[1] = X86.Sse4_1.blendv_ps(slope[1], /*neg_ps*/ X86.Sse.xor_ps(horizontalSlopeDelta, X86.Sse.set1_ps(-0f)), horizontalSlope1);
+
+                v128* vy = stackalloc v128[3];
+                vy[0] = yDiffi[0];
+                vy[1] = yDiffi[1];
+                vy[2] = yDiffi[0];
+
+                v128 offset0 = X86.Sse2.and_si128(X86.Sse2.add_epi32(yDiffi[0], X86.Sse2.set1_epi32(BufferGroup.FpHalfPixel - 1)), X86.Sse2.set1_epi32((-1 << BufferGroup.FpBits)));
+                v128 offset1 = X86.Sse2.and_si128(X86.Sse2.add_epi32(yDiffi[1], X86.Sse2.set1_epi32(BufferGroup.FpHalfPixel - 1)), X86.Sse2.set1_epi32((-1 << BufferGroup.FpBits)));
+                vy[0] = X86.Sse4_1.blendv_ps(yDiffi[0], offset0, horizontalSlope0);
+                vy[1] = X86.Sse4_1.blendv_ps(yDiffi[1], offset1, horizontalSlope1);
+
+                // Compute edge events for the bottom of the bounding box, or for the middle tile in case of
+                // the edge originating from the middle vertex.
+                v128* slopeSign = stackalloc v128[3];
+                v128* absEdgeX = stackalloc v128[3];
+                v128* slopeTileDelta = stackalloc v128[3];
+                v128* eventStartRemainder = stackalloc v128[3];
+                v128* slopeTileRemainder = stackalloc v128[3];
+                v128* eventStart = stackalloc v128[3];
+
+                for (int i = 0; i < 3; i++)
+                {
+                    // Common, compute slope sign (used to propagate the remainder term when overflowing) is postive or negative x-direction
+                    slopeSign[i] = X86.Sse4_1.blendv_ps(X86.Sse2.set1_epi32(1), X86.Sse2.set1_epi32(-1), edgeX[i]);
+                    absEdgeX[i] = X86.Ssse3.abs_epi32(edgeX[i]);
+
+                    // Delta and error term for one vertical tile step. The exact delta is exactDelta = edgeX / edgeY, due to limited precision we
+                    // repersent the delta as delta = qoutient + remainder / edgeY, where quotient = int(edgeX / edgeY). In this case, since we step
+                    // one tile of scanlines at a time, the slope is computed for a tile-sized step.
+                    slopeTileDelta[i] = X86.Sse2.cvttps_epi32(X86.Sse.mul_ps(slope[i], X86.Sse.set1_ps(BufferGroup.FpTileHeight)));
+                    slopeTileRemainder[i] = X86.Sse2.sub_epi32(X86.Sse2.slli_epi32(absEdgeX[i], BufferGroup.FpTileHeightShift), X86.Sse4_1.mullo_epi32(X86.Ssse3.abs_epi32(slopeTileDelta[i]), edgeY[i]));
+
+                    // Jump to bottom scanline of tile row, this is the bottom of the bounding box, or the middle vertex of the triangle.
+                    // The jump can be in both positive and negative y-direction due to clipping / offscreen vertices.
+                    v128 tileStartDir = X86.Sse4_1.blendv_ps(slopeSign[i], /*neg_epi32*/ X86.Sse2.sub_epi32(X86.Sse2.set1_epi32(0), slopeSign[i]), vy[i]);
+                    v128 tieBreaker = X86.Sse4_1.blendv_ps(X86.Sse2.set1_epi32(0), X86.Sse2.set1_epi32(1), tileStartDir);
+                    v128 tileStartSlope = X86.Sse2.cvttps_epi32(X86.Sse.mul_ps(slope[i], X86.Sse2.cvtepi32_ps(/*neg_epi32*/ X86.Sse2.sub_epi32(X86.Sse2.set1_epi32(0), vy[i]))));
+                    v128 tileStartRemainder = X86.Sse2.sub_epi32(X86.Sse4_1.mullo_epi32(absEdgeX[i], X86.Ssse3.abs_epi32(vy[i])), X86.Sse4_1.mullo_epi32(X86.Ssse3.abs_epi32(tileStartSlope), edgeY[i]));
+
+                    eventStartRemainder[i] = X86.Sse2.sub_epi32(tileStartRemainder, tieBreaker);
+                    v128 overflow = X86.Sse2.srai_epi32(eventStartRemainder[i], 31);
+                    eventStartRemainder[i] = X86.Sse2.add_epi32(eventStartRemainder[i], X86.Sse2.and_si128(overflow, edgeY[i]));
+                    eventStartRemainder[i] = X86.Sse4_1.blendv_ps(eventStartRemainder[i], X86.Sse2.sub_epi32(X86.Sse2.sub_epi32(edgeY[i], eventStartRemainder[i]), X86.Sse2.set1_epi32(1)), vy[i]);
+
+                    //eventStart[i] = xDiffi[i & 1] + tileStartSlope + (overflow & tileStartDir) + X86.Sse2.set1_epi32(FP_HALF_PIXEL - 1) + tieBreaker;
+                    eventStart[i] = X86.Sse2.add_epi32(X86.Sse2.add_epi32(xDiffi[i & 1], tileStartSlope), X86.Sse2.and_si128(overflow, tileStartDir));
+                    eventStart[i] = X86.Sse2.add_epi32(X86.Sse2.add_epi32(eventStart[i], X86.Sse2.set1_epi32(BufferGroup.FpHalfPixel - 1)), tieBreaker);
+                }
+
+                // Split bounding box into bottom - middle - top region.
+                v128 bbBottomIdx = X86.Sse2.add_epi32(bbTileMinX, X86.Sse4_1.mullo_epi32(bbTileMinY, X86.Sse2.set1_epi32(NumTilesX)));
+                v128 bbTopIdx = X86.Sse2.add_epi32(bbTileMinX, X86.Sse4_1.mullo_epi32(X86.Sse2.add_epi32(bbTileMinY, bbTileSizeY), X86.Sse2.set1_epi32(NumTilesX)));
+                v128 bbMidIdx = X86.Sse2.add_epi32(bbTileMinX, X86.Sse4_1.mullo_epi32(midTileY, X86.Sse2.set1_epi32(NumTilesX)));
+
+                // Loop over non-culled triangle and change SIMD axis to per-pixel
+                while (triMask != 0)
+                {
+                    uint triIdx = (uint)IntrinsicUtils.find_clear_lsb(ref triMask);
+                    int triMidVtxRight = (midVtxRight >> (int)triIdx) & 1;
+
+                    // Get Triangle Zmin zMax
+                    v128 zTriMax = X86.Sse.set1_ps(IntrinsicUtils.getFloatLane(zMax, triIdx));
+                    v128 zTriMin = X86.Sse.set1_ps(IntrinsicUtils.getFloatLane(zMin, triIdx));
+
+                    // Setup Zmin value for first set of 8x4 subtiles
+                    v128 SimdSubTileColOffsetF = X86.Sse.setr_ps(0, BufferGroup.SubTileWidth, BufferGroup.SubTileWidth * 2, BufferGroup.SubTileWidth * 3);
+                    v128 z0 = IntrinsicUtils._mmw_fmadd_ps(X86.Sse.set1_ps(IntrinsicUtils.getFloatLane(zPixelDx, triIdx)),
+                                                            SimdSubTileColOffsetF,
+                                                            IntrinsicUtils._mmw_fmadd_ps(X86.Sse.set1_ps(IntrinsicUtils.getFloatLane(zPixelDy, triIdx)),
+                                                            X86.Sse2.setzero_si128(),
+                                                            X86.Sse.set1_ps(IntrinsicUtils.getFloatLane(zPlaneOffset, triIdx))));
+
+                    float zx = IntrinsicUtils.getFloatLane(zTileDx, triIdx);
+                    float zy = IntrinsicUtils.getFloatLane(zTileDy, triIdx);
+
+                    // Get dimension of bounding box bottom, mid & top segments
+                    int bbWidth = IntrinsicUtils.getIntLane(bbTileSizeX, triIdx);
+                    int bbHeight = IntrinsicUtils.getIntLane(bbTileSizeY, triIdx);
+                    int tileRowIdx = IntrinsicUtils.getIntLane(bbBottomIdx, triIdx);
+                    int tileMidRowIdx = IntrinsicUtils.getIntLane(bbMidIdx, triIdx);
+                    int tileEndRowIdx = IntrinsicUtils.getIntLane(bbTopIdx, triIdx);
+
+                    if (bbWidth > BIG_TRIANGLE && bbHeight > BIG_TRIANGLE) // For big triangles we use a more expensive but tighter traversal algorithm
+                    {
+                        if (triMidVtxRight != 0)
+                        {
+                            RasterizeTriangleSSE(tiles, true, 1, triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, ref z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
+                        }
+                        else
+                        {
+                            RasterizeTriangleSSE(tiles, true, 0, triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, ref z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
+                        }
+                    }
+                    else
+                    {
+                        if (triMidVtxRight != 0)
+                        {
+                            RasterizeTriangleSSE(tiles, false, 1, triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, ref z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
+                        }
+                        else
+                        {
+                            RasterizeTriangleSSE(tiles, false, 0, triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, ref z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
+                        }
+                    }
+                }
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        void RasterizeTriangleSSE(
+            Tile* tiles,
+            bool isTightTraversal,
+            int midVtxRight,
+            uint triIdx,
+            int bbWidth,
+            int tileRowIdx,
+            int tileMidRowIdx,
+            int tileEndRowIdx,
+            v128* eventStart,
+            v128* slope,
+            v128* slopeTileDelta,
+            v128 zTriMin,
+            v128 zTriMax,
+            ref v128 z0,
+            float zx,
+            float zy,
+            v128* edgeY,
+            v128* absEdgeX,
+            v128* slopeSign,
+            v128* eventStartRemainder,
+            v128* slopeTileRemainder)
+        {
+            if (X86.Sse4_1.IsSse41Supported)
+            {
+                const int LEFT_EDGE_BIAS = -1;
+                const int RIGHT_EDGE_BIAS = 1;
+
+                v128* triSlopeSign = stackalloc v128[3];
+                v128* triSlopeTileDelta = stackalloc v128[3];
+                v128* triEdgeY = stackalloc v128[3];
+                v128* triSlopeTileRemainder = stackalloc v128[3];
+                v128* triEventRemainder = stackalloc v128[3];
+                v128* triEvent = stackalloc v128[3];
+
+                for (int i = 0; i < 3; ++i)
+                {
+                    triSlopeSign[i] = X86.Sse2.set1_epi32(IntrinsicUtils.getIntLane(slopeSign[i], triIdx));
+                    triSlopeTileDelta[i] =
+                        X86.Sse2.set1_epi32(IntrinsicUtils.getIntLane(slopeTileDelta[i], triIdx));
+                    triEdgeY[i] = X86.Sse2.set1_epi32(IntrinsicUtils.getIntLane(edgeY[i], triIdx));
+                    triSlopeTileRemainder[i] =
+                        X86.Sse2.set1_epi32(IntrinsicUtils.getIntLane(slopeTileRemainder[i], triIdx));
+
+                    v128 triSlope = X86.Sse.set1_ps(IntrinsicUtils.getFloatLane(slope[i], triIdx));
+                    v128 triAbsEdgeX = X86.Sse2.set1_epi32(IntrinsicUtils.getIntLane(absEdgeX[i], triIdx));
+                    v128 triStartRemainder =
+                        X86.Sse2.set1_epi32(IntrinsicUtils.getIntLane(eventStartRemainder[i], triIdx));
+                    v128 triEventStart = X86.Sse2.set1_epi32(IntrinsicUtils.getIntLane(eventStart[i], triIdx));
+
+                    v128 SimdLaneYCoordF = X86.Sse.setr_ps(128f, 384f, 640f, 896f);
+                    v128 scanlineDelta = X86.Sse2.cvttps_epi32(X86.Sse.mul_ps(triSlope, SimdLaneYCoordF));
+                    v128 SimdLaneYCoordI = X86.Sse2.setr_epi32(128, 384, 640, 896);
+                    v128 scanlineSlopeRemainder =
+                        X86.Sse2.sub_epi32(X86.Sse4_1.mullo_epi32(triAbsEdgeX, SimdLaneYCoordI),
+                            X86.Sse4_1.mullo_epi32(X86.Ssse3.abs_epi32(scanlineDelta), triEdgeY[i]));
+
+                    triEventRemainder[i] = X86.Sse2.sub_epi32(triStartRemainder, scanlineSlopeRemainder);
+                    v128 overflow = X86.Sse2.srai_epi32(triEventRemainder[i], 31);
+                    triEventRemainder[i] =
+                        X86.Sse2.add_epi32(triEventRemainder[i], X86.Sse2.and_si128(overflow, triEdgeY[i]));
+                    triEvent[i] =
+                        X86.Sse2.add_epi32(X86.Sse2.add_epi32(triEventStart, scanlineDelta),
+                            X86.Sse2.and_si128(overflow, triSlopeSign[i]));
+                }
+
+                // For big triangles track start & end tile for each scanline and only traverse the valid region
+                int startDelta = 0;
+                int endDelta = 0;
+                int topDelta = 0;
+                int startEvent = 0;
+                int endEvent = 0;
+                int topEvent = 0;
+
+                if (isTightTraversal)
+                {
+                    startDelta = IntrinsicUtils.getIntLane(slopeTileDelta[2], triIdx) + LEFT_EDGE_BIAS;
+                    endDelta = IntrinsicUtils.getIntLane(slopeTileDelta[0], triIdx) + RIGHT_EDGE_BIAS;
+                    topDelta = IntrinsicUtils.getIntLane(slopeTileDelta[1], triIdx) +
+                                (midVtxRight != 0 ? RIGHT_EDGE_BIAS : LEFT_EDGE_BIAS);
+
+                    // Compute conservative bounds for the edge events over a 32xN tile
+                    startEvent = IntrinsicUtils.getIntLane(eventStart[2], triIdx) + Mathf.Min(0, startDelta);
+                    endEvent = IntrinsicUtils.getIntLane(eventStart[0], triIdx) + Mathf.Max(0, endDelta) +
+                                (BufferGroup.TileWidth << BufferGroup.FpBits); // TODO: (Apoorva) can be spun out into a const
+
+                    if (midVtxRight != 0)
+                    {
+                        topEvent = IntrinsicUtils.getIntLane(eventStart[1], triIdx) + Mathf.Max(0, topDelta) +
+                                    (BufferGroup.TileWidth << BufferGroup.FpBits); // TODO: (Apoorva) can be spun out into a const
+                    }
+                    else
+                    {
+                        topEvent = IntrinsicUtils.getIntLane(eventStart[1], triIdx) + Mathf.Min(0, topDelta);
+                    }
+                }
+
+                if (tileRowIdx <= tileMidRowIdx)
+                {
+                    int tileStopIdx = Mathf.Min(tileEndRowIdx, tileMidRowIdx);
+
+                    // Traverse the bottom half of the triangle
+                    while (tileRowIdx < tileStopIdx)
+                    {
+                        int start = 0;
+                        int end = bbWidth;
+
+                        if (isTightTraversal)
+                        {
+                            // Compute tighter start and endpoints to avoid traversing empty space
+                            start = Mathf.Max(0, Mathf.Min(bbWidth - 1, startEvent >> (BufferGroup.TileWidthShift + BufferGroup.FpBits))); // TODO: (Apoorva) can be spun out into a const
+                            end = Mathf.Min(bbWidth, ((int)endEvent >> (BufferGroup.TileWidthShift + BufferGroup.FpBits))); // TODO: (Apoorva) can be spun out into a const
+
+                            startEvent += startDelta;
+                            endEvent += endDelta;
+                        }
+
+                        // Traverse the scanline and update the masked hierarchical z buffer
+                        TraverseScanlineSSE(tiles, 1, 1, start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax, z0,
+                            zx);
+
+                        // move to the next scanline of tiles, update edge events and interpolate z
+                        tileRowIdx += NumTilesX;
+                        z0 = X86.Sse.add_ps(z0, X86.Sse.set1_ps(zy));
+
+                        UpdateTileEventsYSSE(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
+                            triSlopeTileDelta, triSlopeSign, 0);
+                        UpdateTileEventsYSSE(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
+                            triSlopeTileDelta, triSlopeSign, 2);
+                    }
+
+                    // Traverse the middle scanline of tiles. We must consider all three edges only in this region
+                    if (tileRowIdx < tileEndRowIdx)
+                    {
+                        int start = 0;
+                        int end = bbWidth;
+
+                        if (isTightTraversal)
+                        {
+                            // Compute tighter start and endpoints to avoid traversing lots of empty space
+                            start = Mathf.Max(0, Mathf.Min(bbWidth - 1, startEvent >> (BufferGroup.TileWidthShift + BufferGroup.FpBits))); // TODO: (Apoorva) can be spun out into a const
+                            end = Mathf.Min(bbWidth, ((int)endEvent >> (BufferGroup.TileWidthShift + BufferGroup.FpBits))); // TODO: (Apoorva) can be spun out into a const
+
+                            // Switch the traversal start / end to account for the upper side edge
+                            endEvent = midVtxRight != 0 ? topEvent : endEvent;
+                            endDelta = midVtxRight != 0 ? topDelta : endDelta;
+                            startEvent = midVtxRight != 0 ? startEvent : topEvent;
+                            startDelta = midVtxRight != 0 ? startDelta : topDelta;
+
+                            startEvent += startDelta;
+                            endEvent += endDelta;
+                        }
+
+                        // Traverse the scanline and update the masked hierarchical z buffer.
+                        if (midVtxRight != 0)
+                        {
+                            TraverseScanlineSSE(tiles, 2, 1, start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax,
+                                z0, zx);
+                        }
+                        else
+                        {
+                            TraverseScanlineSSE(tiles, 1, 2, start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax,
+                                z0, zx);
+                        }
+
+                        tileRowIdx += NumTilesX;
+                    }
+
+                    // Traverse the top half of the triangle
+                    if (tileRowIdx < tileEndRowIdx)
+                    {
+                        // move to the next scanline of tiles, update edge events and interpolate z
+                        z0 = X86.Sse.add_ps(z0, X86.Sse.set1_ps(zy));
+                        int i0 = midVtxRight + 0;
+                        int i1 = midVtxRight + 1;
+
+                        UpdateTileEventsYSSE(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
+                            triSlopeTileDelta, triSlopeSign, i0);
+                        UpdateTileEventsYSSE(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
+                            triSlopeTileDelta, triSlopeSign, i1);
+
+                        for (; ; )
+                        {
+                            int start = 0;
+                            int end = bbWidth;
+
+                            if (isTightTraversal)
+                            {
+                                // Compute tighter start and endpoints to avoid traversing lots of empty space
+                                start = Mathf.Max(0, Mathf.Min(bbWidth - 1, startEvent >> (BufferGroup.TileWidthShift + BufferGroup.FpBits)));
+                                end = Mathf.Min(bbWidth, (endEvent >> (BufferGroup.TileWidthShift + BufferGroup.FpBits)));
+
+                                startEvent += startDelta;
+                                endEvent += endDelta;
+                            }
+
+                            // Traverse the scanline and update the masked hierarchical z buffer
+                            TraverseScanlineSSE(tiles, 1, 1, start, end, tileRowIdx, midVtxRight + 0,
+                                midVtxRight + 1, triEvent, zTriMin, zTriMax, z0, zx);
+
+                            // move to the next scanline of tiles, update edge events and interpolate z
+                            tileRowIdx += NumTilesX;
+                            if (tileRowIdx >= tileEndRowIdx)
+                            {
+                                break;
+                            }
+
+                            z0 = X86.Sse.add_ps(z0, X86.Sse.set1_ps(zy));
+
+                            UpdateTileEventsYSSE(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
+                                triSlopeTileDelta, triSlopeSign, i0);
+                            UpdateTileEventsYSSE(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
+                                triSlopeTileDelta, triSlopeSign, i1);
+                        }
+                    }
+                }
+                else
+                {
+                    if (isTightTraversal)
+                    {
+                        // For large triangles, switch the traversal start / end to account for the upper side edge
+                        endEvent = midVtxRight != 0 ? topEvent : endEvent;
+                        endDelta = midVtxRight != 0 ? topDelta : endDelta;
+                        startEvent = midVtxRight != 0 ? startEvent : topEvent;
+                        startDelta = midVtxRight != 0 ? startDelta : topDelta;
+                    }
+
+                    // Traverse the top half of the triangle
+                    if (tileRowIdx < tileEndRowIdx)
+                    {
+                        int i0 = midVtxRight + 0;
+                        int i1 = midVtxRight + 1;
+
+                        for (; ; )
+                        {
+                            int start = 0;
+                            int end = bbWidth;
+
+                            if (isTightTraversal)
+                            {
+                                // Compute tighter start and endpoints to avoid traversing lots of empty space
+                                start = Mathf.Max(0, Mathf.Min(bbWidth - 1, startEvent >> (BufferGroup.TileWidthShift + BufferGroup.FpBits)));
+                                end = Mathf.Min(bbWidth, (endEvent >> (BufferGroup.TileWidthShift + BufferGroup.FpBits)));
+
+                                startEvent += startDelta;
+                                endEvent += endDelta;
+                            }
+
+                            // Traverse the scanline and update the masked hierarchical z buffer
+                            TraverseScanlineSSE(tiles, 1, 1, start, end, tileRowIdx, midVtxRight + 0,
+                                midVtxRight + 1, triEvent, zTriMin, zTriMax, z0, zx);
+
+                            // move to the next scanline of tiles, update edge events and interpolate z
+                            tileRowIdx += NumTilesX;
+                            if (tileRowIdx >= tileEndRowIdx)
+                            {
+                                break;
+                            }
+
+                            z0 = X86.Sse.add_ps(z0, X86.Sse.set1_ps(zy));
+
+                            UpdateTileEventsYSSE(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
+                                triSlopeTileDelta, triSlopeSign, i0);
+                            UpdateTileEventsYSSE(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
+                                triSlopeTileDelta, triSlopeSign, i1);
+                        }
+                    }
+                }
+            }
+        }
+        #endregion
+
+        #region Neon
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void TraverseScanlineNEON(Tile* tiles, int numRight, int numLeft, int leftOffset, int rightOffset, int tileIdx, int rightEvent, int leftEvent, v128* events, v128 zTriMin, v128 zTriMax, v128 iz0, float zx)
+        {
+            if (Arm.Neon.IsNeonSupported)
+            {
+                v128* right = stackalloc v128[numRight];
+                v128* left = stackalloc v128[numLeft];
+
+                // Floor edge events to integer pixel coordinates (shift out fixed point bits)
+                v128 eventOffset = new v128(leftOffset << BufferGroup.TileWidthShift);
+                v128 emptyBitMask = new v128(0);
+                v128 fullBitMask = new v128(~0);
+                v128 highbit = new v128(0x80000000u);
+                v128 simdTileWidth = new v128(BufferGroup.TileWidth);
+
+                for (int i = 0; i < numRight; ++i)
+                {
+                    right[i] = Arm.Neon.vmaxq_s32(Arm.Neon.vsubq_s32(Arm.Neon.vshrq_n_s32(events[rightEvent + i], BufferGroup.FpBits), eventOffset), emptyBitMask);
+
+                }
+
+                for (int i = 0; i < numLeft; ++i)
+                {
+                    left[i] = Arm.Neon.vmaxq_s32(Arm.Neon.vsubq_s32(Arm.Neon.vshrq_n_s32(events[leftEvent - i], BufferGroup.FpBits), eventOffset), emptyBitMask);
+                }
+
+                v128 z0 = Arm.Neon.vaddq_f32(iz0, new v128(zx * leftOffset));
+                int tileIdxEnd = tileIdx + rightOffset;
+                tileIdx += leftOffset;
+
+                for (; ; )
+                {
+                    // Compute zMin for the overlapped layers
+                    v128 mask = tiles[tileIdx].mask;
+                    v128 zMin0 = IntrinsicUtils._vblendq_f32(Arm.Neon.vceqq_s32(mask, fullBitMask), tiles[tileIdx].zMin0, tiles[tileIdx].zMin1);
+                    v128 zMin1 = IntrinsicUtils._vblendq_f32(Arm.Neon.vceqq_s32(mask, emptyBitMask), tiles[tileIdx].zMin1, tiles[tileIdx].zMin0);
+                    v128 zMinBuf = Arm.Neon.vminq_f32(zMin0, zMin1);
+                    v128 comp = Arm.Neon.vcltq_f32(zTriMax, zMinBuf);
+                    // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
+                    // Instead of movemask_ps
+                    v64 compPacked = Arm.Neon.vshrn_n_s32(comp, 16);
+
+                    if (compPacked.ULong0 != 0xfffffffffffffffful)
+                    {
+                        // Compute coverage mask for entire 32xN using shift operations
+                        v128 accumulatedMask = IntrinsicUtils._vsllv_ones(left[0]);
+
+                        for (int i = 1; i < numLeft; ++i)
+                        {
+                            accumulatedMask = Arm.Neon.vandq_s8(accumulatedMask, IntrinsicUtils._vsllv_ones(left[i]));
+                        }
+
+                        for (int i = 0; i < numRight; ++i)
+                        {
+                            accumulatedMask = Arm.Neon.vbicq_s8(accumulatedMask, IntrinsicUtils._vsllv_ones(right[i]));
+                        }
+
+                        // Compute interpolated min for each 8x4 subtile and update the masked hierarchical z buffer entry
+                        v128 zSubTileMin = Arm.Neon.vmaxq_f32(z0, zTriMin);
+                        UpdateTileAccurateNEON(tiles, tileIdx, IntrinsicUtils._vtranspose_s8(accumulatedMask), zSubTileMin);
+                    }
+
+                    // Update buffer address, interpolate z and edge events
+                    tileIdx++;
+
+                    if (tileIdx >= tileIdxEnd)
+                    {
+                        break;
                     }
-                    if (internalBinSize == binSize)
+
+                    z0 = Arm.Neon.vaddq_f32(z0, new v128(zx));
+
+                    for (int i = 0; i < numRight; ++i)
                     {
-                        RasterizeMesh(tiles, (float*)binTriangleX.GetUnsafePtr(), (float*)binTriangleY.GetUnsafePtr(), (float*)binTriangleW.GetUnsafePtr(), internalBinSize, scissorRect);
-                        internalBinSize = 0;
+                        // Trick, use sub saturated to avoid checking against < 0 for shift (values should fit in 16 bits)
+                        right[i] = Arm.Neon.vqsubq_u16(right[i], simdTileWidth);
                     }
-                }
-            }
-            if (tempStackSize > 0)
-            {
-                for (int n = 0; n < 3; ++n)
-                {
-                    for (int p = 0; p < 4; ++p)
+
+                    for (int i = 0; i < numLeft; ++i)
                     {
-                        binTriangleX[internalBinSize + p + n * 4] = temp_stack_x[n + p * 3];
-                        binTriangleY[internalBinSize + p + n * 4] = temp_stack_y[n + p * 3];
-                        binTriangleW[internalBinSize + p + n * 4] = temp_stack_w[n + p * 3];
+                        // Trick, use sub saturated to avoid checking against < 0 for shift (values should fit in 16 bits)
+                        left[i] = Arm.Neon.vqsubq_u16(left[i], simdTileWidth);
                     }
                 }
-                internalBinSize += tempStackSize;
-                tempStackSize = 0;
             }
-            if (internalBinSize > 0)
-            {
-                RasterizeMesh(tiles, (float*)binTriangleX.GetUnsafePtr(), (float*)binTriangleY.GetUnsafePtr(), (float*)binTriangleW.GetUnsafePtr(), internalBinSize, scissorRect);
-            }
-            binTriangleX.Dispose();
-            binTriangleY.Dispose();
-            binTriangleW.Dispose();
         }
 
-        // RasterizeMesh now gets as input all triangles that already passed backface culling, clipping and an early z test
-        // this should make it even easier for the system to keep using the full simd words and have better simd occupancy
-        // instead of removing part of the work based on the mask
-        void RasterizeMesh(Tile* tiles, float* binTriangleX, float* binTriangleY, float* binTriangleW, int numVert, ScissorRect screenScissor)
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        void UpdateTileAccurateNEON(Tile* tiles, int tileIdx, v128 coverage, v128 zTriv)
         {
-            if (X86.Sse.IsSseSupported)
+            if (Arm.Neon.IsNeonSupported)
             {
-                // DS: TODO: UNITY BURST FIX
-                //using (var roundingMode = new X86.RoundingScope(X86.MXCSRBits.RoundToNearest))
-                const X86.MXCSRBits roundingMode = X86.MXCSRBits.RoundToNearest;
-                X86.MXCSRBits OldBits = X86.MXCSR;
-                X86.MXCSR = (OldBits & ~X86.MXCSRBits.RoundingControlMask) | roundingMode;
+                v128 zMin0 = tiles[tileIdx].zMin0;
+                v128 zMin1 = tiles[tileIdx].zMin1;
+                v128 mask = tiles[tileIdx].mask;
 
-                int vertexIndex = 0;
+                // Swizzle coverage mask to 8x4 subtiles
+                v128 rastMask = coverage;
 
-                v128* vtxX_prealloc = stackalloc v128[3];
-                v128* vtxY_prealloc = stackalloc v128[3];
-                v128* vtxW_prealloc = stackalloc v128[3];
+                // Perform individual depth tests with layer 0 & 1 and mask out all failing pixels
+                v128 sdist0 = Arm.Neon.vsubq_f32(zMin0, zTriv);
+                v128 sdist1 = Arm.Neon.vsubq_f32(zMin1, zTriv);
+                v128 sign0 = Arm.Neon.vshrq_n_s32(sdist0, 31);
+                v128 sign1 = Arm.Neon.vshrq_n_s32(sdist1, 31);
+                v128 triMask = Arm.Neon.vandq_s8(rastMask, Arm.Neon.vorrq_s8(Arm.Neon.vbicq_s8(sign0, mask), Arm.Neon.vandq_s8(mask, sign1)));
+                // Early out if no pixels survived the depth test
+                // (this test is more accurate than the early culling test in TraverseScanline())
+                v64 narrowSaturatedMask = Arm.Neon.vqmovn_u64(triMask);
+                if (narrowSaturatedMask.ULong0 == 0ul)
+                {
+                    return;
+                }
 
-                v128* pVtxX_prealloc = stackalloc v128[3];
-                v128* pVtxY_prealloc = stackalloc v128[3];
-                v128* pVtxZ_prealloc = stackalloc v128[3];
+#if MOC_ENABLE_STATS
+                STATS_ADD(ref mStats.mOccluders.mNumTilesUpdated, 1);
+#endif
+                v128 t0 = Arm.Neon.vceqzq_s32(triMask);
+                v128 zTri = IntrinsicUtils._vblendq_f32(t0, zTriv, zMin0);
 
-                v128* ipVtxX_prealloc = stackalloc v128[3];
-                v128* ipVtxY_prealloc = stackalloc v128[3];
+                // Test if incoming triangle completely overwrites layer 0 or 1
+                v128 layerMask0 = Arm.Neon.vbicq_s8(Arm.Neon.vmvnq_s32(mask), triMask);
+                v128 layerMask1 = Arm.Neon.vbicq_s8(mask, triMask);
+                v128 lm0 = Arm.Neon.vceqzq_s32(layerMask0);
+                v128 lm1 = Arm.Neon.vceqzq_s32(layerMask1);
+                v128 z0 = IntrinsicUtils._vblendq_f32(lm0, zMin0, zTri);
+                v128 z1 = IntrinsicUtils._vblendq_f32(lm1, zMin1, zTri);
 
-                while (vertexIndex < numVert)
-                {
-                    v128* vtxX = vtxX_prealloc;
-                    v128* vtxY = vtxY_prealloc;
-                    v128* vtxW = vtxW_prealloc;
+                // Compute distances used for merging heuristic
+                v128 d0 = Arm.Neon.vabsq_f32(sdist0);
+                v128 d1 = Arm.Neon.vabsq_f32(sdist1);
+                v128 d2 = Arm.Neon.vabdq_f32(z0, z1);
 
+                // Find minimum distance
+                v128 c01 = Arm.Neon.vsubq_f32(d0, d1);
+                v128 c02 = Arm.Neon.vsubq_f32(d0, d2);
+                v128 c12 = Arm.Neon.vsubq_f32(d1, d2);
 
-                    int numLanes = math.min(SIMD_LANES, numVert - vertexIndex);
-                    uint triMask = (1u << numLanes) - 1;
+                // Two tests indicating which layer the incoming triangle will merge with or
+                // overwrite. d0min indicates that the triangle will overwrite layer 0, and
+                // d1min flags that the triangle will overwrite layer 1.
+                v128 d0min = Arm.Neon.vorrq_s8(Arm.Neon.vandq_s8(c01, c02), Arm.Neon.vorrq_s8(lm0, t0));
+                v128 d1min = Arm.Neon.vbicq_s8(Arm.Neon.vorrq_s8(c12, lm1), d0min);
 
+                /* Update depth buffer entry. NOTE: we always merge into layer 0, so if the
+                   triangle should be merged with layer 1, we first swap layer 0 & 1 and then
+                   merge into layer 0. */
 
-                    for (int i = 0; i < 3; i++)
-                    {
-                        vtxX[i] = X86.Sse.load_ps(&binTriangleX[vertexIndex + i * 4]);
-                        vtxY[i] = X86.Sse.load_ps(&binTriangleY[vertexIndex + i * 4]);
-                        vtxW[i] = X86.Sse.load_ps(&binTriangleW[vertexIndex + i * 4]);
-                    }
+                // Update mask based on which layer the triangle overwrites or was merged into
+                v128 inner = IntrinsicUtils._vblendq_f32(d0min, triMask, layerMask1);
 
-                    vertexIndex += SIMD_LANES * 3;
+                // Update the zMin[0] value. There are four outcomes: overwrite with layer 1,
+                // merge with layer 1, merge with zTri or overwrite with layer 1 and then merge
+                // with zTri.
+                v128 e0 = IntrinsicUtils._vblendq_f32(d1min, z0, z1);
+                v128 e1 = IntrinsicUtils._vblendq_f32(Arm.Neon.vorrq_s8(d1min, d0min), z1, zTri);
 
-                    if (triMask == 0x0)
-                    {
-                        continue;
-                    }
+                // Update the zMin[1] value. There are three outcomes: keep current value,
+                // overwrite with zTri, or overwrite with z1
+                v128 z1t = IntrinsicUtils._vblendq_f32(d0min, zTri, z1);
 
-                    /* Project, transform to screen space and perform backface culling. Note
-                       that we use z = 1.0 / vtx.w for depth, which means that z = 0 is far and
-                       z = 1/m_near is near. For ortho projection, we do z = (z * -1) + 1 to go from z = 0 for far and z = 2 for near
+                tiles[tileIdx].zMin0 = Arm.Neon.vminq_f32(e0, e1);
+                tiles[tileIdx].zMin1 = IntrinsicUtils._vblendq_f32(d1min, z1t, z0);
+                tiles[tileIdx].mask = IntrinsicUtils._vblendq_f32(d1min, inner, layerMask0);
+            }
+        }
 
-                       We must also use a greater than depth test, and in effect
-                       everything is reversed compared to regular z implementations. */
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void UpdateTileEventsYNEON(v128* triEventRemainder, v128* triSlopeTileRemainder, v128* triEdgeY, v128* triEvent, v128* triSlopeTileDelta, v128* triSlopeSign, int i)
+        {
+            if (Arm.Neon.IsNeonSupported)
+            {
+                triEventRemainder[i] = Arm.Neon.vsubq_s32(triEventRemainder[i], triSlopeTileRemainder[i]);
+                v128 overflow = Arm.Neon.vshrq_n_s32(triEventRemainder[i], 31);
+                triEventRemainder[i] = Arm.Neon.vaddq_s32(triEventRemainder[i], Arm.Neon.vandq_s8(overflow, triEdgeY[i]));
+                triEvent[i] = Arm.Neon.vaddq_s32(triEvent[i], Arm.Neon.vaddq_s32(triSlopeTileDelta[i], Arm.Neon.vandq_s8(overflow, triSlopeSign[i])));
+            }
+        }
 
-                    v128* pVtxX = pVtxX_prealloc;
-                    v128* pVtxY = pVtxY_prealloc;
-                    v128* pVtxZ = pVtxZ_prealloc;
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void SortVerticesNEON(v128* vX, v128* vY)
+        {
+            if (Arm.Neon.IsNeonSupported)
+            {
+                // Rotate the triangle in the winding order until v0 is the vertex with lowest Y value
+                for (int i = 0; i < 2; i++)
+                {
+                    v128 ey1 = Arm.Neon.vsubq_s32(vY[1], vY[0]);
+                    v128 ey2 = Arm.Neon.vsubq_s32(vY[2], vY[0]);
+                    v128 swapMask = Arm.Neon.vorrq_s8(Arm.Neon.vorrq_s8(ey1, ey2), Arm.Neon.vceqzq_s32(ey2));
 
-                    v128* ipVtxX = ipVtxX_prealloc;
-                    v128* ipVtxY = ipVtxY_prealloc;
-                    ProjectVertices(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW);
+                    v128 sX = IntrinsicUtils._vblendq_f32(swapMask, vX[2], vX[0]);
+                    vX[0] = IntrinsicUtils._vblendq_f32(swapMask, vX[0], vX[1]);
+                    vX[1] = IntrinsicUtils._vblendq_f32(swapMask, vX[1], vX[2]);
+                    vX[2] = sX;
 
-                    /* Setup and rasterize a SIMD batch of triangles */
-                    RasterizeTriangleBatch(tiles, ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, triMask, screenScissor);
+                    v128 sY = IntrinsicUtils._vblendq_f32(swapMask, vY[2], vY[0]);
+                    vY[0] = IntrinsicUtils._vblendq_f32(swapMask, vY[0], vY[1]);
+                    vY[1] = IntrinsicUtils._vblendq_f32(swapMask, vY[1], vY[2]);
+                    vY[2] = sY;
                 }
+            }
+        }
 
-                // DS: TODO: UNITY BURST FIX
-                X86.MXCSR = OldBits;
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ComputeDepthPlaneNEON(v128* pVtxX, v128* pVtxY, v128* pVtxZ, out v128 zPixelDx, out v128 zPixelDy)
+        {
+            if (Arm.Neon.IsNeonSupported)
+            {
+                // Setup z(x,y) = z0 + dx*x + dy*y screen space depth plane equation
+                v128 x2 = Arm.Neon.vsubq_f32(pVtxX[2], pVtxX[0]);
+                v128 x1 = Arm.Neon.vsubq_f32(pVtxX[1], pVtxX[0]);
+                v128 y1 = Arm.Neon.vsubq_f32(pVtxY[1], pVtxY[0]);
+                v128 y2 = Arm.Neon.vsubq_f32(pVtxY[2], pVtxY[0]);
+                v128 z1 = Arm.Neon.vsubq_f32(pVtxZ[1], pVtxZ[0]);
+                v128 z2 = Arm.Neon.vsubq_f32(pVtxZ[2], pVtxZ[0]);
+                v128 d = Arm.Neon.vdivq_f32(new v128(1.0f), Arm.Neon.vmlsq_f32(Arm.Neon.vmulq_f32(x1, y2), y1, x2));
+                zPixelDx = Arm.Neon.vmulq_f32(Arm.Neon.vmlsq_f32(Arm.Neon.vmulq_f32(z1, y2), y1, z2), d);
+                zPixelDy = Arm.Neon.vmulq_f32(Arm.Neon.vmlsq_f32(Arm.Neon.vmulq_f32(x1, z2), z1, x2), d);
+            }
+            else
+            {
+                zPixelDx = new v128();
+                zPixelDy = new v128();
             }
         }
 
-        void ProjectVertices(v128* ipVtxX, v128* ipVtxY, v128* pVtxX, v128* pVtxY, v128* pVtxZ, v128* vtxX, v128* vtxY, v128* vtxW)
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ComputeBoundingBoxNEON(v128* vX, v128* vY, ref ScissorRect scissor, out v128 bbminX, out v128 bbminY, out v128 bbmaxX, out v128 bbmaxY)
         {
-            if (X86.Sse2.IsSse2Supported)
+            if (Arm.Neon.IsNeonSupported)
+            {
+                // Find Min/Max vertices
+                bbminX = Arm.Neon.vcvtnq_s32_f32(Arm.Neon.vminq_f32(vX[0], Arm.Neon.vminq_f32(vX[1], vX[2])));
+                bbminY = Arm.Neon.vcvtnq_s32_f32(Arm.Neon.vminq_f32(vY[0], Arm.Neon.vminq_f32(vY[1], vY[2])));
+                bbmaxX = Arm.Neon.vcvtnq_s32_f32(Arm.Neon.vmaxq_f32(vX[0], Arm.Neon.vmaxq_f32(vX[1], vX[2])));
+                bbmaxY = Arm.Neon.vcvtnq_s32_f32(Arm.Neon.vmaxq_f32(vY[0], Arm.Neon.vmaxq_f32(vY[1], vY[2])));
+
+                // Clamp to tile boundaries
+                v128 SimdPadWMask = new v128(~(BufferGroup.TileWidth - 1));
+                v128 SimdPadHMask = new v128(~(BufferGroup.TileHeight - 1));
+                bbminX = Arm.Neon.vandq_s8(bbminX, SimdPadWMask);
+                bbmaxX = Arm.Neon.vandq_s8(Arm.Neon.vaddq_s32(bbmaxX, new v128(BufferGroup.TileWidth)), SimdPadWMask);
+                bbminY = Arm.Neon.vandq_s8(bbminY, SimdPadHMask);
+                bbmaxY = Arm.Neon.vandq_s8(Arm.Neon.vaddq_s32(bbmaxY, new v128(BufferGroup.TileHeight)), SimdPadHMask);
+
+                // Clip to scissor
+                bbminX = Arm.Neon.vmaxq_s32(bbminX, new v128(scissor.mMinX));
+                bbmaxX = Arm.Neon.vminq_s32(bbmaxX, new v128(scissor.mMaxX));
+                bbminY = Arm.Neon.vmaxq_s32(bbminY, new v128(scissor.mMinY));
+                bbmaxY = Arm.Neon.vminq_s32(bbmaxY, new v128(scissor.mMaxY));
+            }
+            else
+            {
+                bbminX = new v128();
+                bbminY = new v128();
+                bbmaxX = new v128();
+                bbmaxY = new v128();
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        void ProjectVerticesNEON(v128* ipVtxX, v128* ipVtxY, v128* pVtxX, v128* pVtxY, v128* pVtxZ, v128* vtxX, v128* vtxY, v128* vtxW)
+        {
+            if (Arm.Neon.IsNeonSupported)
             {
                 const float FP_INV = 1f / (1 << BufferGroup.FpBits);
                 // Project vertices and transform to screen space. Snap to sub-pixel coordinates with BufferGroup.FpBits precision.
@@ -240,51 +1104,55 @@ void ProjectVertices(v128* ipVtxX, v128* ipVtxY, v128* pVtxX, v128* pVtxY, v128*
 
                     if (ProjectionType == BatchCullingProjectionType.Orthographic)
                     {
-                        rcpW = IntrinsicUtils._mmw_fmadd_ps(X86.Sse.set1_ps(-1.0f), vtxW[i], X86.Sse.set1_ps(1.0f));
+                        rcpW = Arm.Neon.vmlaq_f32(new v128(1.0f), vtxW[i], new v128(-1.0f));
+
+                        v128 screenX = Arm.Neon.vmlaq_f32(PixelCenterX, vtxX[i], HalfWidth);
+                        v128 screenY = Arm.Neon.vmlaq_f32(PixelCenterY, vtxY[i], HalfHeight);
+                        ipVtxX[idx] = Arm.Neon.vcvtnq_s32_f32(Arm.Neon.vmulq_f32(screenX, new v128((float)(1 << BufferGroup.FpBits))));
+                        ipVtxY[idx] = Arm.Neon.vcvtnq_s32_f32(Arm.Neon.vmulq_f32(screenY, new v128((float)(1 << BufferGroup.FpBits))));
 
-                        v128 screenX = X86.Sse.add_ps(X86.Sse.mul_ps(vtxX[i], HalfWidth), PixelCenterX);
-                        v128 screenY = X86.Sse.add_ps(X86.Sse.mul_ps(vtxY[i], HalfHeight), PixelCenterY);
-                        ipVtxX[idx] = X86.Sse2.cvtps_epi32(X86.Sse.mul_ps(screenX, X86.Sse.set1_ps((float)(1 << BufferGroup.FpBits))));
-                        ipVtxY[idx] = X86.Sse2.cvtps_epi32(X86.Sse.mul_ps(screenY, X86.Sse.set1_ps((float)(1 << BufferGroup.FpBits))));
                     }
                     else
                     {
-                        rcpW = X86.Sse.div_ps(X86.Sse.set1_ps(1f), vtxW[i]);
+                        rcpW = Arm.Neon.vdivq_f32(new v128(1.0f), vtxW[i]);
 
-                        v128 screenX = IntrinsicUtils._mmw_fmadd_ps(X86.Sse.mul_ps(vtxX[i], HalfWidth), rcpW, PixelCenterX);
-                        v128 screenY = IntrinsicUtils._mmw_fmadd_ps(X86.Sse.mul_ps(vtxY[i], HalfHeight), rcpW, PixelCenterY);
+                        v128 screenX = Arm.Neon.vmlaq_f32(PixelCenterX, Arm.Neon.vmulq_f32(vtxX[i], HalfWidth), rcpW);
+                        v128 screenY = Arm.Neon.vmlaq_f32(PixelCenterY, Arm.Neon.vmulq_f32(vtxY[i], HalfHeight), rcpW);
 
-                        ipVtxX[idx] = X86.Sse2.cvtps_epi32(X86.Sse.mul_ps(screenX, X86.Sse.set1_ps((float)(1 << BufferGroup.FpBits))));
-                        ipVtxY[idx] = X86.Sse2.cvtps_epi32(X86.Sse.mul_ps(screenY, X86.Sse.set1_ps((float)(1 << BufferGroup.FpBits))));
+                        ipVtxX[idx] = Arm.Neon.vcvtnq_s32_f32(Arm.Neon.vmulq_f32(screenX, new v128((float)(1 << BufferGroup.FpBits))));
+                        ipVtxY[idx] = Arm.Neon.vcvtnq_s32_f32(Arm.Neon.vmulq_f32(screenY, new v128((float)(1 << BufferGroup.FpBits))));
                     }
 
-                    pVtxX[idx] = X86.Sse.mul_ps(X86.Sse2.cvtepi32_ps(ipVtxX[idx]), X86.Sse.set1_ps(FP_INV));
-                    pVtxY[idx] = X86.Sse.mul_ps(X86.Sse2.cvtepi32_ps(ipVtxY[idx]), X86.Sse.set1_ps(FP_INV));
+                    pVtxX[idx] = Arm.Neon.vmulq_f32(Arm.Neon.vcvtq_f32_s32(ipVtxX[idx]), new v128(FP_INV));
+                    pVtxY[idx] = Arm.Neon.vmulq_f32(Arm.Neon.vcvtq_f32_s32(ipVtxY[idx]), new v128(FP_INV));
                     pVtxZ[idx] = rcpW;
                 }
             }
         }
 
-        void RasterizeTriangleBatch(Tile* tiles, v128* ipVtxX, v128* ipVtxY, v128* pVtxX, v128* pVtxY, v128* pVtxZ, uint triMask, ScissorRect scissor)
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void RasterizeTriangleBatchNEON(Tile* tiles, v128* ipVtxX, v128* ipVtxY, v128* pVtxX, v128* pVtxY, v128* pVtxZ, uint triMask, ScissorRect scissor)
         {
-            if (X86.Sse4_1.IsSse41Supported)
+            if (Arm.Neon.IsNeonSupported)
             {
+                v128 one = new v128(1);
+                v128 zero = new v128(0);
                 //we are computing the bounding box again when we used it before but there are some use cases after, this check cannot be removed atm
 
                 // Compute bounding box and clamp to tile coordinates
-                ComputeBoundingBox(pVtxX, pVtxY, ref scissor, out var bbPixelMinX, out var bbPixelMinY, out var bbPixelMaxX, out var bbPixelMaxY);
+                ComputeBoundingBoxNEON(pVtxX, pVtxY, ref scissor, out var bbPixelMinX, out var bbPixelMinY, out var bbPixelMaxX, out var bbPixelMaxY);
 
                 // Clamp bounding box to tiles (it's already padded in computeBoundingBox)
-                v128 bbTileMinX = X86.Sse2.srai_epi32(bbPixelMinX, BufferGroup.TileWidthShift);
-                v128 bbTileMinY = X86.Sse2.srai_epi32(bbPixelMinY, BufferGroup.TileHeightShift);
-                v128 bbTileMaxX = X86.Sse2.srai_epi32(bbPixelMaxX, BufferGroup.TileWidthShift);
-                v128 bbTileMaxY = X86.Sse2.srai_epi32(bbPixelMaxY, BufferGroup.TileHeightShift);
-                v128 bbTileSizeX = X86.Sse2.sub_epi32(bbTileMaxX, bbTileMinX);
-                v128 bbTileSizeY = X86.Sse2.sub_epi32(bbTileMaxY, bbTileMinY);
+                v128 bbTileMinX = Arm.Neon.vshrq_n_s32(bbPixelMinX, BufferGroup.TileWidthShift);
+                v128 bbTileMinY = Arm.Neon.vshrq_n_s32(bbPixelMinY, BufferGroup.TileHeightShift);
+                v128 bbTileMaxX = Arm.Neon.vshrq_n_s32(bbPixelMaxX, BufferGroup.TileWidthShift);
+                v128 bbTileMaxY = Arm.Neon.vshrq_n_s32(bbPixelMaxY, BufferGroup.TileHeightShift);
+                v128 bbTileSizeX = Arm.Neon.vsubq_s32(bbTileMaxX, bbTileMinX);
+                v128 bbTileSizeY = Arm.Neon.vsubq_s32(bbTileMaxY, bbTileMinY);
 
                 // Cull triangles with zero bounding box
-                v128 bboxSign = X86.Sse2.or_si128(X86.Sse2.sub_epi32(bbTileSizeX, X86.Sse2.set1_epi32(1)), X86.Sse2.sub_epi32(bbTileSizeY, X86.Sse2.set1_epi32(1)));
-                triMask &= (uint)((~X86.Sse.movemask_ps(bboxSign)) & SIMD_ALL_LANES_MASK);
+                v128 bboxSign = Arm.Neon.vorrq_s8(Arm.Neon.vsubq_s32(bbTileSizeX, one), Arm.Neon.vsubq_s32(bbTileSizeY, one));
+                triMask &= (uint)((~IntrinsicUtils._vmovemask_f32(bboxSign)) & SIMD_ALL_LANES_MASK);
 
                 if (triMask == 0x0)
                 {
@@ -292,86 +1160,89 @@ void RasterizeTriangleBatch(Tile* tiles, v128* ipVtxX, v128* ipVtxY, v128* pVtxX
                 }
 
                 // Set up screen space depth plane
-                ComputeDepthPlane(pVtxX, pVtxY, pVtxZ, out var zPixelDx, out var zPixelDy);
+                ComputeDepthPlaneNEON(pVtxX, pVtxY, pVtxZ, out var zPixelDx, out var zPixelDy);
 
                 // Compute z value at min corner of bounding box. Offset to make sure z is conservative for all 8x4 subtiles
-                v128 bbMinXV0 = X86.Sse.sub_ps(X86.Sse2.cvtepi32_ps(bbPixelMinX), pVtxX[0]);
-                v128 bbMinYV0 = X86.Sse.sub_ps(X86.Sse2.cvtepi32_ps(bbPixelMinY), pVtxY[0]);
-                v128 zPlaneOffset = IntrinsicUtils._mmw_fmadd_ps(zPixelDx, bbMinXV0, IntrinsicUtils._mmw_fmadd_ps(zPixelDy, bbMinYV0, pVtxZ[0]));
-                v128 zTileDx = X86.Sse.mul_ps(zPixelDx, X86.Sse.set1_ps(BufferGroup.TileWidth));
-                v128 zTileDy = X86.Sse.mul_ps(zPixelDy, X86.Sse.set1_ps(BufferGroup.TileHeight));
-
-                zPlaneOffset = X86.Sse.add_ps(zPlaneOffset, X86.Sse.min_ps(X86.Sse2.setzero_si128(), X86.Sse.mul_ps(zPixelDx, X86.Sse.set1_ps(BufferGroup.SubTileWidth))));
-                zPlaneOffset = X86.Sse.add_ps(zPlaneOffset, X86.Sse.min_ps(X86.Sse2.setzero_si128(), X86.Sse.mul_ps(zPixelDy, X86.Sse.set1_ps(BufferGroup.SubTileHeight))));
+                v128 bbMinXV0 = Arm.Neon.vsubq_f32(Arm.Neon.vcvtq_f32_s32(bbPixelMinX), pVtxX[0]);
+                v128 bbMinYV0 = Arm.Neon.vsubq_f32(Arm.Neon.vcvtq_f32_s32(bbPixelMinY), pVtxY[0]);
+                v128 zPlaneOffset = Arm.Neon.vmlaq_f32(
+                    Arm.Neon.vmlaq_f32(pVtxZ[0], zPixelDy, bbMinYV0),
+                    zPixelDx,
+                    bbMinXV0);
+                v128 zTileDx = Arm.Neon.vmulq_f32(zPixelDx, new v128((float)BufferGroup.TileWidth));
+                v128 zTileDy = Arm.Neon.vmulq_f32(zPixelDy, new v128((float)BufferGroup.TileHeight));
+
+                zPlaneOffset = Arm.Neon.vaddq_f32(zPlaneOffset, Arm.Neon.vminq_f32(zero, Arm.Neon.vmulq_f32(zPixelDx, new v128((float)BufferGroup.SubTileWidth))));
+                zPlaneOffset = Arm.Neon.vaddq_f32(zPlaneOffset, Arm.Neon.vminq_f32(zero, Arm.Neon.vmulq_f32(zPixelDy, new v128((float)BufferGroup.SubTileHeight))));
 
                 // Compute Zmin and Zmax for the triangle (used to narrow the range for difficult tiles)
-                v128 zMin = X86.Sse.min_ps(pVtxZ[0], X86.Sse.min_ps(pVtxZ[1], pVtxZ[2]));
-                v128 zMax = X86.Sse.max_ps(pVtxZ[0], X86.Sse.max_ps(pVtxZ[1], pVtxZ[2]));
+                v128 zMin = Arm.Neon.vminq_f32(pVtxZ[0], Arm.Neon.vminq_f32(pVtxZ[1], pVtxZ[2]));
+                v128 zMax = Arm.Neon.vmaxq_f32(pVtxZ[0], Arm.Neon.vmaxq_f32(pVtxZ[1], pVtxZ[2]));
 
                 /* Sort vertices (v0 has lowest Y, and the rest is in winding order) and compute edges. Also find the middle
-                   vertex and compute tile */
+                    vertex and compute tile */
 
                 // Rotate the triangle in the winding order until v0 is the vertex with lowest Y value
-                SortVertices(ipVtxX, ipVtxY);
+                SortVerticesNEON(ipVtxX, ipVtxY);
 
                 // Compute edges
                 v128* edgeX = stackalloc v128[3];
-                edgeX[0] = X86.Sse2.sub_epi32(ipVtxX[1], ipVtxX[0]);
-                edgeX[1] = X86.Sse2.sub_epi32(ipVtxX[2], ipVtxX[1]);
-                edgeX[2] = X86.Sse2.sub_epi32(ipVtxX[2], ipVtxX[0]);
+                edgeX[0] = Arm.Neon.vsubq_s32(ipVtxX[1], ipVtxX[0]);
+                edgeX[1] = Arm.Neon.vsubq_s32(ipVtxX[2], ipVtxX[1]);
+                edgeX[2] = Arm.Neon.vsubq_s32(ipVtxX[2], ipVtxX[0]);
 
                 v128* edgeY = stackalloc v128[3];
-                edgeY[0] = X86.Sse2.sub_epi32(ipVtxY[1], ipVtxY[0]);
-                edgeY[1] = X86.Sse2.sub_epi32(ipVtxY[2], ipVtxY[1]);
-                edgeY[2] = X86.Sse2.sub_epi32(ipVtxY[2], ipVtxY[0]);
+                edgeY[0] = Arm.Neon.vsubq_s32(ipVtxY[1], ipVtxY[0]);
+                edgeY[1] = Arm.Neon.vsubq_s32(ipVtxY[2], ipVtxY[1]);
+                edgeY[2] = Arm.Neon.vsubq_s32(ipVtxY[2], ipVtxY[0]);
 
                 // Classify if the middle vertex is on the left or right and compute its position
-                int midVtxRight = ~X86.Sse.movemask_ps(edgeY[1]);
-                v128 midPixelX = X86.Sse4_1.blendv_ps(ipVtxX[1], ipVtxX[2], edgeY[1]);
-                v128 midPixelY = X86.Sse4_1.blendv_ps(ipVtxY[1], ipVtxY[2], edgeY[1]);
-                v128 midTileY = X86.Sse2.srai_epi32(X86.Sse4_1.max_epi32(midPixelY, X86.Sse2.setzero_si128()), BufferGroup.TileHeightShift + BufferGroup.FpBits);
-                v128 bbMidTileY = X86.Sse4_1.max_epi32(bbTileMinY, X86.Sse4_1.min_epi32(bbTileMaxY, midTileY));
+                int midVtxRight = ~IntrinsicUtils._vmovemask_f32(edgeY[1]);
+                v128 midPixelX = IntrinsicUtils._vblendq_f32(edgeY[1], ipVtxX[1], ipVtxX[2]);
+                v128 midPixelY = IntrinsicUtils._vblendq_f32(edgeY[1], ipVtxY[1], ipVtxY[2]);
+                v128 midTileY = Arm.Neon.vshrq_n_s32(Arm.Neon.vmaxq_s32(midPixelY, zero), BufferGroup.TileHeightShift + BufferGroup.FpBits);
+                v128 bbMidTileY = Arm.Neon.vmaxq_s32(bbTileMinY, Arm.Neon.vminq_s32(bbTileMaxY, midTileY));
 
                 // Compute edge events for the bottom of the bounding box, or for the middle tile in case of
                 // the edge originating from the middle vertex.
                 v128* xDiffi = stackalloc v128[2];
-                xDiffi[0] = X86.Sse2.sub_epi32(ipVtxX[0], X86.Sse2.slli_epi32(bbPixelMinX, BufferGroup.FpBits));
-                xDiffi[1] = X86.Sse2.sub_epi32(midPixelX, X86.Sse2.slli_epi32(bbPixelMinX, BufferGroup.FpBits));
+                xDiffi[0] = Arm.Neon.vsubq_s32(ipVtxX[0], Arm.Neon.vshlq_n_s32(bbPixelMinX, BufferGroup.FpBits));
+                xDiffi[1] = Arm.Neon.vsubq_s32(midPixelX, Arm.Neon.vshlq_n_s32(bbPixelMinX, BufferGroup.FpBits));
 
                 v128* yDiffi = stackalloc v128[2];
-                yDiffi[0] = X86.Sse2.sub_epi32(ipVtxY[0], X86.Sse2.slli_epi32(bbPixelMinY, BufferGroup.FpBits));
-                yDiffi[1] = X86.Sse2.sub_epi32(midPixelY, X86.Sse2.slli_epi32(bbMidTileY, BufferGroup.FpBits + BufferGroup.TileHeightShift));
+                yDiffi[0] = Arm.Neon.vsubq_s32(ipVtxY[0], Arm.Neon.vshlq_n_s32(bbPixelMinY, BufferGroup.FpBits));
+                yDiffi[1] = Arm.Neon.vsubq_s32(midPixelY, Arm.Neon.vshlq_n_s32(bbMidTileY, BufferGroup.FpBits + BufferGroup.TileHeightShift));
 
                 /* Edge slope setup - Note we do not conform to DX/GL rasterization rules */
 
                 // Potentially flip edge to ensure that all edges have positive Y slope.
-                edgeX[1] = X86.Sse4_1.blendv_ps(edgeX[1], /*neg_epi32*/ X86.Sse2.sub_epi32(X86.Sse2.set1_epi32(0), edgeX[1]), edgeY[1]);
-                edgeY[1] = X86.Ssse3.abs_epi32(edgeY[1]);
+                edgeX[1] = IntrinsicUtils._vblendq_f32(edgeY[1], edgeX[1], Arm.Neon.vnegq_s32(edgeX[1]));
+                edgeY[1] = Arm.Neon.vabsq_s32(edgeY[1]);
 
                 // Compute floating point slopes
                 v128* slope = stackalloc v128[3];
-                slope[0] = X86.Sse.div_ps(X86.Sse2.cvtepi32_ps(edgeX[0]), X86.Sse2.cvtepi32_ps(edgeY[0]));
-                slope[1] = X86.Sse.div_ps(X86.Sse2.cvtepi32_ps(edgeX[1]), X86.Sse2.cvtepi32_ps(edgeY[1]));
-                slope[2] = X86.Sse.div_ps(X86.Sse2.cvtepi32_ps(edgeX[2]), X86.Sse2.cvtepi32_ps(edgeY[2]));
+                slope[0] = Arm.Neon.vdivq_f32(Arm.Neon.vcvtq_f32_s32(edgeX[0]), Arm.Neon.vcvtq_f32_s32(edgeY[0]));
+                slope[1] = Arm.Neon.vdivq_f32(Arm.Neon.vcvtq_f32_s32(edgeX[1]), Arm.Neon.vcvtq_f32_s32(edgeY[1]));
+                slope[2] = Arm.Neon.vdivq_f32(Arm.Neon.vcvtq_f32_s32(edgeX[2]), Arm.Neon.vcvtq_f32_s32(edgeY[2]));
 
                 // Modify slope of horizontal edges to make sure they mask out pixels above/below the edge. The slope is set to screen
                 // width to mask out all pixels above or below the horizontal edge. We must also add a small bias to acount for that
                 // vertices may end up off screen due to clipping. We're assuming that the round off error is no bigger than 1.0
-                v128 horizontalSlopeDelta = X86.Sse.set1_ps(2f * (NumPixelsX + 2f * (BufferGroup.GuardBandPixelSize + 1.0f)));
-                v128 horizontalSlope0 = X86.Sse2.cmpeq_epi32(edgeY[0], X86.Sse2.setzero_si128());
-                v128 horizontalSlope1 = X86.Sse2.cmpeq_epi32(edgeY[1], X86.Sse2.setzero_si128());
-                slope[0] = X86.Sse4_1.blendv_ps(slope[0], horizontalSlopeDelta, horizontalSlope0);
-                slope[1] = X86.Sse4_1.blendv_ps(slope[1], /*neg_ps*/ X86.Sse.xor_ps(horizontalSlopeDelta, X86.Sse.set1_ps(-0f)), horizontalSlope1);
+                v128 horizontalSlopeDelta = new v128(2f * (NumPixelsX + 2f * (BufferGroup.GuardBandPixelSize + 1.0f)));
+                v128 horizontalSlope0 = Arm.Neon.vceqzq_s32(edgeY[0]);
+                v128 horizontalSlope1 = Arm.Neon.vceqzq_s32(edgeY[1]);
+                slope[0] = IntrinsicUtils._vblendq_f32(horizontalSlope0, slope[0], horizontalSlopeDelta);
+                slope[1] = IntrinsicUtils._vblendq_f32(horizontalSlope1, slope[1], Arm.Neon.vnegq_f32(horizontalSlopeDelta));
 
                 v128* vy = stackalloc v128[3];
                 vy[0] = yDiffi[0];
                 vy[1] = yDiffi[1];
                 vy[2] = yDiffi[0];
 
-                v128 offset0 = X86.Sse2.and_si128(X86.Sse2.add_epi32(yDiffi[0], X86.Sse2.set1_epi32(BufferGroup.FpHalfPixel - 1)), X86.Sse2.set1_epi32((-1 << BufferGroup.FpBits)));
-                v128 offset1 = X86.Sse2.and_si128(X86.Sse2.add_epi32(yDiffi[1], X86.Sse2.set1_epi32(BufferGroup.FpHalfPixel - 1)), X86.Sse2.set1_epi32((-1 << BufferGroup.FpBits)));
-                vy[0] = X86.Sse4_1.blendv_ps(yDiffi[0], offset0, horizontalSlope0);
-                vy[1] = X86.Sse4_1.blendv_ps(yDiffi[1], offset1, horizontalSlope1);
+                v128 offset0 = Arm.Neon.vandq_s8(Arm.Neon.vaddq_s32(yDiffi[0], new v128(BufferGroup.FpHalfPixel - 1)), new v128((-1 << BufferGroup.FpBits)));
+                v128 offset1 = Arm.Neon.vandq_s8(Arm.Neon.vaddq_s32(yDiffi[1], new v128(BufferGroup.FpHalfPixel - 1)), new v128((-1 << BufferGroup.FpBits)));
+                vy[0] = IntrinsicUtils._vblendq_f32(horizontalSlope0, yDiffi[0], offset0);
+                vy[1] = IntrinsicUtils._vblendq_f32(horizontalSlope1, yDiffi[1], offset1);
 
                 // Compute edge events for the bottom of the bounding box, or for the middle tile in case of
                 // the edge originating from the middle vertex.
@@ -385,36 +1256,36 @@ vertex and compute tile */
                 for (int i = 0; i < 3; i++)
                 {
                     // Common, compute slope sign (used to propagate the remainder term when overflowing) is postive or negative x-direction
-                    slopeSign[i] = X86.Sse4_1.blendv_ps(X86.Sse2.set1_epi32(1), X86.Sse2.set1_epi32(-1), edgeX[i]);
-                    absEdgeX[i] = X86.Ssse3.abs_epi32(edgeX[i]);
+                    slopeSign[i] = IntrinsicUtils._vblendq_f32(edgeX[i], new v128(1), new v128(-1));
+                    absEdgeX[i] = Arm.Neon.vabsq_s32(edgeX[i]);
 
                     // Delta and error term for one vertical tile step. The exact delta is exactDelta = edgeX / edgeY, due to limited precision we
                     // repersent the delta as delta = qoutient + remainder / edgeY, where quotient = int(edgeX / edgeY). In this case, since we step
                     // one tile of scanlines at a time, the slope is computed for a tile-sized step.
-                    slopeTileDelta[i] = X86.Sse2.cvttps_epi32(X86.Sse.mul_ps(slope[i], X86.Sse.set1_ps(BufferGroup.FpTileHeight)));
-                    slopeTileRemainder[i] = X86.Sse2.sub_epi32(X86.Sse2.slli_epi32(absEdgeX[i], BufferGroup.FpTileHeightShift), X86.Sse4_1.mullo_epi32(X86.Ssse3.abs_epi32(slopeTileDelta[i]), edgeY[i]));
+                    slopeTileDelta[i] = Arm.Neon.vcvtnq_s32_f32(Arm.Neon.vmulq_f32(slope[i], new v128((float)BufferGroup.FpTileHeight)));
+                    slopeTileRemainder[i] = Arm.Neon.vsubq_s32(Arm.Neon.vshlq_n_s32(absEdgeX[i], BufferGroup.FpTileHeightShift), Arm.Neon.vmulq_u32(Arm.Neon.vabsq_s32(slopeTileDelta[i]), edgeY[i]));
 
                     // Jump to bottom scanline of tile row, this is the bottom of the bounding box, or the middle vertex of the triangle.
                     // The jump can be in both positive and negative y-direction due to clipping / offscreen vertices.
-                    v128 tileStartDir = X86.Sse4_1.blendv_ps(slopeSign[i], /*neg_epi32*/ X86.Sse2.sub_epi32(X86.Sse2.set1_epi32(0), slopeSign[i]), vy[i]);
-                    v128 tieBreaker = X86.Sse4_1.blendv_ps(X86.Sse2.set1_epi32(0), X86.Sse2.set1_epi32(1), tileStartDir);
-                    v128 tileStartSlope = X86.Sse2.cvttps_epi32(X86.Sse.mul_ps(slope[i], X86.Sse2.cvtepi32_ps(/*neg_epi32*/ X86.Sse2.sub_epi32(X86.Sse2.set1_epi32(0), vy[i]))));
-                    v128 tileStartRemainder = X86.Sse2.sub_epi32(X86.Sse4_1.mullo_epi32(absEdgeX[i], X86.Ssse3.abs_epi32(vy[i])), X86.Sse4_1.mullo_epi32(X86.Ssse3.abs_epi32(tileStartSlope), edgeY[i]));
+                    v128 tileStartDir = IntrinsicUtils._vblendq_f32(vy[i], slopeSign[i], Arm.Neon.vnegq_s32(slopeSign[i]));
+                    v128 tieBreaker = IntrinsicUtils._vblendq_f32(tileStartDir, zero, one);
+                    v128 tileStartSlope = Arm.Neon.vcvtnq_s32_f32(Arm.Neon.vmulq_f32(slope[i], Arm.Neon.vcvtq_f32_s32(Arm.Neon.vnegq_s32(vy[i]))));
+                    v128 tileStartRemainder = Arm.Neon.vsubq_s32(Arm.Neon.vmulq_u32(absEdgeX[i], Arm.Neon.vabsq_s32(vy[i])), Arm.Neon.vmulq_u32(Arm.Neon.vabsq_s32(tileStartSlope), edgeY[i]));
 
-                    eventStartRemainder[i] = X86.Sse2.sub_epi32(tileStartRemainder, tieBreaker);
-                    v128 overflow = X86.Sse2.srai_epi32(eventStartRemainder[i], 31);
-                    eventStartRemainder[i] = X86.Sse2.add_epi32(eventStartRemainder[i], X86.Sse2.and_si128(overflow, edgeY[i]));
-                    eventStartRemainder[i] = X86.Sse4_1.blendv_ps(eventStartRemainder[i], X86.Sse2.sub_epi32(X86.Sse2.sub_epi32(edgeY[i], eventStartRemainder[i]), X86.Sse2.set1_epi32(1)), vy[i]);
+                    eventStartRemainder[i] = Arm.Neon.vsubq_s32(tileStartRemainder, tieBreaker);
+                    v128 overflow = Arm.Neon.vshrq_n_s32(eventStartRemainder[i], 31);
+                    eventStartRemainder[i] = Arm.Neon.vaddq_s32(eventStartRemainder[i], Arm.Neon.vandq_s8(overflow, edgeY[i]));
+                    eventStartRemainder[i] = IntrinsicUtils._vblendq_f32(vy[i], eventStartRemainder[i], Arm.Neon.vsubq_s32(Arm.Neon.vsubq_s32(edgeY[i], eventStartRemainder[i]), one));
 
                     //eventStart[i] = xDiffi[i & 1] + tileStartSlope + (overflow & tileStartDir) + X86.Sse2.set1_epi32(FP_HALF_PIXEL - 1) + tieBreaker;
-                    eventStart[i] = X86.Sse2.add_epi32(X86.Sse2.add_epi32(xDiffi[i & 1], tileStartSlope), X86.Sse2.and_si128(overflow, tileStartDir));
-                    eventStart[i] = X86.Sse2.add_epi32(X86.Sse2.add_epi32(eventStart[i], X86.Sse2.set1_epi32(BufferGroup.FpHalfPixel - 1)), tieBreaker);
+                    eventStart[i] = Arm.Neon.vaddq_s32(Arm.Neon.vaddq_s32(xDiffi[i & 1], tileStartSlope), Arm.Neon.vandq_s8(overflow, tileStartDir));
+                    eventStart[i] = Arm.Neon.vaddq_s32(Arm.Neon.vaddq_s32(eventStart[i], new v128(BufferGroup.FpHalfPixel - 1)), tieBreaker);
                 }
 
                 // Split bounding box into bottom - middle - top region.
-                v128 bbBottomIdx = X86.Sse2.add_epi32(bbTileMinX, X86.Sse4_1.mullo_epi32(bbTileMinY, X86.Sse2.set1_epi32(NumTilesX)));
-                v128 bbTopIdx = X86.Sse2.add_epi32(bbTileMinX, X86.Sse4_1.mullo_epi32(X86.Sse2.add_epi32(bbTileMinY, bbTileSizeY), X86.Sse2.set1_epi32(NumTilesX)));
-                v128 bbMidIdx = X86.Sse2.add_epi32(bbTileMinX, X86.Sse4_1.mullo_epi32(midTileY, X86.Sse2.set1_epi32(NumTilesX)));
+                v128 bbBottomIdx = Arm.Neon.vaddq_s32(bbTileMinX, Arm.Neon.vmulq_u32(bbTileMinY, new v128(NumTilesX)));
+                v128 bbTopIdx = Arm.Neon.vaddq_s32(bbTileMinX, Arm.Neon.vmulq_u32(Arm.Neon.vaddq_s32(bbTileMinY, bbTileSizeY), new v128(NumTilesX)));
+                v128 bbMidIdx = Arm.Neon.vaddq_s32(bbTileMinX, Arm.Neon.vmulq_u32(midTileY, new v128(NumTilesX)));
 
                 // Loop over non-culled triangle and change SIMD axis to per-pixel
                 while (triMask != 0)
@@ -423,16 +1294,17 @@ vertex and compute tile */
                     int triMidVtxRight = (midVtxRight >> (int)triIdx) & 1;
 
                     // Get Triangle Zmin zMax
-                    v128 zTriMax = X86.Sse.set1_ps(IntrinsicUtils.getFloatLane(zMax, triIdx));
-                    v128 zTriMin = X86.Sse.set1_ps(IntrinsicUtils.getFloatLane(zMin, triIdx));
+                    v128 zTriMax = new v128(IntrinsicUtils.getFloatLane(zMax, triIdx));
+                    v128 zTriMin = new v128(IntrinsicUtils.getFloatLane(zMin, triIdx));
 
                     // Setup Zmin value for first set of 8x4 subtiles
-                    v128 SimdSubTileColOffsetF = X86.Sse.setr_ps(0, BufferGroup.SubTileWidth, BufferGroup.SubTileWidth * 2, BufferGroup.SubTileWidth * 3);
-                    v128 z0 = IntrinsicUtils._mmw_fmadd_ps(X86.Sse.set1_ps(IntrinsicUtils.getFloatLane(zPixelDx, triIdx)),
-                                                           SimdSubTileColOffsetF,
-                                                           IntrinsicUtils._mmw_fmadd_ps(X86.Sse.set1_ps(IntrinsicUtils.getFloatLane(zPixelDy, triIdx)),
-                                                           X86.Sse2.setzero_si128(),
-                                                           X86.Sse.set1_ps(IntrinsicUtils.getFloatLane(zPlaneOffset, triIdx))));
+                    v128 SimdSubTileColOffsetF = new v128(0f, BufferGroup.SubTileWidth, BufferGroup.SubTileWidth * 2, BufferGroup.SubTileWidth * 3);
+                    v128 z0 = Arm.Neon.vmlaq_f32(Arm.Neon.vmlaq_f32(
+                                                            new v128(IntrinsicUtils.getFloatLane(zPlaneOffset, triIdx)),
+                                                            new v128(IntrinsicUtils.getFloatLane(zPixelDy, triIdx)),
+                                                            zero),
+                                                new v128(IntrinsicUtils.getFloatLane(zPixelDx, triIdx)),
+                                                SimdSubTileColOffsetF);
 
                     float zx = IntrinsicUtils.getFloatLane(zTileDx, triIdx);
                     float zy = IntrinsicUtils.getFloatLane(zTileDy, triIdx);
@@ -448,84 +1320,129 @@ vertex and compute tile */
                     {
                         if (triMidVtxRight != 0)
                         {
-                            RasterizeTriangle(tiles, true, 1, triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, ref z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
+                            RasterizeTriangleNEON(tiles, true, 1, triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, ref z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
                         }
                         else
                         {
-                            RasterizeTriangle(tiles, true, 0, triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, ref z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
+                            RasterizeTriangleNEON(tiles, true, 0, triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, ref z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
                         }
                     }
                     else
                     {
                         if (triMidVtxRight != 0)
                         {
-                            RasterizeTriangle(tiles, false, 1, triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, ref z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
+                            RasterizeTriangleNEON(tiles, false, 1, triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, ref z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
                         }
                         else
                         {
-                            RasterizeTriangle(tiles, false, 0, triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, ref z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
+                            RasterizeTriangleNEON(tiles, false, 0, triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, ref z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
                         }
                     }
                 }
             }
         }
 
-        void ComputeBoundingBox(v128* vX, v128* vY, ref ScissorRect scissor, out v128 bbminX, out v128 bbminY, out v128 bbmaxX, out v128 bbmaxY)
+        // RasterizeMesh now gets as input all triangles that already passed backface culling, clipping and an early z test
+        // this should make it even easier for the system to keep using the full simd words and have better simd occupancy
+        // instead of removing part of the work based on the mask
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void RasterizeMesh(Tile* tiles, float* binTriangleX, float* binTriangleY, float* binTriangleW, int numVert, ScissorRect screenScissor)
         {
-            if (X86.Sse4_1.IsSse41Supported)
+            X86.MXCSRBits OldBits = X86.MXCSRBits.RoundToNearest;
+            if (X86.Sse2.IsSse2Supported)
             {
-                // Find Min/Max vertices
-                bbminX = X86.Sse2.cvttps_epi32(X86.Sse.min_ps(vX[0], X86.Sse.min_ps(vX[1], vX[2])));
-                bbminY = X86.Sse2.cvttps_epi32(X86.Sse.min_ps(vY[0], X86.Sse.min_ps(vY[1], vY[2])));
-                bbmaxX = X86.Sse2.cvttps_epi32(X86.Sse.max_ps(vX[0], X86.Sse.max_ps(vX[1], vX[2])));
-                bbmaxY = X86.Sse2.cvttps_epi32(X86.Sse.max_ps(vY[0], X86.Sse.max_ps(vY[1], vY[2])));
+                // Intel implementation needs a rounding workaround
+                OldBits = X86.MXCSR;
+                // DS: TODO: UNITY BURST FIX
+                //using (var roundingMode = new X86.RoundingScope(X86.MXCSRBits.RoundToNearest))
+                const X86.MXCSRBits roundingMode = X86.MXCSRBits.RoundToNearest;
+                X86.MXCSR = (OldBits & ~X86.MXCSRBits.RoundingControlMask) | roundingMode;
+            }
 
-                // Clamp to tile boundaries
-                v128 SimdPadWMask = X86.Sse2.set1_epi32(~(BufferGroup.TileWidth - 1));
-                v128 SimdPadHMask = X86.Sse2.set1_epi32(~(BufferGroup.TileHeight - 1));
-                bbminX = X86.Sse2.and_si128(bbminX, SimdPadWMask);
-                bbmaxX = X86.Sse2.and_si128(X86.Sse2.add_epi32(bbmaxX, X86.Sse2.set1_epi32(BufferGroup.TileWidth)), SimdPadWMask);
-                bbminY = X86.Sse2.and_si128(bbminY, SimdPadHMask);
-                bbmaxY = X86.Sse2.and_si128(X86.Sse2.add_epi32(bbmaxY, X86.Sse2.set1_epi32(BufferGroup.TileHeight)), SimdPadHMask);
+            int vertexIndex = 0;
 
-                // Clip to scissor
-                bbminX = X86.Sse4_1.max_epi32(bbminX, X86.Sse2.set1_epi32(scissor.mMinX));
-                bbmaxX = X86.Sse4_1.min_epi32(bbmaxX, X86.Sse2.set1_epi32(scissor.mMaxX));
-                bbminY = X86.Sse4_1.max_epi32(bbminY, X86.Sse2.set1_epi32(scissor.mMinY));
-                bbmaxY = X86.Sse4_1.min_epi32(bbmaxY, X86.Sse2.set1_epi32(scissor.mMaxY));
-            }
-            else
-            {
-                bbminX = default(v128);
-                bbminY = default(v128);
-                bbmaxX = default(v128);
-                bbmaxY = default(v128);
-            }
-        }
+            v128* vtxX_prealloc = stackalloc v128[3];
+            v128* vtxY_prealloc = stackalloc v128[3];
+            v128* vtxW_prealloc = stackalloc v128[3];
 
-        void ComputeDepthPlane(v128* pVtxX, v128* pVtxY, v128* pVtxZ, out v128 zPixelDx, out v128 zPixelDy)
-        {
-            if (X86.Sse.IsSseSupported)
-            {
-                // Setup z(x,y) = z0 + dx*x + dy*y screen space depth plane equation
-                v128 x2 = X86.Sse.sub_ps(pVtxX[2], pVtxX[0]);
-                v128 x1 = X86.Sse.sub_ps(pVtxX[1], pVtxX[0]);
-                v128 y1 = X86.Sse.sub_ps(pVtxY[1], pVtxY[0]);
-                v128 y2 = X86.Sse.sub_ps(pVtxY[2], pVtxY[0]);
-                v128 z1 = X86.Sse.sub_ps(pVtxZ[1], pVtxZ[0]);
-                v128 z2 = X86.Sse.sub_ps(pVtxZ[2], pVtxZ[0]);
-                v128 d = X86.Sse.div_ps(X86.Sse.set1_ps(1.0f), IntrinsicUtils._mmw_fmsub_ps(x1, y2, X86.Sse.mul_ps(y1, x2)));
-                zPixelDx = X86.Sse.mul_ps(IntrinsicUtils._mmw_fmsub_ps(z1, y2, X86.Sse.mul_ps(y1, z2)), d);
-                zPixelDy = X86.Sse.mul_ps(IntrinsicUtils._mmw_fmsub_ps(x1, z2, X86.Sse.mul_ps(z1, x2)), d);
-            }
-            else
+            v128* pVtxX_prealloc = stackalloc v128[3];
+            v128* pVtxY_prealloc = stackalloc v128[3];
+            v128* pVtxZ_prealloc = stackalloc v128[3];
+
+            v128* ipVtxX_prealloc = stackalloc v128[3];
+            v128* ipVtxY_prealloc = stackalloc v128[3];
+
+            while (vertexIndex < numVert)
             {
-                zPixelDx = default(v128);
-                zPixelDy = default(v128);
+                v128* vtxX = vtxX_prealloc;
+                v128* vtxY = vtxY_prealloc;
+                v128* vtxW = vtxW_prealloc;
+
+                int numLanes = math.min(SIMD_LANES, numVert - vertexIndex);
+                uint triMask = (1u << numLanes) - 1;
+
+                for (int i = 0; i < 3; i++)
+                {
+                    if (X86.Sse2.IsSse2Supported)
+                    {
+                        vtxX[i] = X86.Sse.load_ps(&binTriangleX[vertexIndex + i * 4]);
+                        vtxY[i] = X86.Sse.load_ps(&binTriangleY[vertexIndex + i * 4]);
+                        vtxW[i] = X86.Sse.load_ps(&binTriangleW[vertexIndex + i * 4]);
+                    }
+                    else if (Arm.Neon.IsNeonSupported)
+                    {
+                        vtxX[i] = Arm.Neon.vld1q_f32(&binTriangleX[vertexIndex + i * 4]);
+                        vtxY[i] = Arm.Neon.vld1q_f32(&binTriangleY[vertexIndex + i * 4]);
+                        vtxW[i] = Arm.Neon.vld1q_f32(&binTriangleW[vertexIndex + i * 4]);
+                    }
+                }
+
+                vertexIndex += SIMD_LANES * 3;
+
+                if (triMask == 0x0)
+                {
+                    continue;
+                }
+
+                /* Project and transform to screen space. Note
+                    that we use z = 1.0 / vtx.w for depth, which means that z = 0 is far and
+                    z = 1/m_near is near. For ortho projection, we do z = (z * -1) + 1 to go from z = 0 for far and z = 2 for near
+
+                    We must also use a greater than depth test, and in effect
+                    everything is reversed compared to regular z implementations. */
+
+                v128* pVtxX = pVtxX_prealloc;
+                v128* pVtxY = pVtxY_prealloc;
+                v128* pVtxZ = pVtxZ_prealloc;
+
+                v128* ipVtxX = ipVtxX_prealloc;
+                v128* ipVtxY = ipVtxY_prealloc;
+                if (X86.Sse2.IsSse2Supported)
+                {
+                    ProjectVerticesSSE(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW);
+                }
+                else if (Arm.Neon.IsNeonSupported)
+                {
+                    ProjectVerticesNEON(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW);
+                }
+
+                /* Setup and rasterize a SIMD batch of triangles */
+                if (X86.Sse2.IsSse2Supported)
+                {
+                    RasterizeTriangleBatchSSE(tiles, ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, triMask, screenScissor);
+                    // Don't forget to restore the rounding mode
+                    // DS: TODO: UNITY BURST FIX
+                    X86.MXCSR = OldBits;
+                }
+                else if (Arm.Neon.IsNeonSupported)
+                {
+                    RasterizeTriangleBatchNEON(tiles, ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, triMask, screenScissor);
+                }
             }
         }
 
-        void RasterizeTriangle(
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        void RasterizeTriangleNEON(
             Tile* tiles,
             bool isTightTraversal,
             int midVtxRight,
@@ -548,7 +1465,7 @@ void RasterizeTriangle(
             v128* eventStartRemainder,
             v128* slopeTileRemainder)
         {
-            if (X86.Sse4_1.IsSse41Supported)
+            if (Arm.Neon.IsNeonSupported)
             {
                 const int LEFT_EDGE_BIAS = -1;
                 const int RIGHT_EDGE_BIAS = 1;
@@ -562,33 +1479,25 @@ void RasterizeTriangle(
 
                 for (int i = 0; i < 3; ++i)
                 {
-                    triSlopeSign[i] = X86.Sse2.set1_epi32(IntrinsicUtils.getIntLane(slopeSign[i], triIdx));
-                    triSlopeTileDelta[i] =
-                        X86.Sse2.set1_epi32(IntrinsicUtils.getIntLane(slopeTileDelta[i], triIdx));
-                    triEdgeY[i] = X86.Sse2.set1_epi32(IntrinsicUtils.getIntLane(edgeY[i], triIdx));
-                    triSlopeTileRemainder[i] =
-                        X86.Sse2.set1_epi32(IntrinsicUtils.getIntLane(slopeTileRemainder[i], triIdx));
-
-                    v128 triSlope = X86.Sse.set1_ps(IntrinsicUtils.getFloatLane(slope[i], triIdx));
-                    v128 triAbsEdgeX = X86.Sse2.set1_epi32(IntrinsicUtils.getIntLane(absEdgeX[i], triIdx));
-                    v128 triStartRemainder =
-                        X86.Sse2.set1_epi32(IntrinsicUtils.getIntLane(eventStartRemainder[i], triIdx));
-                    v128 triEventStart = X86.Sse2.set1_epi32(IntrinsicUtils.getIntLane(eventStart[i], triIdx));
-
-                    v128 SimdLaneYCoordF = X86.Sse.setr_ps(128f, 384f, 640f, 896f);
-                    v128 scanlineDelta = X86.Sse2.cvttps_epi32(X86.Sse.mul_ps(triSlope, SimdLaneYCoordF));
-                    v128 SimdLaneYCoordI = X86.Sse2.setr_epi32(128, 384, 640, 896);
-                    v128 scanlineSlopeRemainder =
-                        X86.Sse2.sub_epi32(X86.Sse4_1.mullo_epi32(triAbsEdgeX, SimdLaneYCoordI),
-                            X86.Sse4_1.mullo_epi32(X86.Ssse3.abs_epi32(scanlineDelta), triEdgeY[i]));
-
-                    triEventRemainder[i] = X86.Sse2.sub_epi32(triStartRemainder, scanlineSlopeRemainder);
-                    v128 overflow = X86.Sse2.srai_epi32(triEventRemainder[i], 31);
-                    triEventRemainder[i] =
-                        X86.Sse2.add_epi32(triEventRemainder[i], X86.Sse2.and_si128(overflow, triEdgeY[i]));
-                    triEvent[i] =
-                        X86.Sse2.add_epi32(X86.Sse2.add_epi32(triEventStart, scanlineDelta),
-                            X86.Sse2.and_si128(overflow, triSlopeSign[i]));
+                    triSlopeSign[i] = new v128(IntrinsicUtils.getIntLane(slopeSign[i], triIdx));
+                    triSlopeTileDelta[i] = new v128(IntrinsicUtils.getIntLane(slopeTileDelta[i], triIdx));
+                    triEdgeY[i] = new v128(IntrinsicUtils.getIntLane(edgeY[i], triIdx));
+                    triSlopeTileRemainder[i] = new v128(IntrinsicUtils.getIntLane(slopeTileRemainder[i], triIdx));
+
+                    v128 triSlope = new v128(IntrinsicUtils.getFloatLane(slope[i], triIdx));
+                    v128 triAbsEdgeX = new v128(IntrinsicUtils.getIntLane(absEdgeX[i], triIdx));
+                    v128 triStartRemainder = new v128(IntrinsicUtils.getIntLane(eventStartRemainder[i], triIdx));
+                    v128 triEventStart = new v128(IntrinsicUtils.getIntLane(eventStart[i], triIdx));
+
+                    v128 SimdLaneYCoordF = new v128(128f, 384f, 640f, 896f);
+                    v128 scanlineDelta = Arm.Neon.vcvtnq_s32_f32(Arm.Neon.vmulq_f32(triSlope, SimdLaneYCoordF));
+                    v128 SimdLaneYCoordI = new v128(128, 384, 640, 896);
+                    v128 scanlineSlopeRemainder = Arm.Neon.vsubq_s32(Arm.Neon.vmulq_u32(triAbsEdgeX, SimdLaneYCoordI), Arm.Neon.vmulq_u32(Arm.Neon.vabsq_s32(scanlineDelta), triEdgeY[i]));
+
+                    triEventRemainder[i] = Arm.Neon.vsubq_s32(triStartRemainder, scanlineSlopeRemainder);
+                    v128 overflow = Arm.Neon.vshrq_n_s32(triEventRemainder[i], 31);
+                    triEventRemainder[i] = Arm.Neon.vaddq_s32(triEventRemainder[i], Arm.Neon.vandq_s8(overflow, triEdgeY[i]));
+                    triEvent[i] = Arm.Neon.vaddq_s32(Arm.Neon.vaddq_s32(triEventStart, scanlineDelta), Arm.Neon.vandq_s8(overflow, triSlopeSign[i]));
                 }
 
                 // For big triangles track start & end tile for each scanline and only traverse the valid region
@@ -604,17 +1513,17 @@ void RasterizeTriangle(
                     startDelta = IntrinsicUtils.getIntLane(slopeTileDelta[2], triIdx) + LEFT_EDGE_BIAS;
                     endDelta = IntrinsicUtils.getIntLane(slopeTileDelta[0], triIdx) + RIGHT_EDGE_BIAS;
                     topDelta = IntrinsicUtils.getIntLane(slopeTileDelta[1], triIdx) +
-                               (midVtxRight != 0 ? RIGHT_EDGE_BIAS : LEFT_EDGE_BIAS);
+                                (midVtxRight != 0 ? RIGHT_EDGE_BIAS : LEFT_EDGE_BIAS);
 
                     // Compute conservative bounds for the edge events over a 32xN tile
                     startEvent = IntrinsicUtils.getIntLane(eventStart[2], triIdx) + Mathf.Min(0, startDelta);
                     endEvent = IntrinsicUtils.getIntLane(eventStart[0], triIdx) + Mathf.Max(0, endDelta) +
-                               (BufferGroup.TileWidth << BufferGroup.FpBits); // TODO: (Apoorva) can be spun out into a const
+                                (BufferGroup.TileWidth << BufferGroup.FpBits); // TODO: (Apoorva) can be spun out into a const
 
                     if (midVtxRight != 0)
                     {
                         topEvent = IntrinsicUtils.getIntLane(eventStart[1], triIdx) + Mathf.Max(0, topDelta) +
-                                   (BufferGroup.TileWidth << BufferGroup.FpBits); // TODO: (Apoorva) can be spun out into a const
+                                    (BufferGroup.TileWidth << BufferGroup.FpBits); // TODO: (Apoorva) can be spun out into a const
                     }
                     else
                     {
@@ -635,24 +1544,23 @@ void RasterizeTriangle(
                         if (isTightTraversal)
                         {
                             // Compute tighter start and endpoints to avoid traversing empty space
-                            start = Mathf.Max(0, Mathf.Min(bbWidth - 1, startEvent >> (BufferGroup.TileWidthShift + BufferGroup.FpBits))); // TODO: (Apoorva) can be spun out into a const
-                            end = Mathf.Min(bbWidth, ((int)endEvent >> (BufferGroup.TileWidthShift + BufferGroup.FpBits))); // TODO: (Apoorva) can be spun out into a const
+                            start = Mathf.Max(0, Mathf.Min(bbWidth - 1, startEvent >> (BufferGroup.TileWidthShift + BufferGroup.FpBits)));
+                            end = Mathf.Min(bbWidth, (endEvent >> (BufferGroup.TileWidthShift + BufferGroup.FpBits)));
 
                             startEvent += startDelta;
                             endEvent += endDelta;
                         }
 
                         // Traverse the scanline and update the masked hierarchical z buffer
-                        TraverseScanline(tiles, 1, 1, start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax, z0,
-                            zx);
+                        TraverseScanlineNEON(tiles, 1, 1, start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax, z0, zx);
 
                         // move to the next scanline of tiles, update edge events and interpolate z
                         tileRowIdx += NumTilesX;
-                        z0 = X86.Sse.add_ps(z0, X86.Sse.set1_ps(zy));
+                        z0 = Arm.Neon.vaddq_f32(z0, new v128(zy));
 
-                        UpdateTileEventsY(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
+                        UpdateTileEventsYNEON(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
                             triSlopeTileDelta, triSlopeSign, 0);
-                        UpdateTileEventsY(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
+                        UpdateTileEventsYNEON(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
                             triSlopeTileDelta, triSlopeSign, 2);
                     }
 
@@ -665,8 +1573,8 @@ void RasterizeTriangle(
                         if (isTightTraversal)
                         {
                             // Compute tighter start and endpoints to avoid traversing lots of empty space
-                            start = Mathf.Max(0, Mathf.Min(bbWidth - 1, startEvent >> (BufferGroup.TileWidthShift + BufferGroup.FpBits))); // TODO: (Apoorva) can be spun out into a const
-                            end = Mathf.Min(bbWidth, ((int)endEvent >> (BufferGroup.TileWidthShift + BufferGroup.FpBits))); // TODO: (Apoorva) can be spun out into a const
+                            start = Mathf.Max(0, Mathf.Min(bbWidth - 1, startEvent >> (BufferGroup.TileWidthShift + BufferGroup.FpBits)));
+                            end = Mathf.Min(bbWidth, (endEvent >> (BufferGroup.TileWidthShift + BufferGroup.FpBits)));
 
                             // Switch the traversal start / end to account for the upper side edge
                             endEvent = midVtxRight != 0 ? topEvent : endEvent;
@@ -681,12 +1589,12 @@ void RasterizeTriangle(
                         // Traverse the scanline and update the masked hierarchical z buffer.
                         if (midVtxRight != 0)
                         {
-                            TraverseScanline(tiles, 2, 1, start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax,
+                            TraverseScanlineNEON(tiles, 2, 1, start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax,
                                 z0, zx);
                         }
                         else
                         {
-                            TraverseScanline(tiles, 1, 2, start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax,
+                            TraverseScanlineNEON(tiles, 1, 2, start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax,
                                 z0, zx);
                         }
 
@@ -697,13 +1605,13 @@ void RasterizeTriangle(
                     if (tileRowIdx < tileEndRowIdx)
                     {
                         // move to the next scanline of tiles, update edge events and interpolate z
-                        z0 = X86.Sse.add_ps(z0, X86.Sse.set1_ps(zy));
+                        z0 = Arm.Neon.vaddq_f32(z0, new v128(zy));
                         int i0 = midVtxRight + 0;
                         int i1 = midVtxRight + 1;
 
-                        UpdateTileEventsY(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
+                        UpdateTileEventsYNEON(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
                             triSlopeTileDelta, triSlopeSign, i0);
-                        UpdateTileEventsY(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
+                        UpdateTileEventsYNEON(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
                             triSlopeTileDelta, triSlopeSign, i1);
 
                         for (; ; )
@@ -722,7 +1630,7 @@ void RasterizeTriangle(
                             }
 
                             // Traverse the scanline and update the masked hierarchical z buffer
-                            TraverseScanline(tiles, 1, 1, start, end, tileRowIdx, midVtxRight + 0,
+                            TraverseScanlineNEON(tiles, 1, 1, start, end, tileRowIdx, midVtxRight + 0,
                                 midVtxRight + 1, triEvent, zTriMin, zTriMax, z0, zx);
 
                             // move to the next scanline of tiles, update edge events and interpolate z
@@ -732,11 +1640,11 @@ void RasterizeTriangle(
                                 break;
                             }
 
-                            z0 = X86.Sse.add_ps(z0, X86.Sse.set1_ps(zy));
+                            z0 = Arm.Neon.vaddq_f32(z0, new v128(zy));
 
-                            UpdateTileEventsY(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
+                            UpdateTileEventsYNEON(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
                                 triSlopeTileDelta, triSlopeSign, i0);
-                            UpdateTileEventsY(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
+                            UpdateTileEventsYNEON(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
                                 triSlopeTileDelta, triSlopeSign, i1);
                         }
                     }
@@ -774,7 +1682,7 @@ void RasterizeTriangle(
                             }
 
                             // Traverse the scanline and update the masked hierarchical z buffer
-                            TraverseScanline(tiles, 1, 1, start, end, tileRowIdx, midVtxRight + 0,
+                            TraverseScanlineNEON(tiles, 1, 1, start, end, tileRowIdx, midVtxRight + 0,
                                 midVtxRight + 1, triEvent, zTriMin, zTriMax, z0, zx);
 
                             // move to the next scanline of tiles, update edge events and interpolate z
@@ -784,208 +1692,118 @@ void RasterizeTriangle(
                                 break;
                             }
 
-                            z0 = X86.Sse.add_ps(z0, X86.Sse.set1_ps(zy));
+                            z0 = Arm.Neon.vaddq_f32(z0, new v128(zy));
 
-                            UpdateTileEventsY(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
+                            UpdateTileEventsYNEON(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
                                 triSlopeTileDelta, triSlopeSign, i0);
-                            UpdateTileEventsY(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
+                            UpdateTileEventsYNEON(triEventRemainder, triSlopeTileRemainder, triEdgeY, triEvent,
                                 triSlopeTileDelta, triSlopeSign, i1);
                         }
                     }
                 }
             }
         }
+        #endregion
 
-        void TraverseScanline(Tile* tiles, int numRight, int numLeft, int leftOffset, int rightOffset, int tileIdx, int rightEvent, int leftEvent, v128* events, v128 zTriMin, v128 zTriMax, v128 iz0, float zx)
+        public void Execute(int i)
         {
-            if (X86.Sse4_1.IsSse41Supported)
-            {
-                // Floor edge events to integer pixel coordinates (shift out fixed point bits)
-                int eventOffset = leftOffset << BufferGroup.TileWidthShift;
+            var tiles = &TilesBasePtr[0];
+            ScissorRect scissorRect = new ScissorRect();
 
-                v128* right = stackalloc v128[numRight];
-                v128* left = stackalloc v128[numLeft];
+            int2 pixelsPerTile = new int2(NumPixelsX / NumTilesX, NumPixelsY / NumTilesY);
 
-                for (int i = 0; i < numRight; ++i)
-                {
-                    right[i] = X86.Sse4_1.max_epi32(X86.Sse2.sub_epi32(X86.Sse2.srai_epi32(events[rightEvent + i], BufferGroup.FpBits), X86.Sse2.set1_epi32(eventOffset)), X86.Sse2.setzero_si128());
-                }
+            float* temp_stack_x = stackalloc float[12];
+            float* temp_stack_y = stackalloc float[12];
+            float* temp_stack_w = stackalloc float[12];
+            int tempStackSize = 0;
 
-                for (int i = 0; i < numLeft; ++i)
-                {
-                    left[i] = X86.Sse4_1.max_epi32(X86.Sse2.sub_epi32(X86.Sse2.srai_epi32(events[leftEvent - i], BufferGroup.FpBits), X86.Sse2.set1_epi32(eventOffset)), X86.Sse2.setzero_si128());
-                }
+            int countOfTilesX = NumTilesX / TilesPerBinX;
+            scissorRect.mMinX = (i % countOfTilesX) * pixelsPerTile.x * TilesPerBinX;
+            scissorRect.mMaxX = scissorRect.mMinX + pixelsPerTile.x * TilesPerBinX;
+            scissorRect.mMinY = (i / countOfTilesX) * pixelsPerTile.y * TilesPerBinY;
+            scissorRect.mMaxY = scissorRect.mMinY + pixelsPerTile.y * TilesPerBinY;
 
-                v128 z0 = X86.Sse.add_ps(iz0, X86.Sse.set1_ps(zx * leftOffset));
-                int tileIdxEnd = tileIdx + rightOffset;
-                tileIdx += leftOffset;
+            float4 clipRect = new float4(scissorRect.mMinX, scissorRect.mMinY, scissorRect.mMaxX, scissorRect.mMaxY);
+            clipRect = (2 * clipRect.xyzw / (new float2(NumPixelsX, NumPixelsY).xyxy) - 1);
 
-                for (; ; )
-                {
-                    // Compute zMin for the overlapped layers
-                    v128 mask = tiles[tileIdx].mask;
-                    v128 zMin0 = X86.Sse4_1.blendv_ps(tiles[tileIdx].zMin0, tiles[tileIdx].zMin1, X86.Sse2.cmpeq_epi32(mask, X86.Sse2.set1_epi32(~0)));
-                    v128 zMin1 = X86.Sse4_1.blendv_ps(tiles[tileIdx].zMin1, tiles[tileIdx].zMin0, X86.Sse2.cmpeq_epi32(mask, X86.Sse2.setzero_si128()));
-                    v128 zMinBuf = X86.Sse.min_ps(zMin0, zMin1);
-                    v128 dist0 = X86.Sse.sub_ps(zTriMax, zMinBuf);
+            int bufferIndex = ( WorkerIndex - 1 );
+            float* binTriangleX = (float*)BinTriangleXBasePtr + BinSize * bufferIndex;
+            float* binTriangleY = (float*)BinTriangleYBasePtr + BinSize * bufferIndex;
+            float* binTriangleW = (float*)BinTriangleWBasePtr + BinSize * bufferIndex;
 
-                    if (X86.Sse.movemask_ps(dist0) != SIMD_ALL_LANES_MASK)
-                    {
-                        // Compute coverage mask for entire 32xN using shift operations
-                        v128 accumulatedMask = IntrinsicUtils._mmw_sllv_ones(left[0]);
+            // For each mesh
+            // if the mesh aabb is inside the bin aabb
+            // check all each triangle and test against the bin aabb
+            // if inside the bin, add in, once the bin is full render it
+            // once the loop finish, render the remaining triangles in the bin
+            int internalBinSize = 0;
+            for (int m = 0; m < ClippedOccluders.Length; m += 1)
+            {
+                float2 max = ClippedOccluders[m].screenMax.xy;
+                float2 min = ClippedOccluders[m].screenMin.xy;
 
-                        for (int i = 1; i < numLeft; ++i)
-                        {
-                            accumulatedMask = X86.Sse2.and_si128(accumulatedMask, IntrinsicUtils._mmw_sllv_ones(left[i]));
-                        }
+                if (math.any(min > clipRect.zw) || math.any(max < clipRect.xy))
+                    continue;
 
-                        for (int i = 0; i < numRight; ++i)
-                        {
-                            accumulatedMask = X86.Sse2.andnot_si128(IntrinsicUtils._mmw_sllv_ones(right[i]), accumulatedMask);
-                        }
+                ClippedOccluder clipped = ClippedOccluders[m];
 
-                        // Compute interpolated min for each 8x4 subtile and update the masked hierarchical z buffer entry
-                        v128 zSubTileMin = X86.Sse.max_ps(z0, zTriMin);
-                        UpdateTileAccurate(tiles, tileIdx, IntrinsicUtils._mmw_transpose_epi8(accumulatedMask), zSubTileMin);
-                    }
+                int k = 0;
+                for (int j = 0; j < clipped.expandedVertexSize; j += 3, ++k)
+                {
+                    float4 triExtents = ClippedTriExtents[clipped.sourceIndexOffset * 2 + k];
+                    min = triExtents.xy;
+                    max = triExtents.zw;
 
-                    // Update buffer address, interpolate z and edge events
-                    tileIdx++;
+                    if (math.any(min > clipRect.zw) || math.any(max < clipRect.xy))
+                        continue;
 
-                    if (tileIdx >= tileIdxEnd)
+                    for (int n = 0; n < 3; ++n)
                     {
-                        break;
+                        float3 vert = ClippedVerts[clipped.sourceIndexOffset * 6 + j + n];
+                        temp_stack_x[tempStackSize] = vert.x;
+                        temp_stack_y[tempStackSize] = vert.y;
+                        temp_stack_w[tempStackSize] = vert.z;
+                        tempStackSize++;
                     }
 
-                    z0 = X86.Sse.add_ps(z0, X86.Sse.set1_ps(zx));
-
-                    v128 SimdTileWidth = X86.Sse2.set1_epi32(BufferGroup.TileWidth);
-
-                    for (int i = 0; i < numRight; ++i)
+                    if (tempStackSize == 12)
                     {
-                        right[i] = X86.Sse2.subs_epu16(right[i], SimdTileWidth);  // Trick, use sub saturated to avoid checking against < 0 for shift (values should fit in 16 bits)
+                        for (int n = 0; n < 3; ++n)
+                        {
+                            for (int p = 0; p < 4; ++p)
+                            {
+                                binTriangleX[internalBinSize + p + n * 4] = temp_stack_x[n + p * 3];
+                                binTriangleY[internalBinSize + p + n * 4] = temp_stack_y[n + p * 3];
+                                binTriangleW[internalBinSize + p + n * 4] = temp_stack_w[n + p * 3];
+                            }
+                        }
+                        internalBinSize += 12;
+                        tempStackSize = 0;
                     }
-
-                    for (int i = 0; i < numLeft; ++i)
+                    if (internalBinSize == BinSize)
                     {
-                        left[i] = X86.Sse2.subs_epu16(left[i], SimdTileWidth);
+                        RasterizeMesh(tiles, binTriangleX, binTriangleY, binTriangleW, internalBinSize, scissorRect);
+                        internalBinSize = 0;
                     }
                 }
             }
-        }
-
-        void UpdateTileAccurate(Tile* tiles, int tileIdx, v128 coverage, v128 zTriv)
-        {
-            if (X86.Sse4_1.IsSse41Supported)
+            if (tempStackSize > 0)
             {
-                v128 zMin0 = tiles[tileIdx].zMin0;
-                v128 zMin1 = tiles[tileIdx].zMin1;
-                v128 mask = tiles[tileIdx].mask;
-
-                // Swizzle coverage mask to 8x4 subtiles
-                v128 rastMask = coverage;
-
-                // Perform individual depth tests with layer 0 & 1 and mask out all failing pixels
-                v128 sdist0 = X86.Sse.sub_ps(zMin0, zTriv);
-                v128 sdist1 = X86.Sse.sub_ps(zMin1, zTriv);
-                v128 sign0 = X86.Sse2.srai_epi32(sdist0, 31);
-                v128 sign1 = X86.Sse2.srai_epi32(sdist1, 31);
-                v128 triMask = X86.Sse2.and_si128(rastMask, X86.Sse2.or_si128(X86.Sse2.andnot_si128(mask, sign0), X86.Sse2.and_si128(mask, sign1)));
-
-                // Early out if no pixels survived the depth test (this test is more accurate than
-                // the early culling test in TraverseScanline())
-                v128 t0 = X86.Sse2.cmpeq_epi32(triMask, X86.Sse2.setzero_si128());
-                v128 t0inv = /*not_epi32*/ X86.Sse2.xor_si128(t0, X86.Sse2.set1_epi32(~0));
-
-                if (X86.Sse4_1.testz_si128(t0inv, t0inv) != 0)
+                for (int n = 0; n < 3; ++n)
                 {
-                    return;
+                    for (int p = 0; p < 4; ++p)
+                    {
+                        binTriangleX[internalBinSize + p + n * 4] = temp_stack_x[n + p * 3];
+                        binTriangleY[internalBinSize + p + n * 4] = temp_stack_y[n + p * 3];
+                        binTriangleW[internalBinSize + p + n * 4] = temp_stack_w[n + p * 3];
+                    }
                 }
-
-#if MOC_ENABLE_STATS
-            STATS_ADD(ref mStats.mOccluders.mNumTilesUpdated, 1);
-#endif
-
-                v128 zTri = X86.Sse4_1.blendv_ps(zTriv, zMin0, t0);
-
-                // Test if incoming triangle completely overwrites layer 0 or 1
-                v128 layerMask0 = X86.Sse2.andnot_si128(triMask, /*not_epi32*/ X86.Sse2.xor_si128(mask, X86.Sse2.set1_epi32(~0)));
-                v128 layerMask1 = X86.Sse2.andnot_si128(triMask, mask);
-                v128 lm0 = X86.Sse2.cmpeq_epi32(layerMask0, X86.Sse2.setzero_si128());
-                v128 lm1 = X86.Sse2.cmpeq_epi32(layerMask1, X86.Sse2.setzero_si128());
-                v128 z0 = X86.Sse4_1.blendv_ps(zMin0, zTri, lm0);
-                v128 z1 = X86.Sse4_1.blendv_ps(zMin1, zTri, lm1);
-
-                // Compute distances used for merging heuristic
-                v128 d0 = /*abs_ps*/ X86.Sse.and_ps(sdist0, X86.Sse2.set1_epi32(0x7FFFFFFF));
-                v128 d1 = /*abs_ps*/ X86.Sse.and_ps(sdist1, X86.Sse2.set1_epi32(0x7FFFFFFF));
-                v128 d2 = /*abs_ps*/ X86.Sse.and_ps(X86.Sse.sub_ps(z0, z1), X86.Sse2.set1_epi32(0x7FFFFFFF));
-
-                // Find minimum distance
-                v128 c01 = X86.Sse.sub_ps(d0, d1);
-                v128 c02 = X86.Sse.sub_ps(d0, d2);
-                v128 c12 = X86.Sse.sub_ps(d1, d2);
-                // Two tests indicating which layer the incoming triangle will merge with or
-                // overwrite. d0min indicates that the triangle will overwrite layer 0, and
-                // d1min flags that the triangle will overwrite layer 1.
-                v128 d0min = X86.Sse2.or_si128(X86.Sse2.and_si128(c01, c02), X86.Sse2.or_si128(lm0, t0));
-                v128 d1min = X86.Sse2.andnot_si128(d0min, X86.Sse2.or_si128(c12, lm1));
-
-                /* Update depth buffer entry. NOTE: we always merge into layer 0, so if the
-                   triangle should be merged with layer 1, we first swap layer 0 & 1 and then
-                   merge into layer 0. */
-
-                // Update mask based on which layer the triangle overwrites or was merged into
-                v128 inner = X86.Sse4_1.blendv_ps(triMask, layerMask1, d0min);
-
-                // Update the zMin[0] value. There are four outcomes: overwrite with layer 1,
-                // merge with layer 1, merge with zTri or overwrite with layer 1 and then merge
-                // with zTri.
-                v128 e0 = X86.Sse4_1.blendv_ps(z0, z1, d1min);
-                v128 e1 = X86.Sse4_1.blendv_ps(z1, zTri, X86.Sse2.or_si128(d1min, d0min));
-
-                // Update the zMin[1] value. There are three outcomes: keep current value,
-                // overwrite with zTri, or overwrite with z1
-                v128 z1t = X86.Sse4_1.blendv_ps(zTri, z1, d0min);
-
-                tiles[tileIdx].zMin0 = X86.Sse.min_ps(e0, e1);
-                tiles[tileIdx].zMin1 = X86.Sse4_1.blendv_ps(z1t, z0, d1min);
-                tiles[tileIdx].mask = X86.Sse4_1.blendv_ps(inner, layerMask0, d1min);
-            }
-        }
-
-        void UpdateTileEventsY(v128* triEventRemainder, v128* triSlopeTileRemainder, v128* triEdgeY, v128* triEvent, v128* triSlopeTileDelta, v128* triSlopeSign, int i)
-        {
-            if (X86.Sse2.IsSse2Supported)
-            {
-                triEventRemainder[i] = X86.Sse2.sub_epi32(triEventRemainder[i], triSlopeTileRemainder[i]);
-                v128 overflow = X86.Sse2.srai_epi32(triEventRemainder[i], 31);
-                triEventRemainder[i] = X86.Sse2.add_epi32(triEventRemainder[i], X86.Sse2.and_si128(overflow, triEdgeY[i]));
-                triEvent[i] = X86.Sse2.add_epi32(triEvent[i], X86.Sse2.add_epi32(triSlopeTileDelta[i], X86.Sse2.and_si128(overflow, triSlopeSign[i])));
+                internalBinSize += tempStackSize;
+                tempStackSize = 0;
             }
-        }
-
-        void SortVertices(v128* vX, v128* vY)
-        {
-            if (X86.Sse4_1.IsSse41Supported)
+            if (internalBinSize > 0)
             {
-                // Rotate the triangle in the winding order until v0 is the vertex with lowest Y value
-                for (int i = 0; i < 2; i++)
-                {
-                    v128 ey1 = X86.Sse2.sub_epi32(vY[1], vY[0]);
-                    v128 ey2 = X86.Sse2.sub_epi32(vY[2], vY[0]);
-                    v128 swapMask = X86.Sse2.or_si128(X86.Sse2.or_si128(ey1, ey2), X86.Sse2.cmpeq_epi32(ey2, X86.Sse2.setzero_si128()));
-
-                    v128 sX = X86.Sse4_1.blendv_ps(vX[2], vX[0], swapMask);
-                    vX[0] = X86.Sse4_1.blendv_ps(vX[0], vX[1], swapMask);
-                    vX[1] = X86.Sse4_1.blendv_ps(vX[1], vX[2], swapMask);
-                    vX[2] = sX;
-
-                    v128 sY = X86.Sse4_1.blendv_ps(vY[2], vY[0], swapMask);
-                    vY[0] = X86.Sse4_1.blendv_ps(vY[0], vY[1], swapMask);
-                    vY[1] = X86.Sse4_1.blendv_ps(vY[1], vY[2], swapMask);
-                    vY[2] = sY;
-                }
+                RasterizeMesh(tiles, binTriangleX, binTriangleY, binTriangleW, internalBinSize, scissorRect);
             }
         }
     }
diff --git a/Unity.Entities.Graphics/Occlusion/Masked/TestJob.cs b/Unity.Entities.Graphics/Occlusion/Masked/TestJob.cs
index 4e04a5b..fa7a41f 100644
--- a/Unity.Entities.Graphics/Occlusion/Masked/TestJob.cs
+++ b/Unity.Entities.Graphics/Occlusion/Masked/TestJob.cs
@@ -1,5 +1,6 @@
 #if ENABLE_UNITY_OCCLUSION && (HDRP_10_0_0_OR_NEWER || URP_10_0_0_OR_NEWER)
 
+using System.Runtime.CompilerServices;
 using Unity.Burst;
 using Unity.Burst.Intrinsics;
 using Unity.Collections;
@@ -28,9 +29,7 @@ unsafe struct TestJob : IJobParallelForDefer
         [ReadOnly] public BatchCullingViewType ViewType;
         [ReadOnly] public int SplitIndex;
         [ReadOnly, NativeDisableUnsafePtrRestriction] public Tile* Tiles;
-#if UNITY_EDITOR
         [ReadOnly] public bool DisplayOnlyOccluded;
-#endif
 
         public void Execute(int index)
         {
@@ -76,7 +75,6 @@ the current chunk has any occlusion test jobs on it. */
             );
 
             bool chunkVisible = (chunkCullingResult == CullingResult.VISIBLE);
-#if UNITY_EDITOR
             /* If we want to invert occlusion for debug purposes, we want to draw _only_ occluded entities. For this, we
                want to run occlusion on every chunk, regardless of that chunk's test. A clearer but branch-ey way to
                write this is:
@@ -85,7 +83,6 @@ the current chunk has any occlusion test jobs on it. */
                    chunkVisible = true;
                } */
             chunkVisible |= DisplayOnlyOccluded;
-#endif
             if (!chunkVisible)
             {
                 /* The chunk's bounding box fails the visibility test, which means that it's either frustum culled or
@@ -136,7 +133,7 @@ might be frustum culled. So there's no need to process it further.
                        occlusion cull. */
                     bool entityAlreadyFrustumCulled =
                         ViewType == BatchCullingViewType.Light &&
-                        ((chunkVisibility->SplitMasks[entityIndex] & (1 <<SplitIndex)) == 0);
+                        ((chunkVisibility->SplitMasks[entityIndex] & (1 << SplitIndex)) == 0);
 
                     bool entityVisible = false;
                     if (!entityAlreadyFrustumCulled)
@@ -156,7 +153,6 @@ occlusion cull. */
                         entityVisible = (result == CullingResult.VISIBLE);
                     }
 
-#if UNITY_EDITOR
                     /* This effectively XORs the two booleans, and only flips visible when the inversion boolean is true. A
                        clearer but branch-ey way to write this is:
 
@@ -164,7 +160,6 @@ occlusion cull. */
                            entityVisible = !entityVisible;
                        } */
                     entityVisible = (entityVisible != DisplayOnlyOccluded);
-#endif
                     /* Set the index we just processed to zero, indicating that it's not pending any more */
                     pendingBitfield ^= 1ul << tzIndex;
                     /* Set entity's visibility according to our occlusion test */
@@ -191,7 +186,8 @@ This code will change once we handle all splits in the same job */
             }
         }
 
-        public static CullingResult TestRect(
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static CullingResult TestRectSSE(
             float2 min,
             float2 max,
             float wmin,
@@ -203,11 +199,6 @@ public static CullingResult TestRect(
             v128 PixelCenter
         )
         {
-            if (min.x > 1.0f || min.y > 1.0f || max.x < -1.0f || max.y < -1.0f)
-            {
-                return CullingResult.VIEW_CULLED;
-            }
-
             if (X86.Sse4_1.IsSse41Supported)
             {
                 // Compute screen space bounding box and guard for out of bounds
@@ -216,7 +207,7 @@ v128 PixelCenter
                 pixelBBoxi = X86.Sse4_1.max_epi32(X86.Sse2.setzero_si128(), X86.Sse4_1.min_epi32(ScreenSize, pixelBBoxi));
 
                 // Pad bounding box to (32xN) tiles. Tile BB is used for looping / traversal
-                v128 SimdTilePad = X86.Sse2.setr_epi32(0, BufferGroup.TileWidth, 0,  BufferGroup.TileHeight);
+                v128 SimdTilePad = X86.Sse2.setr_epi32(0, BufferGroup.TileWidth, 0, BufferGroup.TileHeight);
                 v128 SimdTilePadMask = X86.Sse2.setr_epi32(
                     ~(BufferGroup.TileWidth - 1),
                     ~(BufferGroup.TileWidth - 1),
@@ -269,11 +260,11 @@ v128 PixelCenter
                     zMax = X86.Sse.div_ps(X86.Sse.set1_ps(1f), X86.Sse.set1_ps(wmin));
                 }
 
-                for (; ; )
+                for (; tileRowIdx < tileRowIdxEnd; tileRowIdx += NumTilesX)
                 {
                     v128 pixelX = startPixelX;
 
-                    for (int tx = txMin; ;)
+                    for (int tx = txMin; tx < txMax; tx++)
                     {
                         int tileIdx = tileRowIdx + tx;
 
@@ -298,28 +289,173 @@ v128 PixelCenter
                             return CullingResult.VISIBLE;
                         }
 
-                        if (++tx >= txMax)
-                        {
-                            break;
-                        }
-
                         pixelX = X86.Sse2.add_epi32(pixelX, X86.Sse2.set1_epi32(BufferGroup.TileWidth));
                     }
 
-                    tileRowIdx += NumTilesX;
+                    pixelY = X86.Sse2.add_epi32(pixelY, X86.Sse2.set1_epi32(BufferGroup.TileHeight));
+                }
+
+                return CullingResult.OCCLUDED;
+            }
+            else
+            {
+                return CullingResult.VISIBLE;
+            }
+        }
 
-                    if (tileRowIdx >= tileRowIdxEnd)
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static CullingResult TestRectNEON(
+            float2 min,
+            float2 max,
+            float wmin,
+            Tile* tiles,
+            BatchCullingProjectionType projectionType,
+            int NumTilesX,
+            v128 ScreenSize,
+            v128 HalfSize,
+            v128 PixelCenter
+        )
+        {
+            if (Arm.Neon.IsNeonSupported)
+            {
+                v128 zero = new v128(0);
+                v128 oneF = new v128(1.0f);
+                v128 negOneF = new v128(-1.0f);
+                v128 fullMask = new v128(~0);
+                v128 wideTileWidth = new v128(BufferGroup.TileWidth);
+                v128 wideTileHeight = new v128(BufferGroup.TileHeight);
+
+                // Compute screen space bounding box and guard for out of bounds
+                v128 pixelBBox = Arm.Neon.vmlaq_f32(PixelCenter, new v128(min.x, max.x, max.y, min.y), HalfSize);
+                v128 pixelBBoxi = Arm.Neon.vcvtnq_s32_f32(pixelBBox);
+                pixelBBoxi = Arm.Neon.vmaxq_s32(zero, Arm.Neon.vminq_s32(ScreenSize, pixelBBoxi));
+
+                // Pad bounding box to (32xN) tiles. Tile BB is used for looping / traversal
+                v128 SimdTilePad = new v128(0, BufferGroup.TileWidth, 0, BufferGroup.TileHeight);
+                v128 SimdTilePadMask = new v128(
+                    ~(BufferGroup.TileWidth - 1),
+                    ~(BufferGroup.TileWidth - 1),
+                    ~(BufferGroup.TileHeight - 1),
+                    ~(BufferGroup.TileHeight - 1)
+                );
+                v128 tileBBoxi = Arm.Neon.vandq_s8(Arm.Neon.vaddq_s32(pixelBBoxi, SimdTilePad), SimdTilePadMask);
+
+                int txMin = tileBBoxi.SInt0 >> BufferGroup.TileWidthShift;
+                int txMax = tileBBoxi.SInt1 >> BufferGroup.TileWidthShift;
+                int tileRowIdx = (tileBBoxi.SInt2 >> BufferGroup.TileHeightShift) * NumTilesX;
+                int tileRowIdxEnd = (tileBBoxi.SInt3 >> BufferGroup.TileHeightShift) * NumTilesX;
+
+                // Pad bounding box to (8x4) subtiles. Skip SIMD lanes outside the subtile BB
+                v128 SimdSubTilePad = new v128(0, BufferGroup.SubTileWidth, 0, BufferGroup.SubTileHeight);
+                v128 SimdSubTilePadMask = new v128(
+                    ~(BufferGroup.SubTileWidth - 1),
+                    ~(BufferGroup.SubTileWidth - 1),
+                    ~(BufferGroup.SubTileHeight - 1),
+                    ~(BufferGroup.SubTileHeight - 1)
+                );
+                v128 subTileBBoxi = Arm.Neon.vandq_s8(Arm.Neon.vaddq_s32(pixelBBoxi, SimdSubTilePad), SimdSubTilePadMask);
+
+                v128 stxmin = new v128(subTileBBoxi.SInt0 - 1); // - 1 to be able to use GT test
+                v128 stymin = new v128(subTileBBoxi.SInt2 - 1); // - 1 to be able to use GT test
+                v128 stxmax = new v128(subTileBBoxi.SInt1);
+                v128 stymax = new v128(subTileBBoxi.SInt3);
+
+                // Setup pixel coordinates used to discard lanes outside subtile BB
+                v128 SimdSubTileColOffset = new v128(
+                    0,
+                    BufferGroup.SubTileWidth,
+                    BufferGroup.SubTileWidth * 2,
+                    BufferGroup.SubTileWidth * 3
+                );
+                v128 startPixelX = Arm.Neon.vaddq_s32(SimdSubTileColOffset, new v128(tileBBoxi.SInt0));
+                // TODO: (Apoorva) LHS is zero. We can just use the RHS directly.
+                v128 pixelY = Arm.Neon.vaddq_s32(zero, new v128(tileBBoxi.SInt2));
+
+                // Compute z from w. Note that z is reversed order, 0 = far, 1/near = near, which
+                // means we use a greater than test, so zMax is used to test for visibility. (z goes from 0 = far to 2 = near for ortho)
+
+                v128 zMax;
+                v128 wMin = new v128(wmin);
+                if (projectionType == BatchCullingProjectionType.Orthographic)
+                {
+                    zMax = Arm.Neon.vmlaq_f32(oneF, negOneF, wMin);
+                }
+                else
+                {
+                    zMax = Arm.Neon.vdivq_f32(oneF, wMin);
+                }
+
+                for (; tileRowIdx < tileRowIdxEnd; tileRowIdx += NumTilesX)
+                {
+                    v128 pixelX = startPixelX;
+
+                    for (int tx = txMin; tx < txMax; tx++)
                     {
-                        break;
+                        int tileIdx = tileRowIdx + tx;
+
+                        // Fetch zMin from masked hierarchical Z buffer
+                        v128 mask = tiles[tileIdx].mask;
+                        v128 zMin0 = IntrinsicUtils._vblendq_f32(Arm.Neon.vceqq_s32(mask, fullMask), tiles[tileIdx].zMin0, tiles[tileIdx].zMin1);
+                        v128 zMin1 = IntrinsicUtils._vblendq_f32(Arm.Neon.vceqq_s32(mask, zero), tiles[tileIdx].zMin1, tiles[tileIdx].zMin0);
+                        v128 zBuf = Arm.Neon.vminq_f32(zMin0, zMin1);
+
+                        // Perform conservative greater than test against hierarchical Z buffer (zMax >= zBuf means the subtile is visible)
+                        v128 zPass = Arm.Neon.vcgeq_f32(zMax, zBuf);  //zPass = zMax >= zBuf ? ~0 : 0
+
+                        // Mask out lanes corresponding to subtiles outside the bounding box
+                        v128 bboxTestMin = Arm.Neon.vandq_s8(Arm.Neon.vcgtq_s32(pixelX, stxmin), Arm.Neon.vcgtq_s32(pixelY, stymin));
+                        v128 bboxTestMax = Arm.Neon.vandq_s8(Arm.Neon.vcgtq_s32(stxmax, pixelX), Arm.Neon.vcgtq_s32(stymax, pixelY));
+                        v128 boxMask = Arm.Neon.vandq_s8(bboxTestMin, bboxTestMax);
+                        zPass = Arm.Neon.vandq_s8(zPass, boxMask);
+
+                        // If not all tiles failed the conservative z test we can immediately terminate the test
+                        v64 zTestResult = Arm.Neon.vqmovn_u64(zPass);
+                        if (zTestResult.ULong0 != 0ul)
+                        {
+                            return CullingResult.VISIBLE;
+                        }
+
+                        pixelX = Arm.Neon.vaddq_s32(pixelX, wideTileWidth);
                     }
 
-                    pixelY = X86.Sse2.add_epi32(pixelY, X86.Sse2.set1_epi32(BufferGroup.TileHeight));
+                    pixelY = Arm.Neon.vaddq_s32(pixelY, wideTileHeight);
                 }
 
                 return CullingResult.OCCLUDED;
             }
             else
-                throw new System.NotImplementedException();
+            {
+                return CullingResult.VISIBLE;
+            }
+        }
+
+        public static CullingResult TestRect(
+            float2 min,
+            float2 max,
+            float wmin,
+            Tile* tiles,
+            BatchCullingProjectionType projectionType,
+            int NumTilesX,
+            v128 ScreenSize,
+            v128 HalfSize,
+            v128 PixelCenter
+        )
+        {
+            if (min.x > 1.0f || min.y > 1.0f || max.x < -1.0f || max.y < -1.0f)
+            {
+                return CullingResult.VIEW_CULLED;
+            }
+
+            if (X86.Sse4_1.IsSse41Supported)
+            {
+                return TestRectSSE( min, max, wmin, tiles, projectionType, NumTilesX, ScreenSize, HalfSize, PixelCenter );
+            }
+            else if (Arm.Neon.IsNeonSupported)
+            {
+                return TestRectNEON( min, max, wmin, tiles, projectionType, NumTilesX, ScreenSize, HalfSize, PixelCenter );
+            }
+
+            return CullingResult.VISIBLE;
         }
     }
 }
diff --git a/Unity.Entities.Graphics/Occlusion/Masked/Visualization/DebugSettings.cs b/Unity.Entities.Graphics/Occlusion/Masked/Visualization/DebugSettings.cs
index f631866..5cd4129 100644
--- a/Unity.Entities.Graphics/Occlusion/Masked/Visualization/DebugSettings.cs
+++ b/Unity.Entities.Graphics/Occlusion/Masked/Visualization/DebugSettings.cs
@@ -1,4 +1,4 @@
-﻿#if ENABLE_UNITY_OCCLUSION && (HDRP_10_0_0_OR_NEWER || URP_10_0_0_OR_NEWER)
+#if ENABLE_UNITY_OCCLUSION && (HDRP_10_0_0_OR_NEWER || URP_10_0_0_OR_NEWER)
 // This class contains the debug settings exposed to the rendering debugger window
 using System;
 using System.Collections.Generic;
@@ -58,11 +58,19 @@ public DebugSettings()
 
         public void Register()
         {
+#if PLATFORM_ANDROID
+            // FK: No support for this feature on ARM platform with 32Bit since Neon Intrinsics aren't supported
+            // Yury: Android is the only 32-bit Arm platform we support
+            bool is32Bit = System.IntPtr.Size == 4;
+            if (is32Bit)
+            {
+                return;
+            }
+#endif
             var widgetList = new List<DebugUI.Widget>();
             widgetList.Add(new DebugUI.Container
             {
                 displayName = "Occlusion Culling",
-                flags = DebugUI.Flags.EditorOnly,
                 children =
                 {
                     new DebugUI.BoolField
diff --git a/Unity.Entities.Graphics/Occlusion/Masked/Visualization/DebugView.cs b/Unity.Entities.Graphics/Occlusion/Masked/Visualization/DebugView.cs
index 1175b0f..8884da0 100644
--- a/Unity.Entities.Graphics/Occlusion/Masked/Visualization/DebugView.cs
+++ b/Unity.Entities.Graphics/Occlusion/Masked/Visualization/DebugView.cs
@@ -132,24 +132,30 @@ public void RenderToTextures(
             EntityQuery testQuery,
             EntityQuery meshQuery,
             BufferGroup bufferGroup,
-            DebugRenderMode mode,
-            bool isOcclusionBrowseWindowVisible
+            DebugRenderMode mode
+#if UNITY_EDITOR
+            , bool isOcclusionBrowseWindowVisible
+#endif
         )
         {
-            if(AnyMeshOrMaterialNull())
+            if (AnyMeshOrMaterialNull())
             {
                 CreateMeshAndMaterials();
             }
             s_CmdLayers.Clear();
             // Write the CPU-rasterized depth buffer to a GPU texture, and then blit it to the overlay
             if (mode == DebugRenderMode.Depth ||
-                mode == DebugRenderMode.Test ||
-                isOcclusionBrowseWindowVisible)
+                mode == DebugRenderMode.Test
+#if UNITY_EDITOR
+                || isOcclusionBrowseWindowVisible
+#endif
+                )
             {
                 s_MaskedDepthToPixelDepth.Begin();
                 int width = bufferGroup.NumPixelsX;
                 int height = bufferGroup.NumPixelsY;
                 int numTilesX = bufferGroup.NumTilesX;
+                int numTilesY = bufferGroup.NumTilesY;
                 var job = new DecodeMaskedDepthJob()
                 {
                     // In
@@ -160,7 +166,7 @@ bool isOcclusionBrowseWindowVisible
                     // Out
                     DecodedZBuffer = m_CPUDepth,
                 };
-                job.Schedule((width * height), 64).Complete();
+                job.Schedule((numTilesX * numTilesY), 64).Complete();
 
                 gpuDepth.SetPixelData(m_CPUDepth, 0);
                 gpuDepth.Apply();
diff --git a/Unity.Entities.Graphics/Occlusion/Masked/Visualization/DecodeMaskedDepthJob.cs b/Unity.Entities.Graphics/Occlusion/Masked/Visualization/DecodeMaskedDepthJob.cs
index 1c913db..973ccbd 100644
--- a/Unity.Entities.Graphics/Occlusion/Masked/Visualization/DecodeMaskedDepthJob.cs
+++ b/Unity.Entities.Graphics/Occlusion/Masked/Visualization/DecodeMaskedDepthJob.cs
@@ -1,6 +1,7 @@
 #if ENABLE_UNITY_OCCLUSION && (HDRP_10_0_0_OR_NEWER || URP_10_0_0_OR_NEWER)
 
 using Unity.Burst;
+using Unity.Burst.Intrinsics;
 using Unity.Collections;
 using Unity.Collections.LowLevel.Unsafe;
 using Unity.Jobs;
@@ -18,34 +19,76 @@ unsafe struct DecodeMaskedDepthJob : IJobParallelFor
 
         [WriteOnly] public NativeArray<float> DecodedZBuffer;
 
+        // i => tile index
         public void Execute(int i)
         {
-            int x = i % NumPixelsX;
-            int y = NumPixelsY - i / NumPixelsX;
-
-            // Compute 32xN tile index (SIMD value offset)
-            int tx = x / BufferGroup.TileWidth;
-            int ty = y / BufferGroup.TileHeight;
-            int tileIdx = ty * NumTilesX + tx;
-
-            // Compute 8x4 subtile index (SIMD lane offset)
-            int stx = (x % BufferGroup.TileWidth) / BufferGroup.SubTileWidth;
-            int sty = (y % BufferGroup.TileHeight) / BufferGroup.SubTileHeight;
-            int subTileIdx = sty * 4 + stx;
-
-            // Compute pixel index in subtile (bit index in 32-bit word)
-            int px = (x % BufferGroup.SubTileWidth);
-            int py = (y % BufferGroup.SubTileHeight);
-            int bitIdx = py * 8 + px;
-
-            int pixelLayer = (IntrinsicUtils.getIntLane(Tiles[tileIdx].mask, (uint) subTileIdx) >>
-                              bitIdx) & 1;
-            float pixelDepth = IntrinsicUtils.getFloatLane(
-                pixelLayer == 0 ? Tiles[tileIdx].zMin0 : Tiles[tileIdx].zMin1,
-                (uint) subTileIdx
-            );
-
-            DecodedZBuffer[i] = pixelDepth;
+            float* zBuffer = (float*)DecodedZBuffer.GetUnsafePtr();
+
+            // this is a 32x4 tile
+            var tile = Tiles[i];
+
+            int numTilesX = NumPixelsX / BufferGroup.TileWidth;
+            int numTilesY = NumPixelsY / BufferGroup.TileHeight;
+
+            int tx = i % numTilesX;
+            int ty = i / numTilesX;
+
+            // iterate over the four 8x4 subtiles
+            for (int j = 0; j < 4; j++)
+            {
+                // prepare two vectors of zMin0 and zMin1
+                // splat j's element
+                var subTilez0 = new v128(IntrinsicUtils.getFloatLane(tile.zMin0, (uint)j));
+                var subTilez1 = new v128(IntrinsicUtils.getFloatLane(tile.zMin1, (uint)j));
+
+                var testMask = new v128(1, 2, 4, 8);
+
+                // the mask is 32 bit, 8x4 bits
+                // iterate over each byte
+                for (int k = 0; k < 4; k++)
+                {
+                    // extract mask for the subtile
+                    byte subTileMask = IntrinsicUtils.getByteLane(tile.mask, (uint)(j * 4 + k));
+
+                    // now, make low and high half-bytes into a int32x4 mask for blending
+                    // high
+                    int highHalfByte = subTileMask >> 4;
+                    var highMask = new v128(highHalfByte);
+                    // low
+                    int lowHalfByte = subTileMask & 15;
+                    var lowMask = new v128(lowHalfByte);
+
+                    if (Arm.Neon.IsNeonSupported)
+                    {
+                        var blendMaskHigh = Arm.Neon.vtstq_s32(highMask, testMask);
+                        var zResultHigh = Arm.Neon.vbslq_s8(blendMaskHigh, subTilez1, subTilez0);
+
+                        var blendMaskLow = Arm.Neon.vtstq_s32(lowMask, testMask);
+                        var zResultLow = Arm.Neon.vbslq_s8(blendMaskLow, subTilez1, subTilez0);
+
+                        int index = ((NumPixelsY - (BufferGroup.TileHeight * ty + k)) * NumPixelsX + BufferGroup.TileWidth * tx + BufferGroup.SubTileWidth * j);
+
+                        // save to DecodedZBuffer
+                        // this generates STP which is most efficient
+                        Arm.Neon.vst1q_f32(zBuffer + index, zResultLow);
+                        Arm.Neon.vst1q_f32(zBuffer + index + 4, zResultHigh);
+                    }
+                    else if (X86.Sse4_1.IsSse41Supported)
+                    {
+                        var invBlendMaskHigh = X86.Sse2.cmpeq_epi32(X86.Sse2.and_si128(highMask, testMask), X86.Sse2.setzero_si128());
+                        var zResultHigh = X86.Sse4_1.blendv_ps(subTilez1, subTilez0, invBlendMaskHigh);
+
+                        var invBlendMaskLow = X86.Sse2.cmpeq_epi32(X86.Sse2.and_si128(lowMask, testMask), X86.Sse2.setzero_si128());
+                        var zResultLow = X86.Sse4_1.blendv_ps(subTilez1, subTilez0, invBlendMaskLow);
+
+                        int index = ((NumPixelsY - (BufferGroup.TileHeight * ty + k)) * NumPixelsX + BufferGroup.TileWidth * tx + BufferGroup.SubTileWidth * j);
+
+                        v128* zBufferSimd = (v128*)zBuffer;
+                        zBufferSimd[index / 4] = zResultLow;
+                        zBufferSimd[index / 4 + 1] = zResultHigh;
+                    }
+                }
+            }
         }
     }
 }
diff --git a/Unity.Entities.Graphics/Occlusion/Occluder.cs b/Unity.Entities.Graphics/Occlusion/Occluder.cs
index d64f410..a5bb491 100644
--- a/Unity.Entities.Graphics/Occlusion/Occluder.cs
+++ b/Unity.Entities.Graphics/Occlusion/Occluder.cs
@@ -3,16 +3,31 @@
 
 namespace Unity.Rendering.Occlusion
 {
+    /// <summary>
+    /// Specifies which entities are occluders and configures occluder settings.
+    /// </summary>
     public class Occluder : MonoBehaviour
     {
+        /// <summary>
+        /// The mesh to use for occlusion culling calculations.
+        /// </summary>
         [FormerlySerializedAs("Mesh")] public Mesh mesh;
 
+        /// <summary>
+        /// The position offset to apply to the occluder mesh.
+        /// </summary>
         [FormerlySerializedAs("relativePosition")]
         public Vector3 localPosition = Vector3.zero;
 
+        /// <summary>
+        /// The rotation offset to apply to the occluder mesh.
+        /// </summary>
         [FormerlySerializedAs("relativeRotation")]
         public Quaternion localRotation = Quaternion.identity;
 
+        /// <summary>
+        /// The scale offset to apply to the occluder mesh.
+        /// </summary>
         [FormerlySerializedAs("relativeScale")]
         public Vector3 localScale = Vector3.one;
 
diff --git a/Unity.Entities.Graphics/Occlusion/OcclusionSortJob.cs b/Unity.Entities.Graphics/Occlusion/OcclusionSortJob.cs
index 0001517..150f0de 100644
--- a/Unity.Entities.Graphics/Occlusion/OcclusionSortJob.cs
+++ b/Unity.Entities.Graphics/Occlusion/OcclusionSortJob.cs
@@ -1,35 +1,16 @@
 #if ENABLE_UNITY_OCCLUSION && (HDRP_10_0_0_OR_NEWER || URP_10_0_0_OR_NEWER)
 
-using Unity.Burst;
-using Unity.Collections;
-using Unity.Jobs;
 using System.Collections.Generic;
 using Unity.Rendering.Occlusion.Masked;
 
 
 namespace Unity.Rendering.Occlusion
 {
-    [BurstCompile]
-    struct OcclusionSortMeshesJob : IJob
+    struct Compare : IComparer<ClippedOccluder>
     {
-        public NativeArray<ClippedOccluder> ClippedOccluders;
-
-
-        struct Compare : IComparer<ClippedOccluder>
-        {
-            int IComparer<ClippedOccluder>.Compare(ClippedOccluder x, ClippedOccluder y)
-            {
-                return x.screenMin.z.CompareTo(y.screenMin.z);
-            }
-        }
-
-        public void Execute()
+        int IComparer<ClippedOccluder>.Compare(ClippedOccluder x, ClippedOccluder y)
         {
-            if (ClippedOccluders.Length == 0)
-                return;
-
-            // TODO:  might want to do a proper parallel sort instead
-            ClippedOccluders.Sort(new Compare());
+            return x.screenMin.z.CompareTo(y.screenMin.z);
         }
     }
 }
diff --git a/Unity.Entities.Graphics/Occlusion/OcclusionView.cs b/Unity.Entities.Graphics/Occlusion/OcclusionView.cs
index 9763b1f..420fa3d 100644
--- a/Unity.Entities.Graphics/Occlusion/OcclusionView.cs
+++ b/Unity.Entities.Graphics/Occlusion/OcclusionView.cs
@@ -4,30 +4,58 @@
 
 namespace Unity.Rendering.Occlusion
 {
+    
+    /// <summary>
+    /// Represents occlusion view settings.
+    /// </summary>
     public struct OcclusionViewSettings
     {
+        /// <summary>
+        /// Indicates whether to process occlusion culling for the occlusion view.
+        /// </summary>
         public bool enabled;
+        /// <summary>
+        /// The width of the occlusion buffer.
+        /// </summary>
         public uint width;
+        /// <summary>
+        /// The height of the occlusion buffer.
+        /// </summary>
         public uint height;
     }
 
+    /// <summary>
+    /// Explicitly specifies which frustum views are occlusion views and configures occlusion view settings.
+    /// </summary>
     [ExecuteInEditMode]
     [DisallowMultipleComponent]
     public class OcclusionView : MonoBehaviour
     {
+        /// <summary>
+        /// Indicates whether to process occlusion culling for the attached frustum view.
+        /// </summary>
         public bool OcclusionEnabled = true;
 
+        /// <summary>
+        /// The width of the occlusion buffer.
+        /// </summary>
         public uint OcclusionBufferWidth = DefaultBufferSize;
 
+        /// <summary>
+        /// The height of the occlusion buffer.
+        /// </summary>
         public uint OcclusionBufferHeight = DefaultBufferSize;
 
+        /// <summary>
+        /// The default value for the occlusion buffer height and width.
+        /// </summary>
         public static readonly uint DefaultBufferSize = 512;
 
 #if ENABLE_UNITY_OCCLUSION && (HDRP_10_0_0_OR_NEWER || URP_10_0_0_OR_NEWER)
 
         void Update()
         {
-            if (World.DefaultGameObjectInjectionWorld == null || !OcclusionEnabled)
+            if (World.DefaultGameObjectInjectionWorld == null)
                 return;
 
             var entitiesGraphicsSystem = World.DefaultGameObjectInjectionWorld.GetOrCreateSystemManaged<EntitiesGraphicsSystem>();
diff --git a/Unity.Entities.Graphics/Occlusion/UnityOcclusion.cs b/Unity.Entities.Graphics/Occlusion/UnityOcclusion.cs
index b0b0ce1..9d989b7 100644
--- a/Unity.Entities.Graphics/Occlusion/UnityOcclusion.cs
+++ b/Unity.Entities.Graphics/Occlusion/UnityOcclusion.cs
@@ -1,5 +1,5 @@
 #if ENABLE_UNITY_OCCLUSION && (HDRP_10_0_0_OR_NEWER || URP_10_0_0_OR_NEWER)
-// #define WAIT_FOR_EACH_JOB // This is useful for profiling individual jobs, but should be commented out for performance
+//#define WAIT_FOR_EACH_JOB // This is useful for profiling individual jobs, but should be commented out for performance
 
 using Unity.Collections;
 using Unity.Collections.LowLevel.Unsafe;
@@ -27,6 +27,11 @@ unsafe class OcclusionCulling
         EntityQuery m_ReadonlyTestQuery;
         EntityQuery m_ReadonlyMeshQuery;
 
+        const int m_binSize  = 3 * 1024;
+        NativeArray<float> m_binTriangleX;
+        NativeArray<float> m_binTriangleY;
+        NativeArray<float> m_binTriangleW;
+
         static readonly ProfilerMarker s_Cull = new ProfilerMarker("Occlusion.Cull");
         static readonly ProfilerMarker s_SetResolution = new ProfilerMarker("Occlusion.Cull.SetResolution");
         static readonly ProfilerMarker s_Clear = new ProfilerMarker("Occlusion.Cull.Clear");
@@ -72,6 +77,12 @@ public void Create(EntityManager entityManager)
                     ComponentType.ReadOnly<EntitiesGraphicsChunkInfo>()
                 },
             });
+
+            // +1 because of main thread helping out...
+            int workerCount = JobsUtility.JobWorkerCount + 1;
+            m_binTriangleX = new NativeArray<float>( workerCount * m_binSize, Allocator.Persistent );
+            m_binTriangleY = new NativeArray<float>( workerCount * m_binSize, Allocator.Persistent );
+            m_binTriangleW = new NativeArray<float>( workerCount * m_binSize, Allocator.Persistent );
         }
 
         public void Dispose()
@@ -81,6 +92,10 @@ public void Dispose()
             {
                 bufferGroup.Dispose();
             }
+
+            m_binTriangleX.Dispose();
+            m_binTriangleY.Dispose();
+            m_binTriangleW.Dispose();
         }
 
         JobHandle CullView(
@@ -129,7 +144,12 @@ available worker threads */
                 numTotalIndices += meshes[i].indexCount;
             }
 
-            var transformedVerts = new NativeArray<float4>(maxVertsInMesh * JobsUtility.MaxJobThreadCount, Allocator.TempJob, NativeArrayOptions.UninitializedMemory);
+#if UNITY_2022_2_14F1_OR_NEWER
+            int maxThreadCount = JobsUtility.ThreadIndexCount;
+#else
+            int maxThreadCount = JobsUtility.MaxJobThreadCount;
+#endif
+            var transformedVerts = new NativeArray<float4>(maxVertsInMesh * maxThreadCount, Allocator.TempJob, NativeArrayOptions.UninitializedMemory);
             // Multiply index count by 6 because that's the most amount of points that can be generated during clipping
             var clippedVerts = new NativeArray<float3>(6 * numTotalIndices, Allocator.TempJob, NativeArrayOptions.UninitializedMemory);
             // Triangle min max contains the screen space aabb of the triangles to not recompute it for each bin in the
@@ -170,10 +190,9 @@ available worker threads */
             // TODO: Look at perf. Evaluate whether running this job is even worth it. It only takes 0.02ms in Viking Village,
             // which is why I haven't looked at it yet.
             s_SortMeshes.Begin();
-            var sortJob = new OcclusionSortMeshesJob
-            {
-                ClippedOccluders = clippedOccluders,
-            }.Schedule(transformJob);
+
+            var sortJob = clippedOccluders.SortJob(new Compare()).Schedule(transformJob);
+
 #if WAIT_FOR_EACH_JOB
             sortJob.Complete();
 #endif // WAIT_FOR_EACH_JOB
@@ -207,6 +226,7 @@ available worker threads */
                 const int TilesPerBinY = 4;//128 tiles per X axis, values can be 1 2 4 8 16 32 64 128
                 const int TilesPerBin = TilesPerBinX * TilesPerBinY;
                 int numBins = bufferGroup.NumTilesX * bufferGroup.NumTilesY / TilesPerBin;
+
                 rasterizeJob = new RasterizeJob
                 {
                     ClippedOccluders = clippedOccluders,
@@ -231,10 +251,15 @@ available worker threads */
                     TilesBasePtr = (Tile*) bufferGroup.Tiles.GetUnsafePtr(),
                     TilesPerBinX = TilesPerBinX,
                     TilesPerBinY = TilesPerBinY,
+                    BinTriangleXBasePtr = (float*)m_binTriangleX.GetUnsafePtr(),
+                    BinTriangleYBasePtr = (float*)m_binTriangleY.GetUnsafePtr(),
+                    BinTriangleWBasePtr = (float*)m_binTriangleW.GetUnsafePtr(),
+                    BinSize = m_binSize,
                 }.ScheduleParallel(numBins, 1, JobHandle.CombineDependencies(clearJob, sortJob));
     #if WAIT_FOR_EACH_JOB
                 rasterizeJob.Complete();
     #endif // WAIT_FOR_EACH_JOB
+
                 s_Rasterize.End();
             }else
             {
@@ -260,9 +285,7 @@ available worker threads */
                 ViewType = viewType,
                 SplitIndex = splitIndex,
                 Tiles = (Tile*)bufferGroup.Tiles.GetUnsafePtr(),
-#if UNITY_EDITOR
                 DisplayOnlyOccluded = InvertOcclusion,
-#endif
             }.ScheduleWithIndirectList(visibilityItems, 1, JobHandle.CombineDependencies(rasterizeJob, computeBoundsJob));
 #if WAIT_FOR_EACH_JOB
             testJob.Complete();
@@ -296,6 +319,15 @@ IndirectList<ChunkVisibilityItem> visibilityItems
 #endif
         )
         {
+#if PLATFORM_ANDROID
+            // FK: No support for this feature on ARM platform with 32Bit since Neon Intrinsics aren't supported
+            // Yury: Android is the only 32-bit Arm platform we support
+            bool is32Bit = System.IntPtr.Size == 4;
+            if (is32Bit)
+            {
+                return new JobHandle();
+            }
+#endif
             if (World.DefaultGameObjectInjectionWorld == null)
             {
                 return new JobHandle();
@@ -356,6 +388,9 @@ IndirectList<ChunkVisibilityItem> visibilityItems
                     s_SetResolution.End();
                 }
 
+                if (!bufferGroup.Enabled)
+                    continue;
+
                 bool invertOcclusion = debugSettings.debugRenderMode == DebugRenderMode.Inverted;
 
                 JobHandle viewJob = CullView(
@@ -410,24 +445,28 @@ internal void UpdateSettings(OcclusionView occlusionView)
 
                 if (!occlusionView.OcclusionEnabled)
                 {
-                    BufferGroups.Remove(viewID);
+                    bufferGroup.Enabled = false;
                 }
                 else
                 {
 #if UNITY_EDITOR
                     if (bufferGroup.NumPixelsX != occlusionView.OcclusionBufferWidth ||
-                        bufferGroup.NumPixelsY != occlusionView.OcclusionBufferHeight)
+                        bufferGroup.NumPixelsY != occlusionView.OcclusionBufferHeight ||
+                        bufferGroup.Enabled != occlusionView.OcclusionEnabled)
                     {
                         OcclusionBrowseWindow.Refresh();
                     }
 #endif
 
+                    bufferGroup.Enabled = true;
                     bufferGroup.SetResolutionAndClip(
                         (int)occlusionView.OcclusionBufferWidth,
                         (int)occlusionView.OcclusionBufferHeight,
                         bufferGroup.ProjectionType,
                         bufferGroup.NearClip);
                 }
+
+                BufferGroups[viewID] = bufferGroup;
             }
         }
 
diff --git a/Unity.Entities.Graphics/RenderFilterSettings.cs b/Unity.Entities.Graphics/RenderFilterSettings.cs
index 926f1ca..81048a0 100644
--- a/Unity.Entities.Graphics/RenderFilterSettings.cs
+++ b/Unity.Entities.Graphics/RenderFilterSettings.cs
@@ -87,7 +87,11 @@ public struct RenderFilterSettings : ISharedComponentData, IEquatable<RenderFilt
         public bool IsInMotionPass =>
             MotionMode != MotionVectorGenerationMode.Camera;
 
-        /// <inheritdoc/>
+        /// <summary>
+        /// Indicates whether the current instance is equal to the specified object.
+        /// </summary>
+        /// <param name="obj">The object to compare with the current instance.</param>
+        /// <returns>Returns true if the current instance is equal to the specified object. Otherwise, returns false.</returns>
         public override bool Equals(object obj)
         {
             if (obj is RenderFilterSettings)
@@ -96,13 +100,20 @@ public override bool Equals(object obj)
             return false;
         }
 
-        /// <inheritdoc/>
+        /// <summary>
+        /// Indicates whether the current instance is equal to the specified RenderFilterSettings.
+        /// </summary>
+        /// <param name="other">The RenderFilterSettings to compare with the current instance.</param>
+        /// <returns>Returns true if the current instance is equal to the specified RenderFilterSettings. Otherwise, returns false.</returns>
         public bool Equals(RenderFilterSettings other)
         {
             return Layer == other.Layer && RenderingLayerMask == other.RenderingLayerMask && MotionMode == other.MotionMode && ShadowCastingMode == other.ShadowCastingMode && ReceiveShadows == other.ReceiveShadows && StaticShadowCaster == other.StaticShadowCaster;
         }
 
-        /// <inheritdoc/>
+        /// <summary>
+        /// Calculates the hash code for this object.
+        /// </summary>
+        /// <returns>The hash code.</returns>
         public override int GetHashCode()
         {
             var hash = new xxHash3.StreamingState(true);
diff --git a/Unity.Entities.Graphics/RenderMeshArray.cs b/Unity.Entities.Graphics/RenderMeshArray.cs
index 4b5feaf..b158b5f 100644
--- a/Unity.Entities.Graphics/RenderMeshArray.cs
+++ b/Unity.Entities.Graphics/RenderMeshArray.cs
@@ -64,7 +64,7 @@ public static int ArrayIndexToStaticIndex(int index) => (index < 0)
         /// <param name="materialIndexInRenderMeshArray">The material index in <see cref="RenderMeshArray.Materials"/>.</param>
         /// <param name="meshIndexInRenderMeshArray">The mesh index in <see cref="RenderMeshArray.Meshes"/>.</param>
         /// <param name="submeshIndex">An optional submesh ID.</param>
-        /// <returns></returns>
+        /// <returns>Returns the MaterialMeshInfo instance that contains the material and mesh indices.</returns>
         public static MaterialMeshInfo FromRenderMeshArrayIndices(
             int materialIndexInRenderMeshArray,
             int meshIndexInRenderMeshArray,
@@ -89,7 +89,6 @@ private MaterialMeshInfo(int materialIndex, int meshIndex, sbyte submeshIndex =
         /// <param name="materialID">The material ID from <see cref="EntitiesGraphicsSystem.RegisterMaterial"/>.</param>
         /// <param name="meshID">The mesh ID from <see cref="EntitiesGraphicsSystem.RegisterMesh"/>.</param>
         /// <param name="submeshIndex">An optional submesh ID.</param>
-        /// <returns></returns>
         public MaterialMeshInfo(BatchMaterialID materialID, BatchMeshID meshID, sbyte submeshIndex = 0)
             : this((int)materialID.value, (int)meshID.value, submeshIndex)
         {}
@@ -422,13 +421,20 @@ public bool Equals(RenderMeshArray other)
             return math.all(GetHash128() == other.GetHash128());
         }
 
-        /// <inheritdoc/>
+        /// <summary>
+        /// Determines whether two object instances are equal based on their hashes.
+        /// </summary>
+        /// <param name="obj">The object to compare with the current object.</param>
+        /// <returns>Returns true if the specified object is equal to the current object. Otherwise, returns false.</returns>
         public override bool Equals(object obj)
         {
             return obj is RenderMeshArray other && Equals(other);
         }
 
-        /// <inheritdoc/>
+        /// <summary>
+        /// Calculates the hash code for this object.
+        /// </summary>
+        /// <returns>The hash code.</returns>
         public override int GetHashCode()
         {
             return (int) GetHash128().x;
diff --git a/Unity.Entities.Graphics/Resources/Occlusion/OcclusionDebugOccluders.shader b/Unity.Entities.Graphics/Resources/Occlusion/OcclusionDebugOccluders.shader
index d764145..69a6482 100644
--- a/Unity.Entities.Graphics/Resources/Occlusion/OcclusionDebugOccluders.shader
+++ b/Unity.Entities.Graphics/Resources/Occlusion/OcclusionDebugOccluders.shader
@@ -59,7 +59,6 @@ Shader "Hidden/OcclusionDebugOccluders"
 
             FrameOut frag(v2f i)
             {
-                UNITY_SETUP_INSTANCE_ID(i);
                 fixed4 col = i.color;
 
                 float2 dp = normalize(float2(ddx(i.vertex.z), ddy(i.vertex.z)));
diff --git a/Unity.Entities.Graphics/SparseUploader.cs b/Unity.Entities.Graphics/SparseUploader.cs
index 47f21c7..98ea6e5 100644
--- a/Unity.Entities.Graphics/SparseUploader.cs
+++ b/Unity.Entities.Graphics/SparseUploader.cs
@@ -112,6 +112,11 @@ public unsafe struct ThreadedSparseUploader
         // TODO: safety handle?
         [NativeDisableUnsafePtrRestriction] internal ThreadedSparseUploaderData* m_Data;
 
+        /// <summary>
+        /// Indicates whether the SparseUploader is valid and can be used.
+        /// </summary>
+        public bool IsValid => m_Data != null;
+
         private bool TryAlloc(int operationSize, int dataSize, out byte* ptr, out int operationOffset, out int dataOffset)
         {
             // Fetch current buffer and ensure we are not already out of GPU buffers to allocate from;
@@ -609,15 +614,18 @@ internal static int NumFramesInFlight
 
         private void RecoverBuffers()
         {
-            var numFree = 0;
+            int numFree = 0;
 
             // Count frames instead of using async readback to determine completion, because
             // using async readback prevents Unity from letting the device idle, which is really
             // bad for power usage.
             // Add 1 to the device frame count to account for two frames overlapping on
             // CPU side before reaching the GPU.
-            if (m_FrameData.Count > (NumFramesInFlight + 1))
-                numFree = 1;
+            int maxBufferedFrames = NumFramesInFlight + 1;
+
+            // If we have more buffered frames than the maximum, free all the excess
+            if (m_FrameData.Count > maxBufferedFrames)
+                numFree = m_FrameData.Count - maxBufferedFrames;
 
             for (int i = 0; i < numFree; ++i)
             {
@@ -722,7 +730,19 @@ private void StepFrame()
         /// <param name="tsu">The ThreadedSparseUploader to consume and process upload dispatches for. You must have created this with a call to SparseUploader.Begin.</param>
         public void EndAndCommit(ThreadedSparseUploader tsu)
         {
-            var numBuffers = m_ThreadData->m_NumBuffers;
+            // Enforce that EndAndCommit is only called with a valid ThreadedSparseUploader
+            if (!tsu.IsValid)
+            {
+                Debug.LogError("Invalid ThreadedSparseUploader passed to EndAndCommit");
+                return;
+            }
+
+            int numBuffers = m_ThreadData->m_NumBuffers;
+
+            // If there is no work for us to do, early out so we don't add empty entries into m_FrameData
+            if (numBuffers == 0 && !m_MappedBuffers.IsCreated)
+                return;
+
             var frameData = m_FreeFrameData.Count > 0 ? m_FreeFrameData.Pop() : new FrameData();
             for (int iBuf = 0; iBuf < numBuffers; ++iBuf)
             {
@@ -752,7 +772,6 @@ public void EndAndCommit(ThreadedSparseUploader tsu)
             if (m_MappedBuffers.IsCreated)
                 m_MappedBuffers.Dispose();
 
-
             StepFrame();
         }
 
diff --git a/Unity.Entities.Graphics/Unity.Entities.Graphics.asmdef b/Unity.Entities.Graphics/Unity.Entities.Graphics.asmdef
index 358bee7..74866a0 100644
--- a/Unity.Entities.Graphics/Unity.Entities.Graphics.asmdef
+++ b/Unity.Entities.Graphics/Unity.Entities.Graphics.asmdef
@@ -44,10 +44,10 @@
             "define": "SRP_10_0_0_OR_NEWER"
         },
         {
-            "name": "com.unity.tiny",
-            "expression": "0.21.9",
-            "define": "TINY_0_22_0_OR_NEWER"
+            "name": "Unity",
+            "expression": "2022.2.14f1",
+            "define": "UNITY_2022_2_14F1_OR_NEWER"
         }
     ],
     "noEngineReferences": false
-}
\ No newline at end of file
+}
diff --git a/Unity.Entities.Graphics/UpdateEntitiesGraphicsChunksStructure.cs b/Unity.Entities.Graphics/UpdateEntitiesGraphicsChunksStructure.cs
index 4bcd1a3..cff36ea 100644
--- a/Unity.Entities.Graphics/UpdateEntitiesGraphicsChunksStructure.cs
+++ b/Unity.Entities.Graphics/UpdateEntitiesGraphicsChunksStructure.cs
@@ -18,7 +18,9 @@ public partial class UpdateHybridChunksStructure : SystemBase
         private EntityQuery m_HasHybridChunkInfo;
 #endif
 
-        /// <inheritdoc/>
+        /// <summary>
+        /// Called when this system is created.
+        /// </summary>
         protected override void OnCreate()
         {
             m_MissingHybridChunkInfo = GetEntityQuery(new EntityQueryDesc
@@ -62,7 +64,9 @@ protected override void OnCreate()
 #endif
         }
 
-        /// <inheritdoc/>
+        /// <summary>
+        /// Called when this system is updated.
+        /// </summary>
         protected override void OnUpdate()
         {
             UnityEngine.Profiling.Profiler.BeginSample("UpdateHybridChunksStructure");
diff --git a/ValidationExceptions.json b/ValidationExceptions.json
new file mode 100644
index 0000000..211a00f
--- /dev/null
+++ b/ValidationExceptions.json
@@ -0,0 +1,10 @@
+{
+    "ErrorExceptions": [
+        {
+            "ValidationTest": "API Validation",
+            "ExceptionMessage": "Additions require a new minor or major version.",
+            "PackageVersion": "1.0.8"
+        }
+   ],
+    "WarningExceptions": []
+}
\ No newline at end of file
diff --git a/ValidationExceptions.json.meta b/ValidationExceptions.json.meta
new file mode 100644
index 0000000..de17440
--- /dev/null
+++ b/ValidationExceptions.json.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: a6ab760f2c623364898cd058997005f5
+TextScriptImporter:
+  externalObjects: {}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/package.json b/package.json
index f87eb09..d0bc213 100644
--- a/package.json
+++ b/package.json
@@ -1,12 +1,12 @@
 {
   "name": "com.unity.entities.graphics",
   "displayName": "Entities Graphics",
-  "version": "1.0.0-pre.65",
+  "version": "1.0.8",
   "unity": "2022.2",
-  "unityRelease": "2f1",
+  "unityRelease": "15f1",
   "description": "The Entities Graphics package provides systems and components for drawing meshes using DOTS, including support for instanced mesh rendering and LOD.",
   "dependencies": {
-    "com.unity.entities": "1.0.0-pre.65",
+    "com.unity.entities": "1.0.8",
     "com.unity.modules.particlesystem": "1.0.0",
     "com.unity.render-pipelines.core": "14.0.6"
   },
@@ -17,15 +17,15 @@
     "unity"
   ],
   "_upm": {
-    "changelog": "### Added\n\n* Burst Occlusion Culling occlusion browser tool\n\n### Changed\n\n* Disable Entities Graphics error message if there is no active SRP."
+    "changelog": "### Added\n\n* Support for CPU-based (Burst) masked occlusion culling on Neon processors.\n* Explicit usage of Burst's FMA optimization, parallel sort, and missing changes from previous review feedback of boc-neon branch\n\n### Changed\n\n* Greatly improved performance of CPU-based (Burst) masked occlusion culling.\n* Greatly improved performance of Depth and Test debug views used with CPU-based (Burst) masked occlusion culling.\n* Reduced the amount of memory allocated by allocating based on the maximum number of worker threads the running platform requires rather than defaulting to using a theoretical upper-bound of 128 worker threads.\n\n### Fixed\n\n* Entities Graphics Occlusion shader throws errors when building the project\n* Fixed a GraphicsBuffer leak that could occur in cases where Entities Graphics is running without any entities to render.\n* enabling/disabling per-view occlusion"
   },
   "upmCi": {
-    "footprint": "ccf6c26f75c90aeefbd73f32ae087fa743a9640a"
+    "footprint": "dd45e7aa3a48f1e92bc2e58d8028c2c946fbbd72"
   },
   "documentationUrl": "https://docs.unity3d.com/Packages/com.unity.entities.graphics@1.0/manual/index.html",
   "repository": {
     "url": "https://github.cds.internal.unity3d.com/unity/dots.git",
     "type": "git",
-    "revision": "85530c484ef7afedc5b128081152ec5e1f6f30e2"
+    "revision": "25dfb648cd0fabcf13fed46219e5fbe0fdabe4ac"
   }
 }