diff --git a/modules/host/kernel.nix b/modules/host/kernel.nix index 83d0d9ff7..444859458 100644 --- a/modules/host/kernel.nix +++ b/modules/host/kernel.nix @@ -6,10 +6,22 @@ pkgs, ... }: let - baseKernel = pkgs.linux_latest; - + baseKernel = + if hyp_cfg.enable + then + pkgs.linux_6_1.override { + argsOverride = rec { + src = pkgs.fetchurl { + url = "mirror://kernel/linux/kernel/v6.x/linux-${version}.tar.xz"; + hash = "sha256-qH4kHsFdU0UsTv4hlxOjdp2IzENrW5jPbvsmLEr/FcA="; + }; + version = "6.1.55"; + modDirVersion = "6.1.55"; + }; + } + else pkgs.linux_latest; hardened_kernel = pkgs.linuxManualConfig rec { - inherit (baseKernel) src modDirVersion; + inherit (baseKernel) src modDirVersion kernelPatches; version = "${baseKernel.version}-ghaf-hardened"; /* baseline "make tinyconfig" @@ -55,19 +67,42 @@ - also see https://github.com/NixOS/nixpkgs/issues/109280 for the context > */ + configfile = ./ghaf_host_hardened_baseline; allowImportFromDerivation = true; }; - cfg = config.ghaf.host.kernel_hardening; + pkvm_patch = lib.mkIf config.ghaf.hardware.x86_64.common.enable [ + { + name = "pkvm-patch"; + patch = ../virtualization/pkvm/0001-pkvm-enable-pkvm-on-intel-x86-6.1-lts.patch; + structuredExtraConfig = with lib.kernel; { + KVM_INTEL = yes; + KSM = no; + PKVM_INTEL = yes; + PKVM_INTEL_DEBUG = yes; + PKVM_GUEST = yes; + EARLY_PRINTK_USB_XDBC = yes; + RETPOLINE = yes; + }; + } + ]; + + kern_cfg = config.ghaf.host.kernel_hardening; + hyp_cfg = config.ghaf.host.hypervisor_hardening; in with lib; { options.ghaf.host.kernel_hardening = { enable = mkEnableOption "Host kernel hardening"; }; - config = mkIf cfg.enable { + options.ghaf.host.hypervisor_hardening = { + enable = mkEnableOption "Hypervisor hardening"; + }; + + config = mkIf kern_cfg.enable { boot.kernelPackages = pkgs.linuxPackagesFor hardened_kernel; + boot.kernelPatches = mkIf (hyp_cfg.enable && "${baseKernel.version}" == "6.1.55") pkvm_patch; # https://github.com/NixOS/nixpkgs/issues/109280#issuecomment-973636212 nixpkgs.overlays = [ (_final: prev: { diff --git a/modules/virtualization/pkvm/0001-pkvm-enable-pkvm-on-intel-x86-6.1-lts.patch b/modules/virtualization/pkvm/0001-pkvm-enable-pkvm-on-intel-x86-6.1-lts.patch new file mode 100644 index 000000000..b99602189 --- /dev/null +++ b/modules/virtualization/pkvm/0001-pkvm-enable-pkvm-on-intel-x86-6.1-lts.patch @@ -0,0 +1,21564 @@ +From 57625c591800467ae5eeabbeba25c42121310c7e Mon Sep 17 00:00:00 2001 +From: Kalle Marjamaki +Date: Thu, 28 Sep 2023 13:37:09 +0300 +Subject: [PATCH] pkvm: enable pkvm on intel x86, 6.1 lts + +Signed-off-by: Kalle Marjamaki +Signed-off-by: Janne Karhunen +--- + arch/arm64/include/asm/kvm_host.h | 8 +- + arch/arm64/include/asm/kvm_pkvm.h | 31 +- + .../asm/pkvm_spinlock.h} | 31 +- + arch/arm64/kvm/Makefile | 3 + + arch/arm64/kvm/arm.c | 8 +- + arch/arm64/kvm/hyp/hyp-constants.c | 4 +- + arch/arm64/kvm/hyp/include/nvhe/gfp.h | 34 - + arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 4 +- + arch/arm64/kvm/hyp/include/nvhe/memory.h | 48 - + arch/arm64/kvm/hyp/include/nvhe/mm.h | 10 +- + arch/arm64/kvm/hyp/nvhe/Makefile | 4 +- + arch/arm64/kvm/hyp/nvhe/early_alloc.c | 2 +- + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 54 +- + arch/arm64/kvm/hyp/nvhe/mm.c | 30 +- + arch/arm64/kvm/hyp/nvhe/psci-relay.c | 2 +- + arch/arm64/kvm/hyp/nvhe/setup.c | 28 +- + arch/arm64/kvm/pkvm.c | 87 +- + arch/arm64/mm/init.c | 2 +- + arch/x86/Kconfig | 11 + + arch/x86/coco/Makefile | 3 +- + arch/x86/coco/core.c | 34 + + arch/x86/coco/pkvm/Makefile | 3 + + arch/x86/coco/pkvm/pkvm.c | 113 + + 
arch/x86/coco/pkvm/pkvmcall.S | 42 + + arch/x86/coco/tdx/tdx.c | 115 +- + arch/x86/coco/virt_exception.c | 126 + + arch/x86/include/asm/coco.h | 1 + + arch/x86/include/asm/hypervisor.h | 2 + + arch/x86/include/asm/idtentry.h | 2 +- + arch/x86/include/asm/kvm-x86-ops.h | 2 + + arch/x86/include/asm/kvm_host.h | 18 + + arch/x86/include/asm/kvm_pkvm.h | 250 + + arch/x86/include/asm/pkvm.h | 151 + + arch/x86/include/asm/pkvm_image.h | 48 + + arch/x86/include/asm/pkvm_image_vars.h | 23 + + arch/x86/include/asm/pkvm_spinlock.h | 62 + + arch/x86/include/asm/tdx.h | 19 +- + arch/x86/include/asm/virt_exception.h | 41 + + arch/x86/include/asm/vmx.h | 7 + + arch/x86/include/uapi/asm/kvm.h | 3 + + arch/x86/kernel/cpu/Makefile | 1 + + arch/x86/kernel/cpu/hypervisor.c | 3 + + arch/x86/kernel/cpu/pkvm.c | 33 + + arch/x86/kernel/idt.c | 2 +- + arch/x86/kernel/setup.c | 3 + + arch/x86/kernel/traps.c | 9 +- + arch/x86/kernel/vmlinux.lds.S | 37 + + arch/x86/kvm/Kconfig | 24 + + arch/x86/kvm/Makefile | 1 + + arch/x86/kvm/mmu.h | 16 + + arch/x86/kvm/mmu/mmu.c | 47 +- + arch/x86/kvm/mmu/paging_tmpl.h | 3 +- + arch/x86/kvm/mmu/spte.h | 1 + + arch/x86/kvm/mmu/tdp_mmu.c | 7 +- + arch/x86/kvm/svm/svm.c | 6 + + arch/x86/kvm/vmx/pkvm/.gitignore | 1 + + arch/x86/kvm/vmx/pkvm/Makefile | 29 + + arch/x86/kvm/vmx/pkvm/hyp/Makefile | 79 + + arch/x86/kvm/vmx/pkvm/hyp/bug.h | 23 + + arch/x86/kvm/vmx/pkvm/hyp/cpu.h | 53 + + arch/x86/kvm/vmx/pkvm/hyp/debug.h | 20 + + arch/x86/kvm/vmx/pkvm/hyp/early_alloc.c | 76 + + arch/x86/kvm/vmx/pkvm/hyp/early_alloc.h | 15 + + arch/x86/kvm/vmx/pkvm/hyp/ept.c | 1066 ++++ + arch/x86/kvm/vmx/pkvm/hyp/ept.h | 70 + + arch/x86/kvm/vmx/pkvm/hyp/idt.S | 67 + + arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c | 371 ++ + arch/x86/kvm/vmx/pkvm/hyp/io.h | 82 + + arch/x86/kvm/vmx/pkvm/hyp/io_emulate.c | 374 ++ + arch/x86/kvm/vmx/pkvm/hyp/io_emulate.h | 67 + + arch/x86/kvm/vmx/pkvm/hyp/iommu.c | 2372 ++++++++ + arch/x86/kvm/vmx/pkvm/hyp/iommu.h | 16 + + arch/x86/kvm/vmx/pkvm/hyp/iommu_debug.c | 199 + + arch/x86/kvm/vmx/pkvm/hyp/iommu_internal.h | 347 ++ + arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.c | 106 + + arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.h | 19 + + arch/x86/kvm/vmx/pkvm/hyp/irq.c | 60 + + arch/x86/kvm/vmx/pkvm/hyp/lapic.c | 222 + + arch/x86/kvm/vmx/pkvm/hyp/lapic.h | 12 + + arch/x86/kvm/vmx/pkvm/hyp/lib/list_debug.c | 16 + + arch/x86/kvm/vmx/pkvm/hyp/lib/memcpy_64.S | 26 + + arch/x86/kvm/vmx/pkvm/hyp/lib/memset_64.S | 24 + + arch/x86/kvm/vmx/pkvm/hyp/lib/retpoline.S | 115 + + arch/x86/kvm/vmx/pkvm/hyp/mem_protect.c | 1013 ++++ + arch/x86/kvm/vmx/pkvm/hyp/mem_protect.h | 205 + + arch/x86/kvm/vmx/pkvm/hyp/memory.c | 363 ++ + arch/x86/kvm/vmx/pkvm/hyp/memory.h | 51 + + arch/x86/kvm/vmx/pkvm/hyp/mmu.c | 258 + + arch/x86/kvm/vmx/pkvm/hyp/mmu.h | 28 + + arch/x86/kvm/vmx/pkvm/hyp/nested.c | 1485 +++++ + arch/x86/kvm/vmx/pkvm/hyp/nested.h | 32 + + arch/x86/kvm/vmx/pkvm/hyp/pci.c | 350 ++ + arch/x86/kvm/vmx/pkvm/hyp/pci.h | 24 + + arch/x86/kvm/vmx/pkvm/hyp/pgtable.c | 801 +++ + arch/x86/kvm/vmx/pkvm/hyp/pgtable.h | 155 + + arch/x86/kvm/vmx/pkvm/hyp/pkvm.c | 470 ++ + arch/x86/kvm/vmx/pkvm/hyp/pkvm.lds.S | 10 + + arch/x86/kvm/vmx/pkvm/hyp/pkvm_hyp.h | 187 + + .../vmx/pkvm/hyp/pkvm_nested_vmcs_fields.h | 191 + + arch/x86/kvm/vmx/pkvm/hyp/ptdev.c | 213 + + arch/x86/kvm/vmx/pkvm/hyp/ptdev.h | 53 + + arch/x86/kvm/vmx/pkvm/hyp/trace.c | 117 + + arch/x86/kvm/vmx/pkvm/hyp/trace.h | 15 + + arch/x86/kvm/vmx/pkvm/hyp/vmexit.c | 360 ++ + arch/x86/kvm/vmx/pkvm/hyp/vmexit.h | 11 + + arch/x86/kvm/vmx/pkvm/hyp/vmsr.c | 120 + + 
arch/x86/kvm/vmx/pkvm/hyp/vmsr.h | 11 + + arch/x86/kvm/vmx/pkvm/hyp/vmx.c | 79 + + arch/x86/kvm/vmx/pkvm/hyp/vmx.h | 63 + + arch/x86/kvm/vmx/pkvm/hyp/vmx_asm.S | 186 + + arch/x86/kvm/vmx/pkvm/hyp/vmx_ops.h | 173 + + arch/x86/kvm/vmx/pkvm/include/capabilities.h | 95 + + arch/x86/kvm/vmx/pkvm/include/pkvm.h | 155 + + arch/x86/kvm/vmx/pkvm/include/pkvm_trace.h | 29 + + arch/x86/kvm/vmx/pkvm/pkvm_constants.c | 26 + + arch/x86/kvm/vmx/pkvm/pkvm_constants.h | 21 + + arch/x86/kvm/vmx/pkvm/pkvm_debugfs.c | 204 + + arch/x86/kvm/vmx/pkvm/pkvm_host.c | 1300 +++++ + arch/x86/kvm/vmx/vmcs12.c | 6 + + arch/x86/kvm/vmx/vmcs12.h | 16 +- + arch/x86/kvm/vmx/vmx.c | 259 +- + arch/x86/kvm/vmx/vmx_lib.h | 241 + + arch/x86/kvm/vmx/vmx_ops.h | 19 +- + arch/x86/kvm/x86.c | 60 +- + arch/x86/mm/pat/set_memory.c | 4 + + drivers/iommu/intel/debugfs.c | 14 +- + drivers/iommu/intel/dmar.c | 108 +- + drivers/iommu/intel/iommu.c | 60 +- + drivers/iommu/intel/iommu.h | 18 +- + drivers/iommu/intel/irq_remapping.c | 24 +- + drivers/iommu/intel/pasid.c | 4 +- + drivers/iommu/intel/svm.c | 34 +- + include/asm-generic/vmlinux.lds.h | 16 + + include/linux/intel-iommu.h | 863 +++ + include/linux/kvm_host.h | 34 +- + include/linux/kvm_types.h | 1 + + include/uapi/linux/kvm.h | 1 + + include/uapi/linux/kvm_para.h | 11 + + tools/arch/x86/include/uapi/asm/kvm.h | 3 + + tools/include/uapi/linux/kvm.h | 1 + + virt/kvm/kvm_main.c | 35 +- + virt/kvm/pkvm/buddy_memory.h | 36 + + virt/kvm/pkvm/gfp.h | 35 + + .../hyp/nvhe => virt/kvm/pkvm}/page_alloc.c | 135 +- + virt/kvm/pkvm/pkvm.c | 85 + + virt/kvm/pkvm/pkvm_spinlock.h | 47 + + virt/kvm/vfio.c | 13 + + 150 files changed, 28003 insertions(+), 808 deletions(-) + rename arch/arm64/{kvm/hyp/include/nvhe/spinlock.h => include/asm/pkvm_spinlock.h} (73%) + delete mode 100644 arch/arm64/kvm/hyp/include/nvhe/gfp.h + delete mode 100644 arch/arm64/kvm/hyp/include/nvhe/memory.h + create mode 100644 arch/x86/coco/pkvm/Makefile + create mode 100644 arch/x86/coco/pkvm/pkvm.c + create mode 100644 arch/x86/coco/pkvm/pkvmcall.S + create mode 100644 arch/x86/coco/virt_exception.c + create mode 100644 arch/x86/include/asm/kvm_pkvm.h + create mode 100644 arch/x86/include/asm/pkvm.h + create mode 100644 arch/x86/include/asm/pkvm_image.h + create mode 100644 arch/x86/include/asm/pkvm_image_vars.h + create mode 100644 arch/x86/include/asm/pkvm_spinlock.h + create mode 100644 arch/x86/include/asm/virt_exception.h + create mode 100644 arch/x86/kernel/cpu/pkvm.c + create mode 100644 arch/x86/kvm/vmx/pkvm/.gitignore + create mode 100644 arch/x86/kvm/vmx/pkvm/Makefile + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/Makefile + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/bug.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/cpu.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/debug.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/early_alloc.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/early_alloc.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/ept.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/ept.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/idt.S + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/io.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/io_emulate.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/io_emulate.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/iommu.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/iommu.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/iommu_debug.c + create mode 100644 
arch/x86/kvm/vmx/pkvm/hyp/iommu_internal.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/irq.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/lapic.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/lapic.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/lib/list_debug.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/lib/memcpy_64.S + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/lib/memset_64.S + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/lib/retpoline.S + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/mem_protect.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/mem_protect.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/memory.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/memory.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/mmu.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/mmu.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/nested.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/nested.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/pci.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/pci.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/pgtable.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/pgtable.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/pkvm.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/pkvm.lds.S + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/pkvm_hyp.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/pkvm_nested_vmcs_fields.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/ptdev.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/ptdev.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/trace.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/trace.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/vmexit.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/vmexit.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/vmsr.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/vmsr.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/vmx.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/vmx.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/vmx_asm.S + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/vmx_ops.h + create mode 100644 arch/x86/kvm/vmx/pkvm/include/capabilities.h + create mode 100644 arch/x86/kvm/vmx/pkvm/include/pkvm.h + create mode 100644 arch/x86/kvm/vmx/pkvm/include/pkvm_trace.h + create mode 100644 arch/x86/kvm/vmx/pkvm/pkvm_constants.c + create mode 100644 arch/x86/kvm/vmx/pkvm/pkvm_constants.h + create mode 100644 arch/x86/kvm/vmx/pkvm/pkvm_debugfs.c + create mode 100644 arch/x86/kvm/vmx/pkvm/pkvm_host.c + create mode 100644 arch/x86/kvm/vmx/vmx_lib.h + create mode 100644 include/linux/intel-iommu.h + create mode 100644 virt/kvm/pkvm/buddy_memory.h + create mode 100644 virt/kvm/pkvm/gfp.h + rename {arch/arm64/kvm/hyp/nvhe => virt/kvm/pkvm}/page_alloc.c (56%) + create mode 100644 virt/kvm/pkvm/pkvm.c + create mode 100644 virt/kvm/pkvm/pkvm_spinlock.h + +diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h +index 577cf444c113..77f906dcbd50 100644 +--- a/arch/arm64/include/asm/kvm_host.h ++++ b/arch/arm64/include/asm/kvm_host.h +@@ -962,11 +962,11 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); + + int kvm_trng_call(struct kvm_vcpu *vcpu); + #ifdef CONFIG_KVM +-extern phys_addr_t hyp_mem_base; +-extern phys_addr_t hyp_mem_size; +-void __init kvm_hyp_reserve(void); ++extern phys_addr_t pkvm_mem_base; ++extern phys_addr_t pkvm_mem_size; ++void __init pkvm_reserve(void); + #else +-static inline void kvm_hyp_reserve(void) { } 
++static inline void pkvm_reserve(void) { } + #endif + + void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu); +diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h +index 9f4ad2a8df59..6ea44fde0672 100644 +--- a/arch/arm64/include/asm/kvm_pkvm.h ++++ b/arch/arm64/include/asm/kvm_pkvm.h +@@ -8,11 +8,32 @@ + + #include + #include ++#include + +-#define HYP_MEMBLOCK_REGIONS 128 ++#define PKVM_MEMBLOCK_REGIONS 128 + +-extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; +-extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); ++#define __hyp_va(phys) ((void *)((phys_addr_t)(phys) - hyp_physvirt_offset)) ++ ++static inline void *hyp_phys_to_virt(phys_addr_t phys) ++{ ++ return __hyp_va(phys); ++} ++ ++static inline phys_addr_t hyp_virt_to_phys(void *addr) ++{ ++ return __hyp_pa(addr); ++} ++ ++#define __pkvm_pa __hyp_pa ++#define __pkvm_va __hyp_va ++ ++#define pkvm_sym kvm_nvhe_sym ++ ++extern struct memblock_region kvm_nvhe_sym(pkvm_memory)[]; ++extern unsigned int kvm_nvhe_sym(pkvm_memblock_nr); ++ ++int pkvm_pre_reserve_check(void); ++u64 pkvm_total_reserve_pages(void); + + static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) + { +@@ -32,8 +53,8 @@ static inline unsigned long __hyp_pgtable_total_pages(void) + unsigned long res = 0, i; + + /* Cover all of memory with page-granularity */ +- for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) { +- struct memblock_region *reg = &kvm_nvhe_sym(hyp_memory)[i]; ++ for (i = 0; i < kvm_nvhe_sym(pkvm_memblock_nr); i++) { ++ struct memblock_region *reg = &kvm_nvhe_sym(pkvm_memory)[i]; + res += __hyp_pgtable_max_pages(reg->size >> PAGE_SHIFT); + } + +diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/include/asm/pkvm_spinlock.h +similarity index 73% +rename from arch/arm64/kvm/hyp/include/nvhe/spinlock.h +rename to arch/arm64/include/asm/pkvm_spinlock.h +index 4652fd04bdbe..21f204f7c9c5 100644 +--- a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h ++++ b/arch/arm64/include/asm/pkvm_spinlock.h +@@ -10,14 +10,14 @@ + * Copyright (C) 2012 ARM Ltd. + */ + +-#ifndef __ARM64_KVM_NVHE_SPINLOCK_H__ +-#define __ARM64_KVM_NVHE_SPINLOCK_H__ ++#ifndef __ASM_ARM64_PKVM_SPINLOCK_H__ ++#define __ASM_ARM64_PKVM_SPINLOCK_H__ + + #include + #include + #include + +-typedef union hyp_spinlock { ++typedef union arch_pkvm_spinlock { + u32 __val; + struct { + #ifdef __AARCH64EB__ +@@ -26,17 +26,14 @@ typedef union hyp_spinlock { + u16 owner, next; + #endif + }; +-} hyp_spinlock_t; ++} arch_pkvm_spinlock_t; + +-#define hyp_spin_lock_init(l) \ +-do { \ +- *(l) = (hyp_spinlock_t){ .__val = 0 }; \ +-} while (0) ++#define __ARCH_PKVM_SPINLOCK_UNLOCKED { 0 } + +-static inline void hyp_spin_lock(hyp_spinlock_t *lock) ++static inline void arch_pkvm_spin_lock(arch_pkvm_spinlock_t *lock) + { + u32 tmp; +- hyp_spinlock_t lockval, newval; ++ arch_pkvm_spinlock_t lockval, newval; + + asm volatile( + /* Atomically increment the next ticket. 
*/ +@@ -71,7 +68,7 @@ static inline void hyp_spin_lock(hyp_spinlock_t *lock) + : "memory"); + } + +-static inline void hyp_spin_unlock(hyp_spinlock_t *lock) ++static inline void arch_pkvm_spin_unlock(arch_pkvm_spinlock_t *lock) + { + u64 tmp; + +@@ -90,15 +87,15 @@ static inline void hyp_spin_unlock(hyp_spinlock_t *lock) + : "memory"); + } + +-static inline bool hyp_spin_is_locked(hyp_spinlock_t *lock) ++static inline bool arch_pkvm_spin_is_locked(arch_pkvm_spinlock_t *lock) + { +- hyp_spinlock_t lockval = READ_ONCE(*lock); ++ arch_pkvm_spinlock_t lockval = READ_ONCE(*lock); + + return lockval.owner != lockval.next; + } + + #ifdef CONFIG_NVHE_EL2_DEBUG +-static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) ++static inline void arch_pkvm_assert_lock_held(arch_pkvm_spinlock_t *lock) + { + /* + * The __pkvm_init() path accesses protected data-structures without +@@ -108,10 +105,10 @@ static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) + * wait until it is set before checking the lock state. + */ + if (static_branch_likely(&kvm_protected_mode_initialized)) +- BUG_ON(!hyp_spin_is_locked(lock)); ++ BUG_ON(!arch_pkvm_spin_is_locked(lock)); + } + #else +-static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) { } ++static inline void arch_pkvm_assert_lock_held(arch_pkvm_spinlock_t *lock) { } + #endif + +-#endif /* __ARM64_KVM_NVHE_SPINLOCK_H__ */ ++#endif /* __ASM_ARM64_PKVM_SPINLOCK_H__ */ +diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile +index 5e33c2d4645a..9691fd90de6b 100644 +--- a/arch/arm64/kvm/Makefile ++++ b/arch/arm64/kvm/Makefile +@@ -22,6 +22,8 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ + vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \ + vgic/vgic-its.o vgic/vgic-debug.o + ++kvm-y += ../../../virt/kvm/pkvm/pkvm.o ++ + kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o + + always-y := hyp_constants.h hyp-constants.s +@@ -31,6 +33,7 @@ define rule_gen_hyp_constants + endef + + CFLAGS_hyp-constants.o = -I $(srctree)/$(src)/hyp/include ++CFLAGS_hyp-constants.o += -I $(srctree)/virt/kvm/pkvm + $(obj)/hyp-constants.s: $(src)/hyp/hyp-constants.c FORCE + $(call if_changed_dep,cc_s_c) + +diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c +index 6cc380a15eb7..720961355dcc 100644 +--- a/arch/arm64/kvm/arm.c ++++ b/arch/arm64/kvm/arm.c +@@ -1888,7 +1888,7 @@ static int do_pkvm_init(u32 hyp_va_bits) + + preempt_disable(); + cpu_hyp_init_context(); +- ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size, ++ ret = kvm_call_hyp_nvhe(__pkvm_init, pkvm_mem_base, pkvm_mem_size, + num_possible_cpus(), kern_hyp_va(per_cpu_base), + hyp_va_bits); + cpu_hyp_init_features(); +@@ -1941,10 +1941,10 @@ static void kvm_hyp_init_symbols(void) + + static int kvm_hyp_init_protection(u32 hyp_va_bits) + { +- void *addr = phys_to_virt(hyp_mem_base); ++ void *addr = phys_to_virt(pkvm_mem_base); + int ret; + +- ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP); ++ ret = create_hyp_mappings(addr, addr + pkvm_mem_size, PAGE_HYP); + if (ret) + return ret; + +@@ -1970,7 +1970,7 @@ static int init_hyp_mode(void) + * The protected Hyp-mode cannot be initialized if the memory pool + * allocation has failed. 
+ */ +- if (is_protected_kvm_enabled() && !hyp_mem_base) ++ if (is_protected_kvm_enabled() && !pkvm_mem_base) + goto out_err; + + /* +diff --git a/arch/arm64/kvm/hyp/hyp-constants.c b/arch/arm64/kvm/hyp/hyp-constants.c +index b3742a6691e8..3e604ce08796 100644 +--- a/arch/arm64/kvm/hyp/hyp-constants.c ++++ b/arch/arm64/kvm/hyp/hyp-constants.c +@@ -1,10 +1,10 @@ + // SPDX-License-Identifier: GPL-2.0-only + + #include +-#include ++#include + + int main(void) + { +- DEFINE(STRUCT_HYP_PAGE_SIZE, sizeof(struct hyp_page)); ++ DEFINE(STRUCT_HYP_PAGE_SIZE, sizeof(struct pkvm_page)); + return 0; + } +diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h +deleted file mode 100644 +index 0a048dc06a7d..000000000000 +--- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h ++++ /dev/null +@@ -1,34 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0-only */ +-#ifndef __KVM_HYP_GFP_H +-#define __KVM_HYP_GFP_H +- +-#include +- +-#include +-#include +- +-#define HYP_NO_ORDER USHRT_MAX +- +-struct hyp_pool { +- /* +- * Spinlock protecting concurrent changes to the memory pool as well as +- * the struct hyp_page of the pool's pages until we have a proper atomic +- * API at EL2. +- */ +- hyp_spinlock_t lock; +- struct list_head free_area[MAX_ORDER]; +- phys_addr_t range_start; +- phys_addr_t range_end; +- unsigned short max_order; +-}; +- +-/* Allocation */ +-void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order); +-void hyp_split_page(struct hyp_page *page); +-void hyp_get_page(struct hyp_pool *pool, void *addr); +-void hyp_put_page(struct hyp_pool *pool, void *addr); +- +-/* Used pages cannot be freed */ +-int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, +- unsigned int reserved_pages); +-#endif /* __KVM_HYP_GFP_H */ +diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +index 80e99836eac7..6ea3f31e7741 100644 +--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h ++++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +@@ -10,7 +10,7 @@ + #include + #include + #include +-#include ++#include + + /* + * SW bits 0-1 are reserved to track the memory ownership state of each page: +@@ -47,7 +47,7 @@ struct host_kvm { + struct kvm_arch arch; + struct kvm_pgtable pgt; + struct kvm_pgtable_mm_ops mm_ops; +- hyp_spinlock_t lock; ++ pkvm_spinlock_t lock; + }; + extern struct host_kvm host_kvm; + +diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h +deleted file mode 100644 +index 592b7edb3edb..000000000000 +--- a/arch/arm64/kvm/hyp/include/nvhe/memory.h ++++ /dev/null +@@ -1,48 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0-only */ +-#ifndef __KVM_HYP_MEMORY_H +-#define __KVM_HYP_MEMORY_H +- +-#include +-#include +- +-#include +- +-struct hyp_page { +- unsigned short refcount; +- unsigned short order; +-}; +- +-extern u64 __hyp_vmemmap; +-#define hyp_vmemmap ((struct hyp_page *)__hyp_vmemmap) +- +-#define __hyp_va(phys) ((void *)((phys_addr_t)(phys) - hyp_physvirt_offset)) +- +-static inline void *hyp_phys_to_virt(phys_addr_t phys) +-{ +- return __hyp_va(phys); +-} +- +-static inline phys_addr_t hyp_virt_to_phys(void *addr) +-{ +- return __hyp_pa(addr); +-} +- +-#define hyp_phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) +-#define hyp_pfn_to_phys(pfn) ((phys_addr_t)((pfn) << PAGE_SHIFT)) +-#define hyp_phys_to_page(phys) (&hyp_vmemmap[hyp_phys_to_pfn(phys)]) +-#define hyp_virt_to_page(virt) hyp_phys_to_page(__hyp_pa(virt)) +-#define hyp_virt_to_pfn(virt) 
hyp_phys_to_pfn(__hyp_pa(virt)) +- +-#define hyp_page_to_pfn(page) ((struct hyp_page *)(page) - hyp_vmemmap) +-#define hyp_page_to_phys(page) hyp_pfn_to_phys((hyp_page_to_pfn(page))) +-#define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page)) +-#define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool) +- +-static inline int hyp_page_count(void *addr) +-{ +- struct hyp_page *p = hyp_virt_to_page(addr); +- +- return p->refcount; +-} +- +-#endif /* __KVM_HYP_MEMORY_H */ +diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h +index 42d8eb9bfe72..9a18d3c1d6f1 100644 +--- a/arch/arm64/kvm/hyp/include/nvhe/mm.h ++++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h +@@ -7,11 +7,11 @@ + #include + #include + +-#include +-#include ++#include ++#include + + extern struct kvm_pgtable pkvm_pgtable; +-extern hyp_spinlock_t pkvm_pgd_lock; ++extern pkvm_spinlock_t pkvm_pgd_lock; + + int hyp_create_idmap(u32 hyp_va_bits); + int hyp_map_vectors(void); +@@ -28,10 +28,10 @@ static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size, + unsigned long *start, unsigned long *end) + { + unsigned long nr_pages = size >> PAGE_SHIFT; +- struct hyp_page *p = hyp_phys_to_page(phys); ++ struct pkvm_page *p = pkvm_phys_to_page(phys); + + *start = (unsigned long)p; +- *end = *start + nr_pages * sizeof(struct hyp_page); ++ *end = *start + nr_pages * sizeof(struct pkvm_page); + *start = ALIGN_DOWN(*start, PAGE_SIZE); + *end = ALIGN(*end, PAGE_SIZE); + } +diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile +index be0a2bc3e20d..8e246861616b 100644 +--- a/arch/arm64/kvm/hyp/nvhe/Makefile ++++ b/arch/arm64/kvm/hyp/nvhe/Makefile +@@ -10,6 +10,7 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS + # will explode instantly (Words of Marc Zyngier). So introduce a generic flag + # __DISABLE_TRACE_MMIO__ to disable MMIO tracing for nVHE KVM. 
+ ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__ ++ccflags-y += -I $(srctree)/virt/kvm/pkvm + ccflags-y += -fno-stack-protector \ + -DDISABLE_BRANCH_PROFILING \ + $(DISABLE_STACKLEAK_PLUGIN) +@@ -21,10 +22,11 @@ lib-objs := clear_page.o copy_page.o memcpy.o memset.o + lib-objs := $(addprefix ../../../lib/, $(lib-objs)) + + hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ +- hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ ++ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o \ + cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o + hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ + ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o ++hyp-obj-y += ../../../../../virt/kvm/pkvm/page_alloc.o + hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o + hyp-obj-y += $(lib-objs) + +diff --git a/arch/arm64/kvm/hyp/nvhe/early_alloc.c b/arch/arm64/kvm/hyp/nvhe/early_alloc.c +index 00de04153cc6..be1e72cdcbce 100644 +--- a/arch/arm64/kvm/hyp/nvhe/early_alloc.c ++++ b/arch/arm64/kvm/hyp/nvhe/early_alloc.c +@@ -7,7 +7,7 @@ + #include + + #include +-#include ++#include + + struct kvm_pgtable_mm_ops hyp_early_alloc_mm_ops; + s64 __ro_after_init hyp_physvirt_offset; +diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c +index 07f9dc9848ef..89d04330ca95 100644 +--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c ++++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c +@@ -14,8 +14,8 @@ + + #include + +-#include +-#include ++#include ++#include + #include + #include + +@@ -24,35 +24,35 @@ + extern unsigned long hyp_nr_cpus; + struct host_kvm host_kvm; + +-static struct hyp_pool host_s2_pool; ++static struct pkvm_pool host_s2_pool; + + const u8 pkvm_hyp_id = 1; + + static void host_lock_component(void) + { +- hyp_spin_lock(&host_kvm.lock); ++ pkvm_spin_lock(&host_kvm.lock); + } + + static void host_unlock_component(void) + { +- hyp_spin_unlock(&host_kvm.lock); ++ pkvm_spin_unlock(&host_kvm.lock); + } + + static void hyp_lock_component(void) + { +- hyp_spin_lock(&pkvm_pgd_lock); ++ pkvm_spin_lock(&pkvm_pgd_lock); + } + + static void hyp_unlock_component(void) + { +- hyp_spin_unlock(&pkvm_pgd_lock); ++ pkvm_spin_unlock(&pkvm_pgd_lock); + } + + static void *host_s2_zalloc_pages_exact(size_t size) + { +- void *addr = hyp_alloc_pages(&host_s2_pool, get_order(size)); ++ void *addr = pkvm_alloc_pages(&host_s2_pool, get_order(size)); + +- hyp_split_page(hyp_virt_to_page(addr)); ++ pkvm_split_page(pkvm_virt_to_page(addr)); + + /* + * The size of concatenated PGDs is always a power of two of PAGE_SIZE, +@@ -66,17 +66,17 @@ static void *host_s2_zalloc_pages_exact(size_t size) + + static void *host_s2_zalloc_page(void *pool) + { +- return hyp_alloc_pages(pool, 0); ++ return pkvm_alloc_pages(pool, 0); + } + + static void host_s2_get_page(void *addr) + { +- hyp_get_page(&host_s2_pool, addr); ++ pkvm_get_page(&host_s2_pool, addr); + } + + static void host_s2_put_page(void *addr) + { +- hyp_put_page(&host_s2_pool, addr); ++ pkvm_put_page(&host_s2_pool, addr); + } + + static int prepare_s2_pool(void *pgt_pool_base) +@@ -84,9 +84,9 @@ static int prepare_s2_pool(void *pgt_pool_base) + unsigned long nr_pages, pfn; + int ret; + +- pfn = hyp_virt_to_pfn(pgt_pool_base); ++ pfn = pkvm_virt_to_pfn(pgt_pool_base); + nr_pages = host_s2_pgtable_pages(); +- ret = hyp_pool_init(&host_s2_pool, pfn, nr_pages, 0); ++ ret = pkvm_pool_init(&host_s2_pool, pfn, nr_pages, 0); + if (ret) + return ret; + +@@ -95,7 
+95,7 @@ static int prepare_s2_pool(void *pgt_pool_base) + .zalloc_page = host_s2_zalloc_page, + .phys_to_virt = hyp_phys_to_virt, + .virt_to_phys = hyp_virt_to_phys, +- .page_count = hyp_page_count, ++ .page_count = pkvm_page_count, + .get_page = host_s2_get_page, + .put_page = host_s2_put_page, + }; +@@ -123,7 +123,7 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) + int ret; + + prepare_host_vtcr(); +- hyp_spin_lock_init(&host_kvm.lock); ++ pkvm_spinlock_init(&host_kvm.lock); + mmu->arch = &host_kvm.arch; + + ret = prepare_s2_pool(pgt_pool_base); +@@ -181,8 +181,8 @@ static int host_stage2_unmap_dev_all(void) + int i, ret; + + /* Unmap all non-memory regions to recycle the pages */ +- for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) { +- reg = &hyp_memory[i]; ++ for (i = 0; i < pkvm_memblock_nr; i++, addr = reg->base + reg->size) { ++ reg = &pkvm_memory[i]; + ret = kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr); + if (ret) + return ret; +@@ -197,7 +197,7 @@ struct kvm_mem_range { + + static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) + { +- int cur, left = 0, right = hyp_memblock_nr; ++ int cur, left = 0, right = pkvm_memblock_nr; + struct memblock_region *reg; + phys_addr_t end; + +@@ -207,7 +207,7 @@ static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) + /* The list of memblock regions is sorted, binary search it */ + while (left < right) { + cur = (left + right) >> 1; +- reg = &hyp_memory[cur]; ++ reg = &pkvm_memory[cur]; + end = reg->base + reg->size; + if (addr < reg->base) { + right = cur; +@@ -263,7 +263,7 @@ static inline int __host_stage2_idmap(u64 start, u64 end, + #define host_stage2_try(fn, ...) \ + ({ \ + int __ret; \ +- hyp_assert_lock_held(&host_kvm.lock); \ ++ pkvm_assert_lock_held(&host_kvm.lock); \ + __ret = fn(__VA_ARGS__); \ + if (__ret == -ENOMEM) { \ + __ret = host_stage2_unmap_dev_all(); \ +@@ -286,7 +286,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) + u32 level; + int ret; + +- hyp_assert_lock_held(&host_kvm.lock); ++ pkvm_assert_lock_held(&host_kvm.lock); + ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level); + if (ret) + return ret; +@@ -459,7 +459,7 @@ static int __host_check_page_state_range(u64 addr, u64 size, + .get_page_state = host_get_page_state, + }; + +- hyp_assert_lock_held(&host_kvm.lock); ++ pkvm_assert_lock_held(&host_kvm.lock); + return check_page_state_range(&host_kvm.pgt, addr, size, &d); + } + +@@ -516,7 +516,7 @@ static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte) + if (!kvm_pte_valid(pte)) + return PKVM_NOPAGE; + +- return pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte)); ++ return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); + } + + static int __hyp_check_page_state_range(u64 addr, u64 size, +@@ -527,7 +527,7 @@ static int __hyp_check_page_state_range(u64 addr, u64 size, + .get_page_state = hyp_get_page_state, + }; + +- hyp_assert_lock_held(&pkvm_pgd_lock); ++ pkvm_assert_lock_held(&pkvm_pgd_lock); + return check_page_state_range(&pkvm_pgtable, addr, size, &d); + } + +@@ -735,7 +735,7 @@ static int do_unshare(struct pkvm_mem_share *share) + int __pkvm_host_share_hyp(u64 pfn) + { + int ret; +- u64 host_addr = hyp_pfn_to_phys(pfn); ++ u64 host_addr = pkvm_pfn_to_phys(pfn); + u64 hyp_addr = (u64)__hyp_va(host_addr); + struct pkvm_mem_share share = { + .tx = { +@@ -768,7 +768,7 @@ int __pkvm_host_share_hyp(u64 pfn) + int __pkvm_host_unshare_hyp(u64 pfn) + { + int ret; +- u64 host_addr = hyp_pfn_to_phys(pfn); ++ u64 
host_addr = pkvm_pfn_to_phys(pfn); + u64 hyp_addr = (u64)__hyp_va(host_addr); + struct pkvm_mem_share share = { + .tx = { +diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c +index 96193cb31a39..eb0071451cb4 100644 +--- a/arch/arm64/kvm/hyp/nvhe/mm.c ++++ b/arch/arm64/kvm/hyp/nvhe/mm.c +@@ -12,16 +12,16 @@ + #include + + #include +-#include +-#include ++#include ++#include + #include +-#include ++#include + + struct kvm_pgtable pkvm_pgtable; +-hyp_spinlock_t pkvm_pgd_lock; ++pkvm_spinlock_t pkvm_pgd_lock; + +-struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS]; +-unsigned int hyp_memblock_nr; ++struct memblock_region pkvm_memory[PKVM_MEMBLOCK_REGIONS]; ++unsigned int pkvm_memblock_nr; + + static u64 __io_map_base; + +@@ -30,9 +30,9 @@ static int __pkvm_create_mappings(unsigned long start, unsigned long size, + { + int err; + +- hyp_spin_lock(&pkvm_pgd_lock); ++ pkvm_spin_lock(&pkvm_pgd_lock); + err = kvm_pgtable_hyp_map(&pkvm_pgtable, start, size, phys, prot); +- hyp_spin_unlock(&pkvm_pgd_lock); ++ pkvm_spin_unlock(&pkvm_pgd_lock); + + return err; + } +@@ -52,7 +52,7 @@ int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr) + unsigned long base, addr; + int ret = 0; + +- hyp_spin_lock(&pkvm_pgd_lock); ++ pkvm_spin_lock(&pkvm_pgd_lock); + + /* Align the allocation based on the order of its size */ + addr = ALIGN(__io_map_base, PAGE_SIZE << get_order(size)); +@@ -61,14 +61,14 @@ int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr) + base = addr + PAGE_ALIGN(size); + + /* Are we overflowing on the vmemmap ? */ +- if (!addr || base > __hyp_vmemmap) ++ if (!addr || base > __pkvm_vmemmap) + ret = -ENOMEM; + else { + __io_map_base = base; + *haddr = addr; + } + +- hyp_spin_unlock(&pkvm_pgd_lock); ++ pkvm_spin_unlock(&pkvm_pgd_lock); + + return ret; + } +@@ -100,7 +100,7 @@ int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot + unsigned long virt_addr; + phys_addr_t phys; + +- hyp_assert_lock_held(&pkvm_pgd_lock); ++ pkvm_assert_lock_held(&pkvm_pgd_lock); + + start = start & PAGE_MASK; + end = PAGE_ALIGN(end); +@@ -122,9 +122,9 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) + { + int ret; + +- hyp_spin_lock(&pkvm_pgd_lock); ++ pkvm_spin_lock(&pkvm_pgd_lock); + ret = pkvm_create_mappings_locked(from, to, prot); +- hyp_spin_unlock(&pkvm_pgd_lock); ++ pkvm_spin_unlock(&pkvm_pgd_lock); + + return ret; + } +@@ -209,7 +209,7 @@ int hyp_create_idmap(u32 hyp_va_bits) + */ + __io_map_base = start & BIT(hyp_va_bits - 2); + __io_map_base ^= BIT(hyp_va_bits - 2); +- __hyp_vmemmap = __io_map_base | BIT(hyp_va_bits - 3); ++ __pkvm_vmemmap = __io_map_base | BIT(hyp_va_bits - 3); + + return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC); + } +diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c +index 08508783ec3d..1c757bd02d4d 100644 +--- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c ++++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c +@@ -11,7 +11,7 @@ + #include + #include + +-#include ++#include + #include + + void kvm_hyp_cpu_entry(unsigned long r0); +diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c +index e8d4ea2fcfa0..2c9ce8761e79 100644 +--- a/arch/arm64/kvm/hyp/nvhe/setup.c ++++ b/arch/arm64/kvm/hyp/nvhe/setup.c +@@ -12,8 +12,8 @@ + + #include + #include +-#include +-#include ++#include ++#include + #include + #include + #include +@@ -27,7 +27,7 @@ static void *vmemmap_base; + static void *hyp_pgt_base; + static void 
*host_s2_pgt_base; + static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; +-static struct hyp_pool hpool; ++static struct pkvm_pool ppool; + + static int divide_memory_pool(void *virt, unsigned long size) + { +@@ -126,10 +126,10 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, + * and addresses corresponding to the guard page have the + * PAGE_SHIFT bit as 0 - this is used for overflow detection. + */ +- hyp_spin_lock(&pkvm_pgd_lock); ++ pkvm_spin_lock(&pkvm_pgd_lock); + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, hyp_addr + PAGE_SIZE, + PAGE_SIZE, params->stack_pa, PAGE_HYP); +- hyp_spin_unlock(&pkvm_pgd_lock); ++ pkvm_spin_unlock(&pkvm_pgd_lock); + if (ret) + return ret; + +@@ -173,17 +173,17 @@ static void update_nvhe_init_params(void) + + static void *hyp_zalloc_hyp_page(void *arg) + { +- return hyp_alloc_pages(&hpool, 0); ++ return pkvm_alloc_pages(&ppool, 0); + } + + static void hpool_get_page(void *addr) + { +- hyp_get_page(&hpool, addr); ++ pkvm_get_page(&ppool, addr); + } + + static void hpool_put_page(void *addr) + { +- hyp_put_page(&hpool, addr); ++ pkvm_put_page(&ppool, addr); + } + + static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, +@@ -246,8 +246,8 @@ static int finalize_host_mappings(void) + }; + int i, ret; + +- for (i = 0; i < hyp_memblock_nr; i++) { +- struct memblock_region *reg = &hyp_memory[i]; ++ for (i = 0; i < pkvm_memblock_nr; i++) { ++ struct memblock_region *reg = &pkvm_memory[i]; + u64 start = (u64)hyp_phys_to_virt(reg->base); + + ret = kvm_pgtable_walk(&pkvm_pgtable, start, reg->size, &walker); +@@ -266,10 +266,10 @@ void __noreturn __pkvm_init_finalise(void) + int ret; + + /* Now that the vmemmap is backed, install the full-fledged allocator */ +- pfn = hyp_virt_to_pfn(hyp_pgt_base); ++ pfn = pkvm_virt_to_pfn(hyp_pgt_base); + nr_pages = hyp_s1_pgtable_pages(); + reserved_pages = hyp_early_alloc_nr_used_pages(); +- ret = hyp_pool_init(&hpool, pfn, nr_pages, reserved_pages); ++ ret = pkvm_pool_init(&ppool, pfn, nr_pages, reserved_pages); + if (ret) + goto out; + +@@ -283,7 +283,7 @@ void __noreturn __pkvm_init_finalise(void) + .virt_to_phys = hyp_virt_to_phys, + .get_page = hpool_get_page, + .put_page = hpool_put_page, +- .page_count = hyp_page_count, ++ .page_count = pkvm_page_count, + }; + pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops; + +@@ -314,7 +314,7 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, + if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) + return -EINVAL; + +- hyp_spin_lock_init(&pkvm_pgd_lock); ++ pkvm_spinlock_init(&pkvm_pgd_lock); + hyp_nr_cpus = nr_cpus; + + ret = divide_memory_pool(virt, size); +diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c +index ebecb7c045f4..a83a350c1956 100644 +--- a/arch/arm64/kvm/pkvm.c ++++ b/arch/arm64/kvm/pkvm.c +@@ -5,72 +5,27 @@ + */ + + #include +-#include +-#include + + #include +- + #include "hyp_constants.h" + +-static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory); +-static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr); +- +-phys_addr_t hyp_mem_base; +-phys_addr_t hyp_mem_size; +- +-static int cmp_hyp_memblock(const void *p1, const void *p2) ++int pkvm_pre_reserve_check(void) + { +- const struct memblock_region *r1 = p1; +- const struct memblock_region *r2 = p2; +- +- return r1->base < r2->base ? 
-1 : (r1->base > r2->base); +-} +- +-static void __init sort_memblock_regions(void) +-{ +- sort(hyp_memory, +- *hyp_memblock_nr_ptr, +- sizeof(struct memblock_region), +- cmp_hyp_memblock, +- NULL); +-} +- +-static int __init register_memblock_regions(void) +-{ +- struct memblock_region *reg; +- +- for_each_mem_region(reg) { +- if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS) +- return -ENOMEM; ++ if (!is_hyp_mode_available() || is_kernel_in_hyp_mode()) ++ return -EINVAL; + +- hyp_memory[*hyp_memblock_nr_ptr] = *reg; +- (*hyp_memblock_nr_ptr)++; +- } +- sort_memblock_regions(); ++ if (kvm_get_mode() != KVM_MODE_PROTECTED) ++ return -EINVAL; + + return 0; + } + +-void __init kvm_hyp_reserve(void) ++u64 pkvm_total_reserve_pages(void) + { +- u64 nr_pages, prev, hyp_mem_pages = 0; +- int ret; +- +- if (!is_hyp_mode_available() || is_kernel_in_hyp_mode()) +- return; +- +- if (kvm_get_mode() != KVM_MODE_PROTECTED) +- return; ++ u64 nr_pages, prev, total_pages = 0; + +- ret = register_memblock_regions(); +- if (ret) { +- *hyp_memblock_nr_ptr = 0; +- kvm_err("Failed to register hyp memblocks: %d\n", ret); +- return; +- } +- +- hyp_mem_pages += hyp_s1_pgtable_pages(); +- hyp_mem_pages += host_s2_pgtable_pages(); ++ total_pages += hyp_s1_pgtable_pages(); ++ total_pages += host_s2_pgtable_pages(); + + /* + * The hyp_vmemmap needs to be backed by pages, but these pages +@@ -80,30 +35,12 @@ void __init kvm_hyp_reserve(void) + nr_pages = 0; + do { + prev = nr_pages; +- nr_pages = hyp_mem_pages + prev; ++ nr_pages = total_pages + prev; + nr_pages = DIV_ROUND_UP(nr_pages * STRUCT_HYP_PAGE_SIZE, + PAGE_SIZE); + nr_pages += __hyp_pgtable_max_pages(nr_pages); + } while (nr_pages != prev); +- hyp_mem_pages += nr_pages; +- +- /* +- * Try to allocate a PMD-aligned region to reduce TLB pressure once +- * this is unmapped from the host stage-2, and fallback to PAGE_SIZE. +- */ +- hyp_mem_size = hyp_mem_pages << PAGE_SHIFT; +- hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE), +- PMD_SIZE); +- if (!hyp_mem_base) +- hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE); +- else +- hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE); +- +- if (!hyp_mem_base) { +- kvm_err("Failed to reserve hyp memory\n"); +- return; +- } ++ total_pages += nr_pages; + +- kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20, +- hyp_mem_base); ++ return total_pages; + } +diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c +index 4b4651ee47f2..f939487b24b4 100644 +--- a/arch/arm64/mm/init.c ++++ b/arch/arm64/mm/init.c +@@ -420,7 +420,7 @@ void __init bootmem_init(void) + + dma_pernuma_cma_reserve(); + +- kvm_hyp_reserve(); ++ pkvm_reserve(); + + /* + * sparse_init() tries to allocate memory from memblock, so must be +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 4c9bfc4be58d..f430abaad5aa 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -896,6 +896,17 @@ config INTEL_TDX_GUEST + memory contents and CPU state. TDX guests are protected from + some attacks from the VMM. + ++config PKVM_GUEST ++ bool "PKVM Guest Support" ++ depends on X86_64 && !PKVM_INTEL ++ select ARCH_HAS_CC_PLATFORM ++ select X86_MEM_ENCRYPT ++ default n ++ help ++ Support running as a protected guest under Protected KVM. ++ Without this support, the guest kernel can not boot or run ++ under Protected KVM. 
++ + endif # HYPERVISOR_GUEST + + source "arch/x86/Kconfig.cpu" +diff --git a/arch/x86/coco/Makefile b/arch/x86/coco/Makefile +index c816acf78b6a..878cb2f5cccd 100644 +--- a/arch/x86/coco/Makefile ++++ b/arch/x86/coco/Makefile +@@ -5,4 +5,5 @@ CFLAGS_core.o += -fno-stack-protector + + obj-y += core.o + +-obj-$(CONFIG_INTEL_TDX_GUEST) += tdx/ ++obj-$(CONFIG_INTEL_TDX_GUEST) += tdx/ virt_exception.o ++obj-$(CONFIG_PKVM_GUEST) += pkvm/ virt_exception.o +diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c +index 49b44f881484..701019e9191e 100644 +--- a/arch/x86/coco/core.c ++++ b/arch/x86/coco/core.c +@@ -16,6 +16,38 @@ + static enum cc_vendor vendor __ro_after_init; + static u64 cc_mask __ro_after_init; + ++static bool pkvm_cc_platform_has(enum cc_attr attr) ++{ ++ /* ++ * Since primary VM can't access pkvm guest's memory, pkvm guest need ++ * explicitly share DMA buffer with primary VM to make virtio work. By ++ * using these attribute, pkvm guest will using bounce buffer for DMA ++ * operation, and share the bounce buffer with primary VM. ++ * ++ * CC_ATTR_GUEST_UNROLL_STRING_IO: Since string io cause KVM to do ++ * instruction decode, to avoid it, using this attribute will unroll the ++ * string io. For example, in , the definition of the outsb ++ * check to attribute to determine if using string io. ++ * ++ * CC_ATTR_GUEST_MEM_ENCRYPT: This attribute has been checked in the ++ * force_dma_unencrypted(). Which means all DMA buffer will be shared ++ * between pkvm guest and primary VM. And checked in ++ * pci_swiotlb_detect(), this makes pkvm guest using bounce buffer. ++ * ++ * CC_ATTR_MEM_ENCRYPT: This attribute has been checked in the ++ * mem_encrypt_init(). Which will make all bounce buffer being shared ++ * between pkvm guest and primary VM. ++ */ ++ switch (attr) { ++ case CC_ATTR_GUEST_UNROLL_STRING_IO: ++ case CC_ATTR_GUEST_MEM_ENCRYPT: ++ case CC_ATTR_MEM_ENCRYPT: ++ return true; ++ default: ++ return false; ++ } ++} ++ + static bool intel_cc_platform_has(enum cc_attr attr) + { + switch (attr) { +@@ -90,6 +122,8 @@ bool cc_platform_has(enum cc_attr attr) + return intel_cc_platform_has(attr); + case CC_VENDOR_HYPERV: + return hyperv_cc_platform_has(attr); ++ case CC_VENDOR_PKVM: ++ return pkvm_cc_platform_has(attr); + default: + return false; + } +diff --git a/arch/x86/coco/pkvm/Makefile b/arch/x86/coco/pkvm/Makefile +new file mode 100644 +index 000000000000..7896f6d4f4b2 +--- /dev/null ++++ b/arch/x86/coco/pkvm/Makefile +@@ -0,0 +1,3 @@ ++# SPDX-License-Identifier: GPL-2.0 ++ ++obj-y += pkvm.o pkvmcall.o +diff --git a/arch/x86/coco/pkvm/pkvm.c b/arch/x86/coco/pkvm/pkvm.c +new file mode 100644 +index 000000000000..3590bed967db +--- /dev/null ++++ b/arch/x86/coco/pkvm/pkvm.c +@@ -0,0 +1,113 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#undef pr_fmt ++#define pr_fmt(fmt) "pkvm: " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static bool pkvm_guest_detected; ++ ++bool pkvm_is_protected_guest(void) ++{ ++ return pkvm_guest_detected; ++} ++ ++int pkvm_set_mem_host_visibility(unsigned long addr, int numpages, bool enc) ++{ ++ unsigned long size = numpages * PAGE_SIZE; ++ ++ if (!enc) { ++ /* ++ * When pkvm guest want to share a range of memory, these pages ++ * may have not been setup in the guest ept pagetables. So when ++ * the pkvm do the __pkvm_guest_share_host() thing, if no page ++ * found in guest ept, this function will failed, thus the share ++ * page function will failed. 
++ * So before share these pages to host, first touch them, so ++ * they will have entry in the guest ept, to make sure the ++ * sharing will success. ++ * ++ * TODO: Another good way to mitigate this touch is to fake ept ++ * violation when the sharing function find that there is no ++ * page in the guest ept. ++ */ ++ memset((void *)addr, 0, size); ++ kvm_hypercall2(PKVM_GHC_SHARE_MEM, __pa(addr), size); ++ } else ++ kvm_hypercall2(PKVM_GHC_UNSHARE_MEM, __pa(addr), size); ++ ++ return 0; ++} ++ ++void pkvm_get_ve_info(struct ve_info *ve) ++{ ++ /* Reuse the tdx output for pkvm. */ ++ struct tdx_module_output out; ++ ++ __pkvm_module_call(PKVM_GHC_GET_VE_INFO, &out); ++ ++ /* Transfer the output parameters */ ++ ve->exit_reason = out.rcx; ++ ve->exit_qual = out.rdx; ++ ve->gla = out.r8; ++ ve->gpa = out.r9; ++} ++ ++static bool mmio_write(int size, unsigned long addr, unsigned long val) ++{ ++ kvm_hypercall3(PKVM_GHC_IOWRITE, addr, size, val); ++ ++ return true; ++} ++ ++static bool mmio_read(int size, unsigned long addr, unsigned long *val) ++{ ++ *val = kvm_hypercall2(PKVM_GHC_IOREAD, addr, size); ++ ++ return true; ++} ++ ++static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) ++{ ++ switch (ve->exit_reason) { ++ case EXIT_REASON_EPT_VIOLATION: ++ return ve_handle_mmio(regs, ve); ++ default: ++ pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); ++ return -EIO; ++ } ++} ++ ++static bool pkvm_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) ++{ ++ int insn_len; ++ ++ insn_len = virt_exception_kernel(regs, ve); ++ if (insn_len < 0) ++ return false; ++ ++ /* After successful #VE handling, move the IP */ ++ regs->ip += insn_len; ++ ++ return true; ++} ++ ++__init void pkvm_guest_init_coco(void) ++{ ++ cc_set_vendor(CC_VENDOR_PKVM); ++ ++ pkvm_guest_detected = true; ++ ++ ve_x86_ops.mmio_read = mmio_read; ++ ve_x86_ops.mmio_write = mmio_write; ++ ve_x86_ops.handle_virt_exception = pkvm_handle_virt_exception; ++ ve_x86_ops.get_ve_info = pkvm_get_ve_info; ++} +diff --git a/arch/x86/coco/pkvm/pkvmcall.S b/arch/x86/coco/pkvm/pkvmcall.S +new file mode 100644 +index 000000000000..b7cbe432f065 +--- /dev/null ++++ b/arch/x86/coco/pkvm/pkvmcall.S +@@ -0,0 +1,42 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++.macro PKVM_MODULE_CALL ++ /* Save the output parameter. */ ++ push %r12 ++ ++ /* Push output pointer to stack. */ ++ push %rsi ++ ++ mov %rdi, %rax ++ ++ vmcall ++ ++ pop %r12 ++ ++ test %r12, %r12 ++ jz .Lno_output_struct ++ ++ /* Copy result registers to output struct. 
*/ ++ movq %rcx, 0(%r12) ++ movq %rdx, 8(%r12) ++ movq %r8, 16(%r12) ++ movq %r9, 24(%r12) ++ ++.Lno_output_struct: ++ pop %r12 ++.endm ++ ++SYM_FUNC_START(__pkvm_module_call) ++ FRAME_BEGIN ++ PKVM_MODULE_CALL ++ FRAME_END ++ RET ++SYM_FUNC_END(__pkvm_module_call) +diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c +index 8a1d48b8c2a3..e8393b62ed0c 100644 +--- a/arch/x86/coco/tdx/tdx.c ++++ b/arch/x86/coco/tdx/tdx.c +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + + /* TDX module Call Leaf IDs */ + #define TDX_GET_INFO 1 +@@ -342,111 +343,6 @@ static bool mmio_write(int size, unsigned long addr, unsigned long val) + EPT_WRITE, addr, val); + } + +-static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) +-{ +- unsigned long *reg, val, vaddr; +- char buffer[MAX_INSN_SIZE]; +- struct insn insn = {}; +- enum mmio_type mmio; +- int size, extend_size; +- u8 extend_val = 0; +- +- /* Only in-kernel MMIO is supported */ +- if (WARN_ON_ONCE(user_mode(regs))) +- return -EFAULT; +- +- if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE)) +- return -EFAULT; +- +- if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64)) +- return -EINVAL; +- +- mmio = insn_decode_mmio(&insn, &size); +- if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED)) +- return -EINVAL; +- +- if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { +- reg = insn_get_modrm_reg_ptr(&insn, regs); +- if (!reg) +- return -EINVAL; +- } +- +- /* +- * Reject EPT violation #VEs that split pages. +- * +- * MMIO accesses are supposed to be naturally aligned and therefore +- * never cross page boundaries. Seeing split page accesses indicates +- * a bug or a load_unaligned_zeropad() that stepped into an MMIO page. +- * +- * load_unaligned_zeropad() will recover using exception fixups. +- */ +- vaddr = (unsigned long)insn_get_addr_ref(&insn, regs); +- if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE) +- return -EFAULT; +- +- /* Handle writes first */ +- switch (mmio) { +- case MMIO_WRITE: +- memcpy(&val, reg, size); +- if (!mmio_write(size, ve->gpa, val)) +- return -EIO; +- return insn.length; +- case MMIO_WRITE_IMM: +- val = insn.immediate.value; +- if (!mmio_write(size, ve->gpa, val)) +- return -EIO; +- return insn.length; +- case MMIO_READ: +- case MMIO_READ_ZERO_EXTEND: +- case MMIO_READ_SIGN_EXTEND: +- /* Reads are handled below */ +- break; +- case MMIO_MOVS: +- case MMIO_DECODE_FAILED: +- /* +- * MMIO was accessed with an instruction that could not be +- * decoded or handled properly. It was likely not using io.h +- * helpers or accessed MMIO accidentally. +- */ +- return -EINVAL; +- default: +- WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?"); +- return -EINVAL; +- } +- +- /* Handle reads */ +- if (!mmio_read(size, ve->gpa, &val)) +- return -EIO; +- +- switch (mmio) { +- case MMIO_READ: +- /* Zero-extend for 32-bit operation */ +- extend_size = size == 4 ? 
sizeof(*reg) : 0; +- break; +- case MMIO_READ_ZERO_EXTEND: +- /* Zero extend based on operand size */ +- extend_size = insn.opnd_bytes; +- break; +- case MMIO_READ_SIGN_EXTEND: +- /* Sign extend based on operand size */ +- extend_size = insn.opnd_bytes; +- if (size == 1 && val & BIT(7)) +- extend_val = 0xFF; +- else if (size > 1 && val & BIT(15)) +- extend_val = 0xFF; +- break; +- default: +- /* All other cases has to be covered with the first switch() */ +- WARN_ON_ONCE(1); +- return -EINVAL; +- } +- +- if (extend_size) +- memset(reg, extend_val, extend_size); +- memcpy(reg, &val, size); +- return insn.length; +-} +- + static bool handle_in(struct pt_regs *regs, int size, int port) + { + struct tdx_hypercall_args args = { +@@ -606,7 +502,7 @@ static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) + case EXIT_REASON_CPUID: + return handle_cpuid(regs, ve); + case EXIT_REASON_EPT_VIOLATION: +- return handle_mmio(regs, ve); ++ return ve_handle_mmio(regs, ve); + case EXIT_REASON_IO_INSTRUCTION: + return handle_io(regs, ve); + default: +@@ -615,7 +511,7 @@ static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) + } + } + +-bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) ++static bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) + { + int insn_len; + +@@ -829,5 +725,10 @@ void __init tdx_early_init(void) + x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required; + x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required; + ++ ve_x86_ops.mmio_read = mmio_read; ++ ve_x86_ops.mmio_write = mmio_write; ++ ve_x86_ops.handle_virt_exception = tdx_handle_virt_exception; ++ ve_x86_ops.get_ve_info = tdx_get_ve_info; ++ + pr_info("Guest detected\n"); + } +diff --git a/arch/x86/coco/virt_exception.c b/arch/x86/coco/virt_exception.c +new file mode 100644 +index 000000000000..15db92c24787 +--- /dev/null ++++ b/arch/x86/coco/virt_exception.c +@@ -0,0 +1,126 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include ++#include ++#include ++#include ++ ++struct ve_x86_ops ve_x86_ops; ++ ++int ve_handle_mmio(struct pt_regs *regs, struct ve_info *ve) ++{ ++ unsigned long *reg, val, vaddr; ++ char buffer[MAX_INSN_SIZE]; ++ struct insn insn = {}; ++ enum mmio_type mmio; ++ int size, extend_size; ++ u8 extend_val = 0; ++ ++ /* Only in-kernel MMIO is supported */ ++ if (WARN_ON_ONCE(user_mode(regs))) ++ return -EFAULT; ++ ++ if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE)) ++ return -EFAULT; ++ ++ if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64)) ++ return -EINVAL; ++ ++ mmio = insn_decode_mmio(&insn, &size); ++ if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED)) ++ return -EINVAL; ++ ++ if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { ++ reg = insn_get_modrm_reg_ptr(&insn, regs); ++ if (!reg) ++ return -EINVAL; ++ } ++ ++ /* ++ * Reject EPT violation #VEs that split pages. ++ * ++ * MMIO accesses are supposed to be naturally aligned and therefore ++ * never cross page boundaries. Seeing split page accesses indicates ++ * a bug or a load_unaligned_zeropad() that stepped into an MMIO page. ++ * ++ * load_unaligned_zeropad() will recover using exception fixups. 
++ */ ++ vaddr = (unsigned long)insn_get_addr_ref(&insn, regs); ++ if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE) ++ return -EFAULT; ++ ++ /* Handle writes first */ ++ switch (mmio) { ++ case MMIO_WRITE: ++ memcpy(&val, reg, size); ++ if (!ve_x86_ops.mmio_write(size, ve->gpa, val)) ++ return -EIO; ++ return insn.length; ++ case MMIO_WRITE_IMM: ++ val = insn.immediate.value; ++ if (!ve_x86_ops.mmio_write(size, ve->gpa, val)) ++ return -EIO; ++ return insn.length; ++ case MMIO_READ: ++ case MMIO_READ_ZERO_EXTEND: ++ case MMIO_READ_SIGN_EXTEND: ++ /* Reads are handled below */ ++ break; ++ case MMIO_MOVS: ++ case MMIO_DECODE_FAILED: ++ /* ++ * MMIO was accessed with an instruction that could not be ++ * decoded or handled properly. It was likely not using io.h ++ * helpers or accessed MMIO accidentally. ++ */ ++ return -EINVAL; ++ default: ++ WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?"); ++ return -EINVAL; ++ } ++ ++ /* Handle reads */ ++ if (!ve_x86_ops.mmio_read(size, ve->gpa, &val)) ++ return -EIO; ++ ++ switch (mmio) { ++ case MMIO_READ: ++ /* Zero-extend for 32-bit operation */ ++ extend_size = size == 4 ? sizeof(*reg) : 0; ++ break; ++ case MMIO_READ_ZERO_EXTEND: ++ /* Zero extend based on operand size */ ++ extend_size = insn.opnd_bytes; ++ break; ++ case MMIO_READ_SIGN_EXTEND: ++ /* Sign extend based on operand size */ ++ extend_size = insn.opnd_bytes; ++ if (size == 1 && val & BIT(7)) ++ extend_val = 0xFF; ++ else if (size > 1 && val & BIT(15)) ++ extend_val = 0xFF; ++ break; ++ default: ++ /* All other cases has to be covered with the first switch() */ ++ WARN_ON_ONCE(1); ++ return -EINVAL; ++ } ++ ++ if (extend_size) ++ memset(reg, extend_val, extend_size); ++ memcpy(reg, &val, size); ++ return insn.length; ++} ++ ++bool handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) ++{ ++ if (ve_x86_ops.handle_virt_exception) ++ return ve_x86_ops.handle_virt_exception(regs, ve); ++ ++ return false; ++} ++ ++void get_ve_info(struct ve_info *ve) ++{ ++ if (ve_x86_ops.get_ve_info) ++ ve_x86_ops.get_ve_info(ve); ++} +diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h +index 3d98c3a60d34..065a70448d8a 100644 +--- a/arch/x86/include/asm/coco.h ++++ b/arch/x86/include/asm/coco.h +@@ -9,6 +9,7 @@ enum cc_vendor { + CC_VENDOR_AMD, + CC_VENDOR_HYPERV, + CC_VENDOR_INTEL, ++ CC_VENDOR_PKVM, + }; + + void cc_set_vendor(enum cc_vendor v); +diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h +index e41cbf2ec41d..731db23c6197 100644 +--- a/arch/x86/include/asm/hypervisor.h ++++ b/arch/x86/include/asm/hypervisor.h +@@ -30,6 +30,7 @@ enum x86_hypervisor_type { + X86_HYPER_KVM, + X86_HYPER_JAILHOUSE, + X86_HYPER_ACRN, ++ X86_HYPER_PKVM, + }; + + #ifdef CONFIG_HYPERVISOR_GUEST +@@ -64,6 +65,7 @@ extern const struct hypervisor_x86 x86_hyper_xen_pv; + extern const struct hypervisor_x86 x86_hyper_kvm; + extern const struct hypervisor_x86 x86_hyper_jailhouse; + extern const struct hypervisor_x86 x86_hyper_acrn; ++extern const struct hypervisor_x86 x86_hyper_pkvm; + extern struct hypervisor_x86 x86_hyper_xen_hvm; + + extern bool nopv; +diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h +index 72184b0b2219..51d79424c09a 100644 +--- a/arch/x86/include/asm/idtentry.h ++++ b/arch/x86/include/asm/idtentry.h +@@ -632,7 +632,7 @@ DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback); + DECLARE_IDTENTRY_RAW(X86_TRAP_OTHER, exc_xen_unknown_trap); + #endif + +-#ifdef CONFIG_INTEL_TDX_GUEST ++#if 
defined(CONFIG_INTEL_TDX_GUEST) || defined(CONFIG_PKVM_GUEST) + DECLARE_IDTENTRY(X86_TRAP_VE, exc_virtualization_exception); + #endif + +diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h +index 2c6698aa218b..6edd77847405 100644 +--- a/arch/x86/include/asm/kvm-x86-ops.h ++++ b/arch/x86/include/asm/kvm-x86-ops.h +@@ -19,8 +19,10 @@ KVM_X86_OP(hardware_disable) + KVM_X86_OP(hardware_unsetup) + KVM_X86_OP(has_emulated_msr) + KVM_X86_OP(vcpu_after_set_cpuid) ++KVM_X86_OP(is_vm_type_supported) + KVM_X86_OP(vm_init) + KVM_X86_OP_OPTIONAL(vm_destroy) ++KVM_X86_OP_OPTIONAL(vm_free) + KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate) + KVM_X86_OP(vcpu_create) + KVM_X86_OP(vcpu_free) +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index c1dcaa3d2d6e..13cce3625c9a 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1148,6 +1148,7 @@ enum kvm_apicv_inhibit { + }; + + struct kvm_arch { ++ unsigned long vm_type; + unsigned long n_used_mmu_pages; + unsigned long n_requested_mmu_pages; + unsigned long n_max_mmu_pages; +@@ -1462,10 +1463,12 @@ struct kvm_x86_ops { + void (*hardware_unsetup)(void); + bool (*has_emulated_msr)(struct kvm *kvm, u32 index); + void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); ++ bool (*is_vm_type_supported)(unsigned long vm_type); + + unsigned int vm_size; + int (*vm_init)(struct kvm *kvm); + void (*vm_destroy)(struct kvm *kvm); ++ void (*vm_free)(struct kvm *kvm); + + /* Create, but do not attach this VCPU */ + int (*vcpu_precreate)(struct kvm *kvm); +@@ -1666,6 +1669,7 @@ struct kvm_x86_nested_ops { + }; + + struct kvm_x86_init_ops { ++ int (*pkvm_init)(void); + int (*cpu_has_kvm_support)(void); + int (*disabled_by_bios)(void); + int (*check_processor_compatibility)(void); +@@ -1717,9 +1721,23 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) + return -ENOTSUPP; + } + ++#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB_WITH_RANGE ++static inline int kvm_arch_flush_remote_tlb_with_range(struct kvm *kvm, ++ struct kvm_tlb_range *range) ++{ ++ if (range && kvm_x86_ops.tlb_remote_flush_with_range && ++ !static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range)) ++ return 0; ++ ++ return -ENOTSUPP; ++} ++ + #define kvm_arch_pmi_in_guest(vcpu) \ + ((vcpu) && (vcpu)->arch.handling_intr_from_guest) + ++#ifdef CONFIG_PKVM_INTEL ++int __init pkvm_init(void); ++#endif + void __init kvm_mmu_x86_module_init(void); + int kvm_mmu_vendor_module_init(void); + void kvm_mmu_vendor_module_exit(void); +diff --git a/arch/x86/include/asm/kvm_pkvm.h b/arch/x86/include/asm/kvm_pkvm.h +new file mode 100644 +index 000000000000..73ec34f2c4df +--- /dev/null ++++ b/arch/x86/include/asm/kvm_pkvm.h +@@ -0,0 +1,250 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#ifndef _ASM_X86_KVM_PKVM_H ++#define _ASM_X86_KVM_PKVM_H ++ ++#include ++ ++#ifdef CONFIG_PKVM_INTEL ++ ++#include ++#include ++#include ++ ++#define PKVM_MEMBLOCK_REGIONS 128 ++#define PKVM_PGTABLE_MAX_LEVELS 5U ++ ++extern struct memblock_region pkvm_sym(pkvm_memory)[]; ++extern unsigned int pkvm_sym(pkvm_memblock_nr); ++ ++void *pkvm_phys_to_virt(unsigned long phys); ++unsigned long pkvm_virt_to_phys(void *virt); ++ ++#define __pkvm_pa(virt) pkvm_virt_to_phys((void *)(virt)) ++#define __pkvm_va(phys) pkvm_phys_to_virt((unsigned long)(phys)) ++ ++extern phys_addr_t pkvm_mem_base; ++extern phys_addr_t pkvm_mem_size; ++ ++void __init pkvm_reserve(void); ++ ++static inline unsigned long 
__pkvm_pgtable_max_pages(unsigned long nr_pages) ++{ ++ unsigned long total = 0, i; ++ ++ /* Provision the worst case */ ++ for (i = 0; i < PKVM_PGTABLE_MAX_LEVELS; i++) { ++ nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE); ++ total += nr_pages; ++ } ++ ++ return total; ++} ++ ++static inline unsigned long __pkvm_pgtable_total_pages(void) ++{ ++ unsigned long total = 0, i; ++ ++ for (i = 0; i < pkvm_sym(pkvm_memblock_nr); i++) { ++ struct memblock_region *reg = &pkvm_sym(pkvm_memory)[i]; ++ total += __pkvm_pgtable_max_pages(reg->size >> PAGE_SHIFT); ++ } ++ ++ return total; ++} ++ ++static inline unsigned long host_ept_pgtable_pages(void) ++{ ++ unsigned long res; ++ ++ /* ++ * Include an extra 16 pages to safely upper-bound the worst case of ++ * concatenated pgds. ++ */ ++ res = __pkvm_pgtable_total_pages() + 16; ++ ++ /* Allow 1 GiB for MMIO mappings */ ++ res += __pkvm_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); ++ ++ return res; ++} ++ ++static inline unsigned long pkvm_mmu_pgtable_pages(void) ++{ ++ unsigned long res; ++ ++ res = __pkvm_pgtable_total_pages(); ++ ++ return res; ++} ++ ++static inline unsigned long pkvm_vmemmap_memblock_size(struct memblock_region *reg, ++ size_t vmemmap_entry_size) ++{ ++ unsigned long nr_pages = reg->size >> PAGE_SHIFT; ++ unsigned long start, end; ++ ++ /* Translate the pfn to the vmemmap entry */ ++ start = (reg->base >> PAGE_SHIFT) * vmemmap_entry_size; ++ end = start + nr_pages * vmemmap_entry_size; ++ start = ALIGN_DOWN(start, PAGE_SIZE); ++ end = ALIGN(end, PAGE_SIZE); ++ ++ return end - start; ++} ++ ++static inline unsigned long pkvm_vmemmap_pages(size_t vmemmap_entry_size) ++{ ++ unsigned long total_size = 0, i; ++ ++ for (i = 0; i < pkvm_sym(pkvm_memblock_nr); i++) { ++ total_size += pkvm_vmemmap_memblock_size(&pkvm_sym(pkvm_memory)[i], ++ vmemmap_entry_size); ++ } ++ ++ return total_size >> PAGE_SHIFT; ++} ++ ++static inline unsigned long pkvm_data_struct_pages(unsigned long global_pgs, ++ unsigned long percpu_pgs, int num_cpus) ++{ ++ return (percpu_pgs * num_cpus + global_pgs); ++} ++ ++static inline int pkvm_pre_reserve_check(void) ++{ ++ /* no necessary check yet*/ ++ return 0; ++} ++ ++/* Calculate the total pages for Scalable IOMMU */ ++static inline unsigned long pkvm_iommu_pages(int max_pasid, int nr_pasid_pdev, ++ int nr_pdev, int nr_iommu, int qidesc_sz, ++ int qidesc_status_sz, int num_cpus) ++{ ++ unsigned long res = 0; ++ ++ /* PASID page table pages for each PASID capable pdev */ ++ res += ((max_pasid >> 6) + (max_pasid >> 15)) * nr_pasid_pdev; ++ /* PASID page table pages (PASID dir + PASID table) for each normal pdev */ ++ res += 2 * nr_pdev; ++ /* ++ * Context table page count is the minimal value of ++ * total pdev number and 256 bus * 2 (in scalable mode). ++ * Each pdev may require a context page if its bdf is ++ * discrete enough. ++ */ ++ res += min(256 * 2, nr_pasid_pdev + nr_pdev); ++ /* Root pages for each IOMMU */ ++ res += nr_iommu; ++ /* Desc and desc_status pages for each IOMMU */ ++ res += nr_iommu * ((1 << get_order(qidesc_sz)) + (1 << get_order(qidesc_status_sz))); ++ /* ++ * Reserve more IQ descriptor page. The size is calculated according to ++ * the IOMMU QI descriptor size(excludes the QI descriptor status as ++ * this is not needed to bunch requests) and the CPU number. Each CPU can ++ * have its own reserved QI descriptor page so that multiple CPUs can ++ * bunch the QI requests at the same time. 
++ */ ++ res += num_cpus * (1 << get_order(qidesc_sz)); ++ ++ return res; ++} ++ ++/* ++ * Calculate the total pages for shadow EPT. The assumptions are that: ++ * 1. There is no shared memory between normal VMs or between secure VMs. ++ * 2. The normal VM or secure VM memory size is no larger than the platform ++ * memory size. ++ * 3. The virtual MMIO range for each VM is no larger than 1G. ++ * With these assumptions, we can reserve enough memory for normal VMs and ++ * secure VMs. ++ * 4. Each VM only has one shadow EPT. This will make vSMM mode and non-vSMM ++ * mode share the same shadow EPT for a VM, which brings security weakness for ++ * the vSMM mode. ++ */ ++static inline unsigned long pkvm_shadow_ept_pgtable_pages(int nr_vm) ++{ ++ unsigned long pgtable_pages = __pkvm_pgtable_total_pages(); ++ unsigned long res; ++ ++ /* ++ * Reserve enough pages to map all the platform memory in shadow ++ * EPT. With assumption#1 and assumption#4, these pages are enough ++ * for all VMs. ++ */ ++ res = pgtable_pages; ++ ++ /* ++ * There are multiple VMs. Although the total pages can be calculated ++ * through __pkvm_pgtable_total_pages() to map all the memory, this is ++ * enough to satisfy the level1 page table pages for all VMs but not ++ * enough to satisfy the level2:level5 page table pages. Each VM will ++ * require its own level2:level5 pages. Because __pkvm_pgtable_total_pages ++ * has already allocated 1 level2:level5, we just minus 1 from the total ++ * number of VMs, and multiply it by 2 considering SMM mode. ++ */ ++ res += __pkvm_pgtable_max_pages(pgtable_pages) * (nr_vm - 1) * 2; ++ ++ /* Allow 1 GiB for MMIO mappings for each VM */ ++ res += __pkvm_pgtable_max_pages(SZ_1G >> PAGE_SHIFT) * nr_vm; ++ ++ /* ++ * Each shadow VM has two page tables. One is used to manage page state ++ * and reused as IOMMU second-level pagetable for passthrough device in ++ * protected VM. Another one is used as shadow EPT. ++ */ ++ return (res * 2); ++} ++ ++/* ++ * Calculate the total pages for shadow IOMMU page tables for the host's ++ * devices used with Legacy IOMMU. Similarly to the calculation for shadow EPT, ++ * we assume that there is no shared memory between devices using different ++ * page tables. ++ * ++ * TODO: do not reserve these pages if legacy mode is not used by pKVM, i.e. ++ * if all the IOMMUs have scalable mode capability. ++ */ ++static inline unsigned long pkvm_host_shadow_iommu_pgtable_pages(int nr_pdev) ++{ ++ unsigned long pgtable_pages = __pkvm_pgtable_total_pages(); ++ unsigned long res; ++ ++ res = pgtable_pages; ++ ++ /* ++ * Similarly to shadow VMs (see the comment in ++ * pkvm_shadow_ept_pgtable_pages()), each device may require ++ * its own level2:level5 page table pages. 
++ */ ++ res += __pkvm_pgtable_max_pages(pgtable_pages) * (nr_pdev - 1); ++ ++ return res; ++} ++ ++ ++u64 pkvm_total_reserve_pages(void); ++ ++int pkvm_init_shadow_vm(struct kvm *kvm); ++void pkvm_teardown_shadow_vm(struct kvm *kvm); ++int pkvm_init_shadow_vcpu(struct kvm_vcpu *vcpu); ++void pkvm_teardown_shadow_vcpu(struct kvm_vcpu *vcpu); ++int pkvm_tlb_remote_flush(struct kvm *kvm); ++int pkvm_tlb_remote_flush_with_range(struct kvm *kvm, ++ struct kvm_tlb_range *range); ++int pkvm_set_mmio_ve(struct kvm_vcpu *vcpu, unsigned long gfn); ++#else ++static inline void pkvm_reserve(void) {} ++static inline int pkvm_init_shadow_vm(struct kvm *kvm) { return 0; } ++static inline void pkvm_teardown_shadow_vm(struct kvm *kvm) {} ++static inline int pkvm_init_shadow_vcpu(struct kvm_vcpu *vcpu) { return 0; } ++static inline void pkvm_teardown_shadow_vcpu(struct kvm_vcpu *vcpu) {} ++static inline int pkvm_set_mmio_ve(struct kvm_vcpu *vcpu, unsigned long gfn) { return 0; } ++#endif ++ ++#endif ++ +diff --git a/arch/x86/include/asm/pkvm.h b/arch/x86/include/asm/pkvm.h +new file mode 100644 +index 000000000000..a404dd549a1b +--- /dev/null ++++ b/arch/x86/include/asm/pkvm.h +@@ -0,0 +1,151 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _ASM_X86_PKVM_H ++#define _ASM_X86_PKVM_H ++ ++#include ++#include ++#include ++#include ++ ++/* PKVM Hypercalls */ ++#define PKVM_HC_INIT_FINALISE 1 ++#define PKVM_HC_INIT_SHADOW_VM 2 ++#define PKVM_HC_INIT_SHADOW_VCPU 3 ++#define PKVM_HC_TEARDOWN_SHADOW_VM 4 ++#define PKVM_HC_TEARDOWN_SHADOW_VCPU 5 ++#define PKVM_HC_MMIO_ACCESS 6 ++#define PKVM_HC_ACTIVATE_IOMMU 7 ++#define PKVM_HC_TLB_REMOTE_FLUSH_RANGE 8 ++#define PKVM_HC_SET_MMIO_VE 9 ++#define PKVM_HC_ADD_PTDEV 10 ++ ++/* ++ * 15bits for PASID, DO NOT change it, based on it, ++ * the size of PASID DIR table can kept as one page ++ */ ++#define PKVM_MAX_PASID_BITS 15 ++#define PKVM_MAX_PASID (1 << PKVM_MAX_PASID_BITS) ++ ++#ifdef CONFIG_PKVM_INTEL ++DECLARE_PER_CPU_READ_MOSTLY(bool, pkvm_enabled); ++ ++static inline u64 pkvm_readq(void __iomem *reg, unsigned long reg_phys, ++ unsigned long offset) ++{ ++ if (likely(this_cpu_read(pkvm_enabled))) ++ return (u64)kvm_hypercall3(PKVM_HC_MMIO_ACCESS, true, ++ sizeof(u64), reg_phys + offset); ++ else ++ return readq(reg + offset); ++} ++ ++static inline u32 pkvm_readl(void __iomem *reg, unsigned long reg_phys, ++ unsigned long offset) ++{ ++ if (likely(this_cpu_read(pkvm_enabled))) ++ return (u32)kvm_hypercall3(PKVM_HC_MMIO_ACCESS, true, ++ sizeof(u32), reg_phys + offset); ++ else ++ return readl(reg + offset); ++} ++ ++static inline void pkvm_writeq(void __iomem *reg, unsigned long reg_phys, ++ unsigned long offset, u64 val) ++{ ++ if (likely(this_cpu_read(pkvm_enabled))) ++ kvm_hypercall4(PKVM_HC_MMIO_ACCESS, false, sizeof(u64), ++ reg_phys + offset, val); ++ else ++ writeq(val, reg + offset); ++} ++ ++static inline void pkvm_writel(void __iomem *reg, unsigned long reg_phys, ++ unsigned long offset, u32 val) ++{ ++ if (likely(this_cpu_read(pkvm_enabled))) ++ kvm_hypercall4(PKVM_HC_MMIO_ACCESS, false, sizeof(u32), ++ reg_phys + offset, (u64)val); ++ else ++ writel(val, reg + offset); ++} ++ ++static inline void pkvm_update_iommu_virtual_caps(u64 *cap, u64 *ecap) ++{ ++ if (cap) ++ /* ++ * Set caching mode as linux OS will run in a VM ++ * controlling a virtual IOMMU device emulated ++ * by pkvm. 
++ */ ++ *cap |= 1 << 7; ++ ++ if (ecap) { ++ u64 tmp; ++ ++ /* ++ * Some IOMMU capabilities cannot be directly used by the linux ++ * IOMMU driver after the linux is deprivileged, which is because after ++ * deprivileging, pkvm IOMMU driver will control the physical IOMMU and ++ * it is designed to use physical IOMMU in two ways for better performance ++ * and simpler implementation: ++ * 1. using nested translation with the first-level from the deprivileged ++ * linux IOMMU driver and EPT as second-level. ++ * 2. using second-level only translation with EPT. ++ * The linux IOMMU driver then uses a virtual IOMMU device emulated by ++ * pkvm IOMMU driver. ++ * ++ * Way#1 and way#2 can only support the linux IOMMU driver working in ++ * first-level translation mode or HW pass-through mode. To guarantee ++ * this, let linux IOMMU driver pick up the supported capabilities ++ * when running at the bare metal if pkvm is enabled, to make it a ++ * pkvm-awared IOMMU kernel driver. ++ * ++ * So disable SLTS and Nest. ++ */ ++ *ecap &= ~((1UL << 46) | (1UL << 26)); ++ ++ /* limit PASID to reduce the memory consumptions */ ++ tmp = min_t(u64, (PKVM_MAX_PASID_BITS - 1), ++ (*ecap & GENMASK_ULL(39, 35)) >> 35); ++ *ecap = (*ecap & ~GENMASK_ULL(39, 35)) | (tmp << 35); ++ ++ /* ++ * Disable Device TLB capability for security. ++ * ++ * ATS is only enabled for trusted devices by the host OS. ++ * However with pkvm, the host OS including the device driver ++ * is treated as untrusted software. A malicious software in ++ * host OS may enable ATS for untrusted devices so that one ++ * untrusted device can still exploit the ATS weakness to bypass ++ * VT-d's translation protection and access the isolated memory. ++ * ++ * To resolve this, tell the host IOMMU driver not to enable ++ * any device's ATS as pkvm controls IOMMU not to enable the ++ * device TLB. 
++ */ ++ *ecap &= ~(1UL << 2); ++ } ++} ++#endif ++ ++#ifdef CONFIG_PKVM_GUEST ++ ++void pkvm_guest_init_coco(void); ++bool pkvm_is_protected_guest(void); ++int pkvm_set_mem_host_visibility(unsigned long addr, int numpages, bool enc); ++ ++u64 __pkvm_module_call(u64 fn, struct tdx_module_output *out); ++ ++#else ++ ++static inline void pkvm_guest_init_coco(void) { } ++static inline bool pkvm_is_protected_guest(void) { return false; } ++static inline int ++pkvm_set_mem_host_visibility(unsigned long addr, int numpages, bool enc) { return 0; } ++ ++#endif ++ ++#endif +diff --git a/arch/x86/include/asm/pkvm_image.h b/arch/x86/include/asm/pkvm_image.h +new file mode 100644 +index 000000000000..5ae6a53177eb +--- /dev/null ++++ b/arch/x86/include/asm/pkvm_image.h +@@ -0,0 +1,48 @@ ++/* SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#ifndef __X86_INTEL_PKVM_IMAGE_H ++#define __X86_INTEL_PKVM_IMAGE_H ++ ++#if defined(CONFIG_PKVM_INTEL_DEBUG) || defined(__PKVM_HYP__) ++/* No prefix will be added */ ++#define PKVM_DECLARE(type, f) type f ++#define pkvm_sym(sym) sym ++#else ++/* prefix is added by Makefile */ ++#define PKVM_DECLARE(type, f) type __pkvm_##f ++#define pkvm_sym(sym) __pkvm_##sym ++#endif ++ ++#define __PKVM_CONCAT(a, b) a ## b ++#define PKVM_CONCAT(a, b) __PKVM_CONCAT(a, b) ++ ++#ifdef LINKER_SCRIPT ++ ++#define PKVM_SECTION_NAME(NAME) .pkvm##NAME ++ ++#define PKVM_SECTION_SYMBOL_NAME(NAME) \ ++ PKVM_CONCAT(__pkvm_section_, PKVM_SECTION_NAME(NAME)) ++ ++#define BEGIN_PKVM_SECTION(NAME) \ ++ PKVM_SECTION_NAME(NAME) : { \ ++ PKVM_SECTION_SYMBOL_NAME(NAME) = .; ++ ++#define END_PKVM_SECTION \ ++ } ++ ++#define PKVM_SECTION(NAME) \ ++ BEGIN_PKVM_SECTION(NAME) \ ++ *(NAME NAME##.*) \ ++ END_PKVM_SECTION ++ ++/* ++ * Defines a linker script alias of a kernel-proper symbol referenced by ++ * PKVM code. ++ */ ++#define PKVM_ALIAS(sym) pkvm_sym(sym) = sym; ++ ++#endif /* LINKER_SCRIPT */ ++ ++#endif /* __X86_INTEL_PKVM_IMAGE_H */ +diff --git a/arch/x86/include/asm/pkvm_image_vars.h b/arch/x86/include/asm/pkvm_image_vars.h +new file mode 100644 +index 000000000000..94d8d6910299 +--- /dev/null ++++ b/arch/x86/include/asm/pkvm_image_vars.h +@@ -0,0 +1,23 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef __ASM_x86_PKVM_IMAGE_VARS_H ++#define __ASM_x86_PKVM_IMAGE_VARS_H ++ ++#ifndef CONFIG_PKVM_INTEL_DEBUG ++ ++#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK ++PKVM_ALIAS(physical_mask); ++#endif ++ ++#ifdef CONFIG_AMD_MEM_ENCRYPT ++PKVM_ALIAS(sme_me_mask); ++#endif ++ ++PKVM_ALIAS(__default_kernel_pte_mask); ++PKVM_ALIAS(vmcs12_field_offsets); ++PKVM_ALIAS(nr_vmcs12_fields); ++#endif ++ ++#endif +diff --git a/arch/x86/include/asm/pkvm_spinlock.h b/arch/x86/include/asm/pkvm_spinlock.h +new file mode 100644 +index 000000000000..e524116fe15d +--- /dev/null ++++ b/arch/x86/include/asm/pkvm_spinlock.h +@@ -0,0 +1,62 @@ ++/* ++ * SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 ++ * Copyright (C) 2018-2022 Intel Corporation ++ * ++ * pkvm runs in a self-contained environment ++ * and requires a self-contained spinlock implementation ++ * which doesn't rely on any other external symbols. 
++ * ++ * This is arch specific implementation ++ * */ ++#ifndef _ASM_X86_PKVM_SPINLOCK_H ++#define _ASM_X86_PKVM_SPINLOCK_H ++ ++#include ++ ++typedef struct arch_pkvm_spinlock { ++ union { ++ u64 head_tail; ++ struct { ++ u32 head; ++ u32 tail; ++ }; ++ }; ++} arch_pkvm_spinlock_t; ++ ++#define __ARCH_PKVM_SPINLOCK_UNLOCKED { { 0 } } ++ ++static inline void arch_pkvm_spin_lock(arch_pkvm_spinlock_t *lock) ++{ ++ /* The lock function atomically increments and exchanges the head ++ * counter of the queue. If the old head of the queue is equal to the ++ * tail, we have locked the spinlock. Otherwise we have to wait. ++ */ ++ ++ asm volatile (" movl $0x1,%%eax\n" ++ " lock xaddl %%eax,%[head]\n" ++ " cmpl %%eax,%[tail]\n" ++ " jz 1f\n" ++ "2: pause\n" ++ " cmpl %%eax,%[tail]\n" ++ " jnz 2b\n" ++ "1:\n" ++ : ++ : ++ [head] "m"(lock->head), ++ [tail] "m"(lock->tail) ++ : "cc", "memory", "eax"); ++} ++ ++static inline void arch_pkvm_spin_unlock(arch_pkvm_spinlock_t *lock) ++{ ++ /* Increment tail of queue */ ++ asm volatile (" lock incl %[tail]\n" ++ : ++ : [tail] "m" (lock->tail) ++ : "cc", "memory"); ++ ++} ++ ++static inline void arch_pkvm_assert_lock_held(arch_pkvm_spinlock_t *lock) { } ++ ++#endif +diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h +index 020c81a7c729..e44b97f69ccd 100644 +--- a/arch/x86/include/asm/tdx.h ++++ b/arch/x86/include/asm/tdx.h +@@ -7,6 +7,7 @@ + #include + #include + #include ++#include + + /* + * SW-defined error codes. +@@ -35,22 +36,6 @@ struct tdx_module_output { + u64 r11; + }; + +-/* +- * Used by the #VE exception handler to gather the #VE exception +- * info from the TDX module. This is a software only structure +- * and not part of the TDX module/VMM ABI. +- */ +-struct ve_info { +- u64 exit_reason; +- u64 exit_qual; +- /* Guest Linear (virtual) Address */ +- u64 gla; +- /* Guest Physical Address */ +- u64 gpa; +- u32 instr_len; +- u32 instr_info; +-}; +- + #ifdef CONFIG_INTEL_TDX_GUEST + + void __init tdx_early_init(void); +@@ -61,8 +46,6 @@ u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, + + void tdx_get_ve_info(struct ve_info *ve); + +-bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); +- + void tdx_safe_halt(void); + + bool tdx_early_handle_ve(struct pt_regs *regs); +diff --git a/arch/x86/include/asm/virt_exception.h b/arch/x86/include/asm/virt_exception.h +new file mode 100644 +index 000000000000..ec75523624d7 +--- /dev/null ++++ b/arch/x86/include/asm/virt_exception.h +@@ -0,0 +1,41 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _ASM_X86_VIRT_EXCEPTION_H ++#define _ASM_X86_VIRT_EXCEPTION_H ++ ++#include ++ ++#ifndef __ASSEMBLY__ ++ ++/* ++ * Used by the #VE exception handler to gather the #VE exception ++ * info from the TDX module. This is a software only structure ++ * and not part of the TDX module/VMM ABI. 
++ */ ++struct ve_info { ++ u64 exit_reason; ++ u64 exit_qual; ++ /* Guest Linear (virtual) Address */ ++ u64 gla; ++ /* Guest Physical Address */ ++ u64 gpa; ++ u32 instr_len; ++ u32 instr_info; ++}; ++ ++int ve_handle_mmio(struct pt_regs *regs, struct ve_info *ve); ++ ++void get_ve_info(struct ve_info *ve); ++ ++bool handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); ++ ++struct ve_x86_ops { ++ bool (*mmio_read)(int size, unsigned long addr, unsigned long *val); ++ bool (*mmio_write)(int size, unsigned long addr, unsigned long val); ++ bool (*handle_virt_exception)(struct pt_regs *regs, struct ve_info *ve); ++ void (*get_ve_info)(struct ve_info *ve); ++}; ++ ++extern struct ve_x86_ops ve_x86_ops; ++ ++#endif /* !__ASSEMBLY__ */ ++#endif /* _ASM_X86_VIRT_EXCEPTION_H */ +diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h +index 498dc600bd5c..0af92c414c19 100644 +--- a/arch/x86/include/asm/vmx.h ++++ b/arch/x86/include/asm/vmx.h +@@ -68,6 +68,7 @@ + #define SECONDARY_EXEC_ENCLS_EXITING VMCS_CONTROL_BIT(ENCLS_EXITING) + #define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING) + #define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING) ++#define SECONDARY_EXEC_EPT_VIOLATION_VE VMCS_CONTROL_BIT(EPT_VIOLATION_VE) + #define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX) + #define SECONDARY_EXEC_XSAVES VMCS_CONTROL_BIT(XSAVES) + #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC) +@@ -223,6 +224,8 @@ enum vmcs_field { + VMREAD_BITMAP_HIGH = 0x00002027, + VMWRITE_BITMAP = 0x00002028, + VMWRITE_BITMAP_HIGH = 0x00002029, ++ VE_INFO_ADDR = 0x0000202A, ++ VE_INFO_ADDR_HIGH = 0x0000202B, + XSS_EXIT_BITMAP = 0x0000202C, + XSS_EXIT_BITMAP_HIGH = 0x0000202D, + ENCLS_EXITING_BITMAP = 0x0000202E, +@@ -322,6 +325,10 @@ enum vmcs_field { + CR3_TARGET_VALUE2 = 0x0000600c, + CR3_TARGET_VALUE3 = 0x0000600e, + EXIT_QUALIFICATION = 0x00006400, ++ EXIT_IO_RCX = 0x00006402, ++ EXIT_IO_RSI = 0x00006404, ++ EXIT_IO_RDI = 0x00006406, ++ EXIT_IO_RIP = 0x00006408, + GUEST_LINEAR_ADDRESS = 0x0000640a, + GUEST_CR0 = 0x00006800, + GUEST_CR3 = 0x00006802, +diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h +index 46de10a809ec..f513dc0ae610 100644 +--- a/arch/x86/include/uapi/asm/kvm.h ++++ b/arch/x86/include/uapi/asm/kvm.h +@@ -532,4 +532,7 @@ struct kvm_pmu_event_filter { + #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ + #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ + ++#define KVM_X86_DEFAULT_VM 0 ++#define KVM_X86_PROTECTED_VM 1 ++ + #endif /* _ASM_X86_KVM_H */ +diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile +index f10a921ee756..9aeff7157d86 100644 +--- a/arch/x86/kernel/cpu/Makefile ++++ b/arch/x86/kernel/cpu/Makefile +@@ -56,6 +56,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o + + obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o + obj-$(CONFIG_ACRN_GUEST) += acrn.o ++obj-$(CONFIG_PKVM_GUEST) += pkvm.o + + ifdef CONFIG_X86_FEATURE_NAMES + quiet_cmd_mkcapflags = MKCAP $@ +diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c +index 553bfbfc3a1b..e658f7c7c950 100644 +--- a/arch/x86/kernel/cpu/hypervisor.c ++++ b/arch/x86/kernel/cpu/hypervisor.c +@@ -45,6 +45,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = + #ifdef CONFIG_ACRN_GUEST + &x86_hyper_acrn, + #endif ++#ifdef CONFIG_PKVM_GUEST ++ &x86_hyper_pkvm, ++#endif + }; + + enum 
x86_hypervisor_type x86_hyper_type; +diff --git a/arch/x86/kernel/cpu/pkvm.c b/arch/x86/kernel/cpu/pkvm.c +new file mode 100644 +index 000000000000..e68ae5f3e263 +--- /dev/null ++++ b/arch/x86/kernel/cpu/pkvm.c +@@ -0,0 +1,33 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * PKVM detection support. ++ */ ++ ++#include ++#include ++ ++static u32 __init pkvm_detect(void) ++{ ++ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) ++ return hypervisor_cpuid_base("PKVMPKVMPKVM", 0); ++ ++ return 0; ++} ++ ++static void __init pkvm_init_platform(void) ++{ ++ pkvm_guest_init_coco(); ++} ++ ++static bool pkvm_x2apic_available(void) ++{ ++ return boot_cpu_has(X86_FEATURE_X2APIC); ++} ++ ++const __initconst struct hypervisor_x86 x86_hyper_pkvm = { ++ .name = "PKVM", ++ .detect = pkvm_detect, ++ .type = X86_HYPER_PKVM, ++ .init.init_platform = pkvm_init_platform, ++ .init.x2apic_available = pkvm_x2apic_available, ++}; +diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c +index a58c6bc1cd68..bd113065f802 100644 +--- a/arch/x86/kernel/idt.c ++++ b/arch/x86/kernel/idt.c +@@ -69,7 +69,7 @@ static const __initconst struct idt_data early_idts[] = { + */ + INTG(X86_TRAP_PF, asm_exc_page_fault), + #endif +-#ifdef CONFIG_INTEL_TDX_GUEST ++#if defined(CONFIG_INTEL_TDX_GUEST) || defined(CONFIG_PKVM_GUEST) + INTG(X86_TRAP_VE, asm_exc_virtualization_exception), + #endif + }; +diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c +index 804a252382da..6af7e3d985b8 100644 +--- a/arch/x86/kernel/setup.c ++++ b/arch/x86/kernel/setup.c +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1327,6 +1328,8 @@ void __init setup_arch(char **cmdline_p) + #endif + + unwind_init(); ++ ++ pkvm_reserve(); + } + + #ifdef CONFIG_X86_32 +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index c0a5a4f225d9..f026ffb62de6 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -63,6 +63,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -1354,7 +1355,7 @@ DEFINE_IDTENTRY(exc_device_not_available) + } + } + +-#ifdef CONFIG_INTEL_TDX_GUEST ++#if defined(CONFIG_INTEL_TDX_GUEST) || defined(CONFIG_PKVM_GUEST) + + #define VE_FAULT_STR "VE fault" + +@@ -1426,15 +1427,15 @@ DEFINE_IDTENTRY(exc_virtualization_exception) + * till TDGETVEINFO TDCALL is executed. This ensures that VE + * info cannot be overwritten by a nested #VE. + */ +- tdx_get_ve_info(&ve); ++ get_ve_info(&ve); + + cond_local_irq_enable(regs); + + /* +- * If tdx_handle_virt_exception() could not process ++ * If handle_virt_exception() could not process + * it successfully, treat it as #GP(0) and handle it. + */ +- if (!tdx_handle_virt_exception(regs, &ve)) ++ if (!handle_virt_exception(regs, &ve)) + ve_raise_fault(regs, 0, ve.gla); + + cond_local_irq_disable(regs); +diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S +index 78ccb5ec3c0e..c24394c7c245 100644 +--- a/arch/x86/kernel/vmlinux.lds.S ++++ b/arch/x86/kernel/vmlinux.lds.S +@@ -111,6 +111,35 @@ PHDRS { + note PT_NOTE FLAGS(0); /* ___ */ + } + ++#ifdef CONFIG_PKVM_INTEL ++#include ++ ++#define PKVM_TEXT \ ++ . = ALIGN(PAGE_SIZE); \ ++ __pkvm_text_start = .; \ ++ *(PKVM_SECTION_NAME(.text)) \ ++ . = ALIGN(PAGE_SIZE); \ ++ __pkvm_text_end = .; ++ ++#define PKVM_BSS \ ++ . = ALIGN(PAGE_SIZE); \ ++ __pkvm_bss_start = .; \ ++ *(PKVM_SECTION_NAME(.bss)) \ ++ . = ALIGN(PAGE_SIZE); \ ++ __pkvm_bss_end = .; ++ ++#define PKVM_DATA \ ++ . 
= ALIGN(PAGE_SIZE); \ ++ __pkvm_data_start = .; \ ++ *(PKVM_SECTION_NAME(.data)) \ ++ . = ALIGN(PAGE_SIZE); \ ++ __pkvm_data_end = .; ++#else ++#define PKVM_TEXT ++#define PKVM_BSS ++#define PKVM_DATA ++#endif ++ + SECTIONS + { + #ifdef CONFIG_X86_32 +@@ -150,6 +179,7 @@ SECTIONS + ALIGN_ENTRY_TEXT_END + SOFTIRQENTRY_TEXT + STATIC_CALL_TEXT ++ PKVM_TEXT + *(.gnu.warning) + + #ifdef CONFIG_RETPOLINE +@@ -166,6 +196,7 @@ SECTIONS + . = ALIGN(PAGE_SIZE); + + X86_ALIGN_RODATA_BEGIN ++ PKVM_RODATA + RO_DATA(PAGE_SIZE) + X86_ALIGN_RODATA_END + +@@ -181,6 +212,7 @@ SECTIONS + /* 32 bit has nosave before _edata */ + NOSAVE_DATA + #endif ++ PKVM_DATA + + PAGE_ALIGNED_DATA(PAGE_SIZE) + +@@ -394,6 +426,7 @@ SECTIONS + . = ALIGN(PAGE_SIZE); + *(BSS_MAIN) + BSS_DECRYPTED ++ PKVM_BSS + . = ALIGN(PAGE_SIZE); + __bss_stop = .; + } +@@ -507,6 +540,10 @@ INIT_PER_CPU(irq_stack_backing_store); + "fixed_percpu_data is not at start of per-cpu area"); + #endif + ++#ifdef CONFIG_PKVM_INTEL ++#include ++#endif ++ + #ifdef CONFIG_RETHUNK + . = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned"); + . = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned"); +diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig +index 67be7f217e37..c025486d728f 100644 +--- a/arch/x86/kvm/Kconfig ++++ b/arch/x86/kvm/Kconfig +@@ -87,6 +87,30 @@ config KVM_INTEL + To compile this as a module, choose M here: the module + will be called kvm-intel. + ++config PKVM_INTEL ++ bool "pKVM for Intel processors support" ++ depends on KVM_INTEL=y ++ depends on X86_64 ++ depends on !KSM ++ select INTEL_IOMMU_DEFAULT_ON ++ select INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON ++ help ++ Provides support for pKVM on Intel processors. ++ ++ This will deprivilege the host as a VM running in non-root VMX ++ operation mode, and pKVM hypervisor will run in root VMX ++ operation mode. ++ ++ If unsure, say N. ++ ++config PKVM_INTEL_DEBUG ++ bool "Debug pKVM" ++ depends on PKVM_INTEL ++ help ++ Provides debug support for pKVM. ++ ++ If unsure, say N. 
++ + config X86_SGX_KVM + bool "Software Guard eXtensions (SGX) Virtualization" + depends on X86_SGX && KVM_INTEL +diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile +index f453a0f96e24..7f1ccc11c610 100644 +--- a/arch/x86/kvm/Makefile ++++ b/arch/x86/kvm/Makefile +@@ -33,6 +33,7 @@ endif + + obj-$(CONFIG_KVM) += kvm.o + obj-$(CONFIG_KVM_INTEL) += kvm-intel.o ++obj-$(CONFIG_PKVM_INTEL) += vmx/pkvm/ + obj-$(CONFIG_KVM_AMD) += kvm-amd.o + + AFLAGS_svm/vmenter.o := -iquote $(obj) +diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h +index 59804be91b5b..a8ef25726236 100644 +--- a/arch/x86/kvm/mmu.h ++++ b/arch/x86/kvm/mmu.h +@@ -8,6 +8,11 @@ + + extern bool __read_mostly enable_mmio_caching; + ++#define PT64_PT_BITS 9 ++#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) ++#define PT32_PT_BITS 10 ++#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) ++ + #define PT_WRITABLE_SHIFT 1 + #define PT_USER_SHIFT 2 + +@@ -36,6 +41,17 @@ extern bool __read_mostly enable_mmio_caching; + #define PT32_ROOT_LEVEL 2 + #define PT32E_ROOT_LEVEL 3 + ++#define PT64_LEVEL_BITS 9 ++#define PT64_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, PT64_LEVEL_BITS) ++#define PT_LEVEL_INDEX(addr, level) \ ++ (((addr) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) ++ ++#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK ++#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) ++#else ++#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) ++#endif ++ + #define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \ + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE) + +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index d30325e297a0..2c89e6f2c457 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -255,18 +255,6 @@ static inline bool kvm_available_flush_tlb_with_range(void) + return kvm_x86_ops.tlb_remote_flush_with_range; + } + +-static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, +- struct kvm_tlb_range *range) +-{ +- int ret = -ENOTSUPP; +- +- if (range && kvm_x86_ops.tlb_remote_flush_with_range) +- ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range); +- +- if (ret) +- kvm_flush_remote_tlbs(kvm); +-} +- + void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, + u64 start_gfn, u64 pages) + { +@@ -275,7 +263,8 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, + range.start_gfn = start_gfn; + range.pages = pages; + +- kvm_flush_remote_tlbs_with_range(kvm, &range); ++ if (kvm_flush_remote_tlbs_with_range(kvm, &range)) ++ kvm_flush_remote_tlbs(kvm); + } + + static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, +@@ -1166,7 +1155,8 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) + drop_spte(kvm, sptep); + + if (flush) +- kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, ++ kvm_flush_remote_tlbs_with_address(kvm, ++ kvm_mmu_page_get_gfn(sp, sptep - sp->spt), + KVM_PAGES_PER_HPAGE(sp->role.level)); + } + +@@ -1619,7 +1609,7 @@ static void __rmap_add(struct kvm *kvm, + if (rmap_count > RMAP_RECYCLE_THRESHOLD) { + kvm_zap_all_rmap_sptes(kvm, rmap_head); + kvm_flush_remote_tlbs_with_address( +- kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); ++ kvm, gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + } + } + +@@ -2950,6 +2940,9 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) + if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) + return; + ++ if (vcpu->kvm->arch.vm_type == KVM_X86_PROTECTED_VM) ++ return; ++ + __direct_pte_prefetch(vcpu, sp, sptep); + } + +@@ -4243,13 +4236,19 @@ 
static bool is_page_fault_stale(struct kvm_vcpu *vcpu, + static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) + { + bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); +- ++ struct kvm_pinned_page *ppage = NULL; + unsigned long mmu_seq; + int r; + + fault->gfn = fault->addr >> PAGE_SHIFT; + fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); + ++ if (vcpu->kvm->arch.vm_type == KVM_X86_PROTECTED_VM) { ++ ppage = kmalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT); ++ if (!ppage) ++ return -ENOMEM; ++ } ++ + if (page_fault_handle_page_track(vcpu, fault)) + return RET_PF_EMULATE; + +@@ -4291,6 +4290,19 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault + r = __direct_map(vcpu, fault); + } + ++ if (ppage) { ++ struct page *page = kvm_pfn_to_refcounted_page(fault->pfn); ++ if (r == RET_PF_FIXED && page) { ++ ppage->page = pfn_to_page(fault->pfn); ++ get_page(ppage->page); ++ spin_lock(&vcpu->kvm->pkvm.pinned_page_lock); ++ list_add(&ppage->list, &vcpu->kvm->pkvm.pinned_pages); ++ spin_unlock(&vcpu->kvm->pkvm.pinned_page_lock); ++ } else { ++ kfree(ppage); ++ } ++ } ++ + out_unlock: + if (is_tdp_mmu_fault) + read_unlock(&vcpu->kvm->mmu_lock); +@@ -6428,7 +6440,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, + kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); + + if (kvm_available_flush_tlb_with_range()) +- kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, ++ kvm_flush_remote_tlbs_with_address(kvm, ++ kvm_mmu_page_get_gfn(sp, sptep - sp->spt), + KVM_PAGES_PER_HPAGE(sp->role.level)); + else + need_tlb_flush = 1; +diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h +index 1f4f5e703f13..ff34fe5e915d 100644 +--- a/arch/x86/kvm/mmu/paging_tmpl.h ++++ b/arch/x86/kvm/mmu/paging_tmpl.h +@@ -938,7 +938,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) + mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); + if (is_shadow_present_pte(old_spte)) + kvm_flush_remote_tlbs_with_address(vcpu->kvm, +- sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); ++ kvm_mmu_page_get_gfn(sp, sptep - sp->spt), ++ KVM_PAGES_PER_HPAGE(sp->role.level)); + + if (!rmap_can_add(vcpu)) + break; +diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h +index 7670c13ce251..c6d8508594a0 100644 +--- a/arch/x86/kvm/mmu/spte.h ++++ b/arch/x86/kvm/mmu/spte.h +@@ -55,6 +55,7 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); + #define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS) + #define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS) + #define SPTE_ENT_PER_PAGE __PT_ENT_PER_PAGE(SPTE_LEVEL_BITS) ++#define SHADOW_PT_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS) + + /* + * The mask/shift to use for saving the original R/X bits when marking the PTE +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index c3b0f973375b..15639983c03e 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -9,6 +9,7 @@ + + #include + #include ++#include + + static bool __read_mostly tdp_mmu_enabled = true; + module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); +@@ -1037,8 +1038,12 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, + bool wrprot = false; + + WARN_ON(sp->role.level != fault->goal_level); +- if (unlikely(!fault->slot)) ++ if (unlikely(!fault->slot)) { + new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); ++ ++ if (pkvm_set_mmio_ve(vcpu, iter->gfn)) ++ return RET_PF_RETRY; ++ } + else + wrprot = 
make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, + fault->pfn, iter->old_spte, fault->prefetch, true, +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index c871a6d6364c..13bec526a693 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4727,6 +4727,11 @@ static void svm_vm_destroy(struct kvm *kvm) + sev_vm_destroy(kvm); + } + ++static bool svm_is_vm_type_supported(unsigned long type) ++{ ++ return type == KVM_X86_DEFAULT_VM; ++} ++ + static int svm_vm_init(struct kvm *kvm) + { + if (!pause_filter_count || !pause_filter_thresh) +@@ -4753,6 +4758,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { + .vcpu_free = svm_vcpu_free, + .vcpu_reset = svm_vcpu_reset, + ++ .is_vm_type_supported = svm_is_vm_type_supported, + .vm_size = sizeof(struct kvm_svm), + .vm_init = svm_vm_init, + .vm_destroy = svm_vm_destroy, +diff --git a/arch/x86/kvm/vmx/pkvm/.gitignore b/arch/x86/kvm/vmx/pkvm/.gitignore +new file mode 100644 +index 000000000000..3ac372c4eca7 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/.gitignore +@@ -0,0 +1 @@ ++pkvm.lds +diff --git a/arch/x86/kvm/vmx/pkvm/Makefile b/arch/x86/kvm/vmx/pkvm/Makefile +new file mode 100644 +index 000000000000..6ca49fffb4dd +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/Makefile +@@ -0,0 +1,29 @@ ++# SPDX-License-Identifier: GPL-2.0 ++ ++KVM_PKVM ?= ../../../../../virt/kvm/pkvm ++ccflags-y += -I $(srctree)/arch/x86/kvm ++ccflags-y += -I $(srctree)/arch/x86/kvm/vmx/pkvm/include ++ ++pkvm-obj := pkvm_host.o pkvm_debugfs.o ++ ++pkvm-obj += $(KVM_PKVM)/pkvm.o ++ ++obj-$(CONFIG_PKVM_INTEL) += $(pkvm-obj) ++obj-$(CONFIG_PKVM_INTEL) += hyp/ ++ ++always-y := pkvm_constants.h pkvm-constants.s ++ ++define rule_gen_hyp_constants ++ $(call filechk,offsets,__PKVM_CONSTANTS_H__) ++endef ++ ++CFLAGS_pkvm-constants.o = -I $(src)/include ++CFLAGS_pkvm-constants.o += -I $(srctree)/virt/kvm/pkvm ++$(obj)/pkvm-constants.s: $(src)/pkvm_constants.c FORCE ++ $(call if_changed_dep,cc_s_c) ++ ++$(obj)/pkvm_constants.h: $(obj)/pkvm-constants.s FORCE ++ $(call if_changed_rule,gen_hyp_constants) ++ ++obj-intel-pkvm := $(addprefix $(obj)/, $(pkvm-obj)) ++$(obj-intel-pkvm): $(obj)/pkvm_constants.h +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/Makefile b/arch/x86/kvm/vmx/pkvm/hyp/Makefile +new file mode 100644 +index 000000000000..682e34a7901f +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/Makefile +@@ -0,0 +1,79 @@ ++# SPDX-License-Identifier: GPL-2.0 ++ ++KVM_PKVM = virt/kvm/pkvm ++ ++ccflags-y += -I $(srctree)/$(KVM_PKVM)/ ++ccflags-y += -I $(srctree)/arch/x86/kvm ++ccflags-y += -I $(srctree)/arch/x86/kvm/vmx/pkvm/include ++ccflags-y += -fno-stack-protector ++ccflags-y += -D__DISABLE_EXPORTS ++ccflags-y += -D__PKVM_HYP__ ++ ++pkvm-hyp-obj := $(obj)/vmx_asm.o $(obj)/vmexit.o \ ++ $(obj)/memory.o $(obj)/early_alloc.o \ ++ $(obj)/pgtable.o $(obj)/mmu.o \ ++ $(obj)/ept.o $(obj)/pkvm.o \ ++ $(obj)/idt.o $(obj)/irq.o \ ++ $(obj)/init_finalise.o $(obj)/nested.o \ ++ $(obj)/vmx.o $(obj)/vmsr.o \ ++ $(obj)/iommu.o $(obj)/iommu_debug.o \ ++ $(obj)/mem_protect.o $(obj)/lapic.o \ ++ $(obj)/ptdev.o $(obj)/iommu_spgt.o \ ++ $(obj)/io_emulate.o $(obj)/pci.o \ ++ $(obj)/trace.o ++ ++virt-dir := $(objtree)/$(KVM_PKVM) ++pkvm-hyp-obj += $(virt-dir)/page_alloc.o ++ ++ifndef CONFIG_PKVM_INTEL_DEBUG ++lib-dir := $(obj)/lib ++lib2-dir := $(objtree)/lib ++pkvm-hyp-obj += $(lib-dir)/memset_64.o ++pkvm-hyp-obj += $(lib-dir)/memcpy_64.o ++pkvm-hyp-obj += $(lib2-dir)/find_bit.o ++ifdef CONFIG_DEBUG_LIST ++pkvm-hyp-obj += $(lib-dir)/list_debug.o ++endif ++ifdef CONFIG_RETPOLINE 
++pkvm-hyp-obj += $(lib-dir)/retpoline.o ++endif ++endif ++ ++dir-obj := $(lib-dir) $(lib2-dir) $(virt-dir) ++ ++pkvm-rename-obj := $(patsubst %.o,%.pkvm.o,$(pkvm-hyp-obj)) ++pkvm-obj := pkvm.o ++ ++$(dir-obj): FORCE ++ifndef CONFIG_PKVM_INTEL_DEBUG ++ $(Q)mkdir -p $(lib-dir) $(lib2-dir) ++endif ++ $(Q)mkdir -p $(virt-dir) ++ ++%.pkvm.o: %.c $(dir-obj) FORCE ++ $(call if_changed_rule,cc_o_c) ++%.pkvm.o: %.S $(dir-obj) FORCE ++ $(call if_changed_rule,as_o_S) ++ ++$(obj)/pkvm.lds: $(src)/pkvm.lds.S FORCE ++ $(call if_changed_dep,cpp_lds_S) ++ ++LDFLAGS_pkvm.tmp.o := -r -T ++$(obj)/pkvm.tmp.o: $(obj)/pkvm.lds $(pkvm-rename-obj) FORCE ++ $(call if_changed,ld) ++ ++$(obj)/pkvm.o: $(obj)/pkvm.tmp.o FORCE ++ $(call if_changed,pkvmcopy) ++ ++quiet_cmd_pkvmcopy = PKVMPCOPY $@ ++ifdef CONFIG_PKVM_INTEL_DEBUG ++ cmd_pkvmcopy = $(OBJCOPY) --prefix-symbols= $< $@ ++else ++ cmd_pkvmcopy = $(OBJCOPY) --prefix-symbols=__pkvm_ --remove-section=.retpoline_sites --remove-section=.return_sites $< $@ ++endif ++ ++obj-$(CONFIG_PKVM_INTEL) += $(pkvm-obj) ++ ++# Remove ftrace, Shadow Call Stack, and CFI CFLAGS. ++# This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations. ++KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS)) +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/bug.h b/arch/x86/kvm/vmx/pkvm/hyp/bug.h +new file mode 100644 +index 000000000000..019c5f2755fe +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/bug.h +@@ -0,0 +1,23 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef __PKVM_BUG_H ++#define __PKVM_BUG_H ++ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++#include ++ ++#define PKVM_ASSERT(c) \ ++do { \ ++ if (!(c)) { \ ++ pr_err("assertion failed %s: %d: %s\n", \ ++ __FILE__, __LINE__, #c); \ ++ BUG(); \ ++ } \ ++} while (0) ++#else ++#define PKVM_ASSERT(c) do { } while (!(c)) ++#endif ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/cpu.h b/arch/x86/kvm/vmx/pkvm/hyp/cpu.h +new file mode 100644 +index 000000000000..cd3d60034890 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/cpu.h +@@ -0,0 +1,53 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_CPU_H_ ++#define _PKVM_CPU_H_ ++ ++static inline u64 pkvm_msr_read(u32 reg) ++{ ++ u32 msrl, msrh; ++ ++ asm volatile (" rdmsr ":"=a"(msrl), "=d"(msrh) : "c" (reg)); ++ return (((u64)msrh << 32U) | msrl); ++} ++ ++#define pkvm_rdmsr(msr, low, high) \ ++do { \ ++ u64 __val = pkvm_msr_read(msr); \ ++ (void)((low) = (u32)__val); \ ++ (void)((high) = (u32)(__val >> 32)); \ ++} while (0) ++ ++#define pkvm_rdmsrl(msr, val) \ ++ ((val) = pkvm_msr_read((msr))) ++ ++static inline void pkvm_msr_write(u32 reg, u64 msr_val) ++{ ++ asm volatile (" wrmsr " : : "c" (reg), "a" ((u32)msr_val), "d" ((u32)(msr_val >> 32U))); ++} ++ ++#define pkvm_wrmsr(msr, low, high) \ ++do { \ ++ u64 __val = (u64)(high) << 32 | (u64)(low); \ ++ pkvm_msr_write(msr, __val); \ ++} while (0) ++ ++#define pkvm_wrmsrl(msr, val) pkvm_msr_write(msr, val) ++ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++#include ++static inline u64 get_pcpu_id(void) ++{ ++ return raw_smp_processor_id(); ++} ++#else ++/* this function shall only be used during pkvm runtime */ ++static inline u64 get_pcpu_id(void) ++{ ++ return pkvm_msr_read(MSR_GS_BASE); ++} ++#endif ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/debug.h b/arch/x86/kvm/vmx/pkvm/hyp/debug.h +new file mode 100644 +index 000000000000..29d9804cf580 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/debug.h 
+@@ -0,0 +1,20 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#ifndef _PKVM_DEBUG_H_ ++#define _PKVM_DEBUG_H_ ++ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++#include ++#define pkvm_dbg(f, x...) pr_debug(f, ## x) ++#define pkvm_info(f, x...) pr_info(f, ## x) ++#define pkvm_err(f, x...) pr_err(f, ## x) ++#else ++#define pkvm_dbg(x...) ++#define pkvm_info(x...) ++#define pkvm_err(x...) ++#endif ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/early_alloc.c b/arch/x86/kvm/vmx/pkvm/hyp/early_alloc.c +new file mode 100644 +index 000000000000..766ff87e989d +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/early_alloc.c +@@ -0,0 +1,76 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++ ++#include "pkvm_spinlock.h" ++#include "pgtable.h" ++ ++static unsigned long base; ++static unsigned long end; ++static unsigned long cur; ++ ++static pkvm_spinlock_t early_lock = { __ARCH_PKVM_SPINLOCK_UNLOCKED }; ++ ++struct pkvm_mm_ops pkvm_early_alloc_mm_ops; ++ ++unsigned long pkvm_early_alloc_nr_used_pages(void) ++{ ++ return (cur - base) >> PAGE_SHIFT; ++} ++ ++void *pkvm_early_alloc_contig(unsigned int nr_pages) ++{ ++ unsigned long size = (nr_pages << PAGE_SHIFT); ++ void *ret; ++ ++ if (!nr_pages) ++ return NULL; ++ ++ pkvm_spin_lock(&early_lock); ++ if (end - cur < size) { ++ pkvm_spin_unlock(&early_lock); ++ return NULL; ++ } ++ ret = (void *)cur; ++ cur += size; ++ pkvm_spin_unlock(&early_lock); ++ ++ memset(ret, 0, size); ++ ++ return ret; ++} ++ ++void *pkvm_early_alloc_page(void) ++{ ++ return pkvm_early_alloc_contig(1); ++} ++ ++static void pkvm_early_alloc_get_page(void *addr) { } ++static void pkvm_early_alloc_put_page(void *addr) { } ++static void pkvm_early_flush_tlb(struct pkvm_pgtable *pgt, ++ unsigned long addr, unsigned long size) ++{ ++} ++ ++static int pkvm_early_page_count(void *vaddr) ++{ ++ return 512; ++} ++ ++void pkvm_early_alloc_init(void *virt, unsigned long size) ++{ ++ base = cur = (unsigned long)virt; ++ end = base + size; ++ ++ pkvm_early_alloc_mm_ops.zalloc_page = pkvm_early_alloc_page; ++ pkvm_early_alloc_mm_ops.get_page = pkvm_early_alloc_get_page; ++ pkvm_early_alloc_mm_ops.put_page = pkvm_early_alloc_put_page; ++ pkvm_early_alloc_mm_ops.phys_to_virt = pkvm_phys_to_virt; ++ pkvm_early_alloc_mm_ops.virt_to_phys = pkvm_virt_to_phys; ++ pkvm_early_alloc_mm_ops.page_count = pkvm_early_page_count; ++ pkvm_early_alloc_mm_ops.flush_tlb = pkvm_early_flush_tlb; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/early_alloc.h b/arch/x86/kvm/vmx/pkvm/hyp/early_alloc.h +new file mode 100644 +index 000000000000..59bede62cd03 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/early_alloc.h +@@ -0,0 +1,15 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef __PKVM_EARLY_ALLOC_H ++#define __PKVM_EARLY_ALLOC_H ++ ++unsigned long pkvm_early_alloc_nr_used_pages(void); ++void *pkvm_early_alloc_contig(unsigned int nr_pages); ++void *pkvm_early_alloc_page(void); ++void pkvm_early_alloc_init(void *virt, unsigned long size); ++ ++extern struct pkvm_mm_ops pkvm_early_alloc_mm_ops; ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/ept.c b/arch/x86/kvm/vmx/pkvm/hyp/ept.c +new file mode 100644 +index 000000000000..4d89c8972115 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/ept.c +@@ -0,0 +1,1066 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++#include ++#include ++#include 
++ ++#include ++#include ++#include ++ ++#include "pkvm_hyp.h" ++#include "early_alloc.h" ++#include "pgtable.h" ++#include "ept.h" ++#include "pkvm_spinlock.h" ++#include "memory.h" ++#include "iommu.h" ++#include "vmx.h" ++#include "mem_protect.h" ++#include "debug.h" ++#include "ptdev.h" ++#include "io_emulate.h" ++ ++static struct pkvm_pool host_ept_pool; ++static struct pkvm_pgtable host_ept; ++static struct pkvm_pgtable host_ept_notlbflush; ++static pkvm_spinlock_t _host_ept_lock = __PKVM_SPINLOCK_UNLOCKED; ++ ++static struct pkvm_pool shadow_pgt_pool; ++static struct rsvd_bits_validate ept_zero_check; ++ ++static void flush_tlb_noop(struct pkvm_pgtable *pgt, ++ unsigned long addr, unsigned long size) ++{ ++} ++ ++static inline void pkvm_init_ept_page(void *page) ++{ ++ /* ++ * Normal VM: Never clear the "suppress #VE" bit, so #VE will never ++ * be triggered. ++ * ++ * Protected VM: pkvm sets EPT_VIOLATION_VE for Protected VM, "suppress ++ * #VE" bit must be set to get EPT violation, thus pkvm can build the ++ * EPT mapping for memory region, and clear "suppress #VE" for mmio ++ * region, thus mmio can trigger #VE. ++ * ++ * For simplicity, unconditionally initialize SEPT to set "suppress ++ * #VE". ++ */ ++ memset64((u64 *)page, EPT_PROT_DEF, 512); ++} ++ ++static void *ept_zalloc_page(struct pkvm_pool *pool) ++{ ++ void *page; ++ ++ page = pkvm_alloc_pages(pool, 0); ++ if (page) ++ pkvm_init_ept_page(page); ++ ++ return page; ++} ++ ++static void *host_ept_zalloc_page(void) ++{ ++ /* ++ * Also initiailize the host ept with SUPPRESS_VE bit set although this ++ * bit is ignored in host ept. Because host_ept and shadow_ept share the ++ * same ept_ops, this will make the ept_entry_mapped work for both ++ * host_ept and shadow_ept. ++ */ ++ return ept_zalloc_page(&host_ept_pool); ++} ++ ++static void host_ept_get_page(void *vaddr) ++{ ++ pkvm_get_page(&host_ept_pool, vaddr); ++} ++ ++static void host_ept_put_page(void *vaddr) ++{ ++ pkvm_put_page(&host_ept_pool, vaddr); ++} ++ ++static void host_ept_flush_cache(void *vaddr, unsigned int size) ++{ ++ if (!pkvm_hyp->iommu_coherent) ++ pkvm_clflush_cache_range(vaddr, size); ++} ++ ++static void host_ept_flush_tlb(struct pkvm_pgtable *pgt, ++ unsigned long vaddr, unsigned long size) ++{ ++ struct pkvm_host_vcpu *hvcpu; ++ int i; ++ ++ for (i = 0; i < pkvm_hyp->num_cpus; i++) { ++ hvcpu = pkvm_hyp->host_vm.host_vcpus[i]; ++ ++ kvm_make_request(PKVM_REQ_TLB_FLUSH_HOST_EPT, &hvcpu->vmx.vcpu); ++ pkvm_kick_vcpu(&hvcpu->vmx.vcpu); ++ } ++ ++ /* ++ * Also needs to flush the IOTLB as host EPT is used ++ * as second-stage page table for some devices. 
++ */ ++ pkvm_iommu_flush_iotlb(pgt, vaddr, size); ++} ++ ++struct pkvm_mm_ops host_ept_mm_ops = { ++ .phys_to_virt = pkvm_phys_to_virt, ++ .virt_to_phys = pkvm_virt_to_phys, ++ .zalloc_page = host_ept_zalloc_page, ++ .get_page = host_ept_get_page, ++ .put_page = host_ept_put_page, ++ .page_count = pkvm_page_count, ++ .flush_tlb = host_ept_flush_tlb, ++ .flush_cache = host_ept_flush_cache, ++}; ++ ++static struct pkvm_mm_ops host_ept_mm_ops_no_tlbflush = { ++ .phys_to_virt = pkvm_phys_to_virt, ++ .virt_to_phys = pkvm_virt_to_phys, ++ .zalloc_page = host_ept_zalloc_page, ++ .get_page = host_ept_get_page, ++ .put_page = host_ept_put_page, ++ .page_count = pkvm_page_count, ++ .flush_tlb = flush_tlb_noop, ++ .flush_cache = host_ept_flush_cache, ++}; ++ ++static bool ept_entry_present(void *ptep) ++{ ++ u64 val = *(u64 *)ptep; ++ ++ return !!(val & VMX_EPT_RWX_MASK); ++} ++ ++static bool ept_entry_mapped(void *ptep) ++{ ++ /* ++ * Both present and non-present (shadow)EPT entry is counted as a ++ * mapped entry because a non-present entry with non-zero value may ++ * contain page state and ownership information created through map ++ * operation. So simply count non-zero entry as mapped to cover both ++ * cases. ++ * ++ * Since we initialize every pte with SUPPRESS_VE bit set, which means ++ * if a pte does not equal to the default value, it has been mapped. ++ */ ++ return !(*(u64 *)ptep == EPT_PROT_DEF); ++} ++ ++static bool ept_entry_huge(void *ptep) ++{ ++ return is_large_pte(*(u64 *)ptep); ++} ++ ++static void ept_entry_mkhuge(void *ptep) ++{ ++ *(u64 *)ptep |= PT_PAGE_SIZE_MASK; ++} ++ ++static unsigned long ept_entry_to_phys(void *ptep) ++{ ++ return *(u64 *)ptep & PT64_BASE_ADDR_MASK; ++} ++ ++static u64 ept_entry_to_prot(void *ptep) ++{ ++ u64 prot = *(u64 *)ptep & ~(PT64_BASE_ADDR_MASK); ++ ++ return prot & ~PT_PAGE_SIZE_MASK; ++} ++ ++static int ept_entry_to_index(unsigned long vaddr, int level) ++{ ++ return SHADOW_PT_INDEX(vaddr, level); ++} ++ ++static bool ept_entry_is_leaf(void *ptep, int level) ++{ ++ if (level == PG_LEVEL_4K || ++ !ept_entry_present(ptep) || ++ ept_entry_huge(ptep)) ++ return true; ++ ++ return false; ++ ++} ++ ++static int ept_level_entry_size(int level) ++{ ++ return PAGE_SIZE / PT64_ENT_PER_PAGE; ++} ++ ++static int ept_level_to_entries(int level) ++{ ++ return PT64_ENT_PER_PAGE; ++} ++ ++static u64 ept_level_page_mask(int level) ++{ ++ return (~((1UL << PT64_LEVEL_SHIFT(level)) - 1)); ++} ++ ++static unsigned long ept_level_to_size(int level) ++{ ++ return KVM_HPAGE_SIZE(level); ++} ++ ++static void ept_set_entry(void *sptep, u64 spte) ++{ ++ WRITE_ONCE(*(u64 *)sptep, spte); ++} ++ ++struct pkvm_pgtable_ops ept_ops = { ++ .pgt_entry_present = ept_entry_present, ++ .pgt_entry_mapped = ept_entry_mapped, ++ .pgt_entry_huge = ept_entry_huge, ++ .pgt_entry_mkhuge = ept_entry_mkhuge, ++ .pgt_entry_to_phys = ept_entry_to_phys, ++ .pgt_entry_to_prot = ept_entry_to_prot, ++ .pgt_entry_to_index = ept_entry_to_index, ++ .pgt_level_page_mask = ept_level_page_mask, ++ .pgt_entry_is_leaf = ept_entry_is_leaf, ++ .pgt_level_entry_size = ept_level_entry_size, ++ .pgt_level_to_entries = ept_level_to_entries, ++ .pgt_level_to_size = ept_level_to_size, ++ .pgt_set_entry = ept_set_entry, ++ .default_prot = EPT_PROT_DEF, ++}; ++ ++bool is_pgt_ops_ept(struct pkvm_pgtable *pgt) ++{ ++ return pgt && (pgt->pgt_ops == &ept_ops); ++} ++ ++int pkvm_host_ept_map(unsigned long vaddr_start, unsigned long phys_start, ++ unsigned long size, int pgsz_mask, u64 prot) ++{ ++ return 
pkvm_pgtable_map(&host_ept, vaddr_start, phys_start, size, ++ pgsz_mask, prot, NULL); ++} ++ ++int pkvm_host_ept_unmap(unsigned long vaddr_start, unsigned long phys_start, ++ unsigned long size) ++{ ++ return pkvm_pgtable_unmap_safe(&host_ept, vaddr_start, phys_start, size, NULL); ++} ++ ++void pkvm_host_ept_lookup(unsigned long vaddr, unsigned long *pphys, ++ u64 *pprot, int *plevel) ++{ ++ pkvm_pgtable_lookup(&host_ept, vaddr, pphys, pprot, plevel); ++} ++ ++void pkvm_host_ept_destroy(void) ++{ ++ pkvm_pgtable_destroy(&host_ept, NULL); ++} ++ ++void host_ept_lock(void) ++{ ++ pkvm_spin_lock(&_host_ept_lock); ++} ++ ++void host_ept_unlock(void) ++{ ++ pkvm_spin_unlock(&_host_ept_lock); ++} ++ ++void pkvm_flush_host_ept(void) ++{ ++ u64 eptp = pkvm_construct_eptp(host_ept.root_pa, host_ept.level); ++ ++ flush_ept(eptp); ++} ++ ++static void reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, ++ u64 pa_bits_rsvd, bool execonly, ++ int huge_page_level) ++{ ++ u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); ++ u64 large_1g_rsvd = 0, large_2m_rsvd = 0; ++ u64 bad_mt_xwr; ++ ++ if (huge_page_level < PG_LEVEL_1G) ++ large_1g_rsvd = rsvd_bits(7, 7); ++ if (huge_page_level < PG_LEVEL_2M) ++ large_2m_rsvd = rsvd_bits(7, 7); ++ ++ rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); ++ rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); ++ rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd; ++ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd; ++ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; ++ ++ /* large page */ ++ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; ++ rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; ++ rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd; ++ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd; ++ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; ++ ++ bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ ++ bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ ++ bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ ++ bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ ++ bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ ++ if (!execonly) { ++ /* bits 0..2 must not be 100 unless VMX capabilities allow it */ ++ bad_mt_xwr |= REPEAT_BYTE(1ull << 4); ++ } ++ rsvd_check->bad_mt_xwr = bad_mt_xwr; ++} ++ ++int pkvm_host_ept_init(struct pkvm_pgtable_cap *cap, ++ void *ept_pool_base, unsigned long ept_pool_pages) ++{ ++ unsigned long pfn = __pkvm_pa(ept_pool_base) >> PAGE_SHIFT; ++ int ret; ++ u8 pa_bits; ++ ++ ret = pkvm_pool_init(&host_ept_pool, pfn, ept_pool_pages, 0); ++ if (ret) ++ return ret; ++ ++ pa_bits = get_max_physaddr_bits(); ++ if (!pa_bits) ++ return -EINVAL; ++ reset_rsvds_bits_mask_ept(&ept_zero_check, rsvd_bits(pa_bits, 63), ++ vmx_has_ept_execute_only(), ++ fls(cap->allowed_pgsz) - 1); ++ ++ pkvm_hyp->host_vm.ept = &host_ept; ++ ret = pkvm_pgtable_init(&host_ept, &host_ept_mm_ops, &ept_ops, cap, true); ++ if (ret) ++ return ret; ++ ++ /* ++ * Prepare an instance for host EPT without doing TLB flushing. ++ * This is used for some fastpath code which wants to avoid ++ * doing TLB flushing for each host EPT modifications. It doesn't ++ * mean TLB flushing is not needed. 
The user still needs to do ++ * TLB flushing explicitly after finishing all the host EPT ++ * modifications. ++ */ ++ host_ept_notlbflush = host_ept; ++ host_ept_notlbflush.mm_ops = &host_ept_mm_ops_no_tlbflush; ++ pkvm_hyp->host_vm.ept_notlbflush = &host_ept_notlbflush; ++ ++ return 0; ++} ++ ++int handle_host_ept_violation(struct kvm_vcpu *vcpu, bool *skip_instruction) ++{ ++ unsigned long hpa, gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); ++ struct mem_range range, cur; ++ bool is_memory = find_mem_range(gpa, &range); ++ u64 prot = pkvm_mkstate(HOST_EPT_DEF_MMIO_PROT, PKVM_PAGE_OWNED); ++ int level; ++ int ret; ++ *skip_instruction = true; ++ ++ if (is_memory) { ++ pkvm_err("%s: not handle for memory address 0x%lx\n", __func__, gpa); ++ return -EPERM; ++ } ++ ++ ret = try_emul_host_mmio(vcpu, gpa); ++ if (ret != -EINVAL) { ++ return ret; ++ } ++ ++ pkvm_spin_lock(&_host_ept_lock); ++ ++ pkvm_pgtable_lookup(&host_ept, gpa, &hpa, NULL, &level); ++ if (hpa != INVALID_ADDR) { ++ ret = -EAGAIN; ++ goto out; ++ } ++ ++ do { ++ unsigned long size = ept_level_to_size(level); ++ ++ cur.start = ALIGN_DOWN(gpa, size); ++ cur.end = cur.start + size - 1; ++ /* ++ * TODO: ++ * check if this MMIO belongs to a secure VM pass-through device. ++ */ ++ if ((1 << level & host_ept.allowed_pgsz) && ++ mem_range_included(&cur, &range) && ++ !is_mem_range_overlap_iommu(cur.start, cur.end)) ++ break; ++ level--; ++ } while (level != PG_LEVEL_NONE); ++ ++ if (level == PG_LEVEL_NONE) { ++ pkvm_err("pkvm: No valid range: gpa 0x%lx, cur 0x%lx ~ 0x%lx size 0x%lx level %d\n", ++ gpa, cur.start, cur.end, cur.end - cur.start + 1, level); ++ ret = -EPERM; ++ goto out; ++ } ++ ++ pkvm_dbg("pkvm: %s: cur MMIO range 0x%lx ~ 0x%lx size 0x%lx level %d\n", ++ __func__, cur.start, cur.end, cur.end - cur.start + 1, level); ++ ++ ret = pkvm_host_ept_map(cur.start, cur.start, cur.end - cur.start + 1, ++ 1 << level, prot); ++ if (ret == -ENOMEM) { ++ /* TODO: reclaim MMIO range pages first and try do map again */ ++ pkvm_dbg("%s: no memory to set host ept for addr 0x%lx\n", ++ __func__, gpa); ++ } ++out: ++ pkvm_spin_unlock(&_host_ept_lock); ++ ++ if (ret == 0) ++ *skip_instruction = false; ++ return ret; ++} ++ ++int pkvm_shadow_ept_pool_init(void *ept_pool_base, unsigned long ept_pool_pages) ++{ ++ unsigned long pfn = __pkvm_pa(ept_pool_base) >> PAGE_SHIFT; ++ ++ return pkvm_pool_init(&shadow_pgt_pool, pfn, ept_pool_pages, 0); ++} ++ ++static void *shadow_pgt_zalloc_page(void) ++{ ++ return ept_zalloc_page(&shadow_pgt_pool); ++} ++ ++static void shadow_pgt_get_page(void *vaddr) ++{ ++ pkvm_get_page(&shadow_pgt_pool, vaddr); ++} ++ ++static void shadow_pgt_put_page(void *vaddr) ++{ ++ pkvm_put_page(&shadow_pgt_pool, vaddr); ++} ++ ++static void shadow_ept_flush_tlb(struct pkvm_pgtable *pgt, ++ unsigned long addr, ++ unsigned long size) ++{ ++ struct pkvm_shadow_vm *shadow_vm = sept_to_shadow_vm(pgt); ++ struct shadow_vcpu_state *shadow_vcpu; ++ struct kvm_vcpu *vcpu; ++ s64 shadow_vcpu_handle; ++ int i, shadow_vm_handle = shadow_vm->shadow_vm_handle; ++ ++ for (i = 0; i < shadow_vm->created_vcpus; i++) { ++ shadow_vcpu_handle = to_shadow_vcpu_handle(shadow_vm_handle, i); ++ shadow_vcpu = get_shadow_vcpu(shadow_vcpu_handle); ++ /* ++ * For a shadow_vcpu which is already teardown, no need to kick ++ * it as its shadow EPT tlb entries are already flushed when ++ * this shadow vcpu is doing vmclear before teardown. 
++ */
++		if (!shadow_vcpu)
++			continue;
++
++		/*
++		 * If this shadow_vcpu is not loaded then there is no vcpu
++		 * pointer for it, so we can skip this remote tlb flushing.
++		 */
++		vcpu = READ_ONCE(shadow_vcpu->vcpu);
++		if (!vcpu)
++			goto next;
++
++		kvm_make_request(PKVM_REQ_TLB_FLUSH_SHADOW_EPT, vcpu);
++		pkvm_kick_vcpu(vcpu);
++next:
++		put_shadow_vcpu(shadow_vcpu_handle);
++	}
++}
++
++static struct pkvm_mm_ops shadow_ept_mm_ops = {
++	.phys_to_virt = pkvm_phys_to_virt,
++	.virt_to_phys = pkvm_virt_to_phys,
++	.zalloc_page = shadow_pgt_zalloc_page,
++	.get_page = shadow_pgt_get_page,
++	.put_page = shadow_pgt_put_page,
++	.page_count = pkvm_page_count,
++	.flush_tlb = shadow_ept_flush_tlb,
++};
++
++/*
++ * mm_ops for shadow second-level IOMMU page tables. These tables
++ * are similar to shadow EPT tables, as they also have the EPT
++ * format and their memory is reserved together with shadow EPT
++ * pages. The difference is that this mm_ops doesn't have the
++ * flush_tlb callback.
++ *
++ * Precisely, shadow_sl_iommu_pgt_mm_ops is used for two kinds of
++ * 2nd level iommu page tables:
++ *
++ * - pgstate_pgt which is reused as the IOMMU page table for a protected
++ *   VM with passthrough devices. In this case the memory is pinned,
++ *   and the mapping is not allowed to be removed from pgstate_pgt,
++ *   so the flush_tlb callback is not needed.
++ *
++ * - Host shadow IOMMU page tables used for the host's devices when
++ *   legacy IOMMU is used. They do not need the flush_tlb callback
++ *   either, since IOTLB flush after unmapping pages from these
++ *   tables is performed in other ways: either as a part of vIOMMU
++ *   IOTLB flush emulation when initiated by the host, or together
++ *   with host EPT TLB flush when ensuring pKVM memory protection.
++ *
++ * TODO: refactor the code: move all the management of both types
++ * of 2nd level iommu page tables to iommu_spgt.c behind some common
++ * API. That also means refactoring the pkvm_ptdev structure.
++ */
++static struct pkvm_mm_ops shadow_sl_iommu_pgt_mm_ops = {
++	.phys_to_virt = pkvm_phys_to_virt,
++	.virt_to_phys = pkvm_virt_to_phys,
++	.zalloc_page = shadow_pgt_zalloc_page,
++	.get_page = shadow_pgt_get_page,
++	.put_page = shadow_pgt_put_page,
++	.page_count = pkvm_page_count,
++	.flush_tlb = flush_tlb_noop,
++};
++
++/*
++ * Flushing the cache is needed when modifying IOMMU page table entries
++ * if the IOMMU is not coherent. This ops has a flush_cache callback
++ * so it can be used for a pgtable which serves as an IOMMU page table
++ * with a noncoherent IOMMU.
++ */
++static struct pkvm_mm_ops shadow_sl_iommu_pgt_mm_ops_noncoherency = {
++	.phys_to_virt = pkvm_phys_to_virt,
++	.virt_to_phys = pkvm_virt_to_phys,
++	.zalloc_page = shadow_pgt_zalloc_page,
++	.get_page = shadow_pgt_get_page,
++	.put_page = shadow_pgt_put_page,
++	.page_count = pkvm_page_count,
++	.flush_tlb = flush_tlb_noop,
++	.flush_cache = pkvm_clflush_cache_range,
++};
++
++static int pkvm_pgstate_pgt_map_leaf(struct pkvm_pgtable *pgt, unsigned long vaddr, int level,
++				     void *ptep, struct pgt_flush_data *flush_data, void *arg)
++{
++	struct pkvm_pgtable_map_data *data = arg;
++	struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops;
++	unsigned long level_size = pgt_ops->pgt_level_to_size(level);
++	unsigned long map_phys = data->phys & PAGE_MASK;
++	struct pkvm_shadow_vm *vm = pgstate_pgt_to_shadow_vm(pgt);
++	int ret;
++
++	/*
++	 * It is possible that another CPU just created the same mapping when
++	 * multiple EPT violations happen on different CPUs.
++ */ ++ if (pgt_ops->pgt_entry_present(ptep)) { ++ unsigned long phys = pgt_ops->pgt_entry_to_phys(ptep); ++ ++ /* ++ * Check if the existing mapping is the same as the wanted one. ++ * If not the same, report an error so that the map_leaf caller ++ * will not map the different addresses in its shadow EPT. ++ */ ++ if (phys != map_phys) { ++ pkvm_err("%s: gpa 0x%lx @level%d old_phys 0x%lx != new_phys 0x%lx\n", ++ __func__, vaddr, level, phys, map_phys); ++ return -EPERM; ++ } ++ ++ /* ++ * The pgstate_pgt now is EPT format with fixed property bits. No ++ * need to check and update property bits for pgstate_pgt. ++ */ ++ goto out; ++ } ++ ++ switch (vm->vm_type) { ++ case KVM_X86_DEFAULT_VM: ++ ret = __pkvm_host_share_guest(map_phys, pgt, vaddr, level_size, data->prot); ++ break; ++ case KVM_X86_PROTECTED_VM: ++ if (vm->need_prepopulation) ++ /* ++ * As pgstate pgt is the source of the shadow EPT, only after pgstate ++ * pgt is set up, shadow EPT can be set up. So protected VM will not be ++ * able to use the memory donated in pgstate pgt before its shadow EPT ++ * is setting up. So it is safe to use the fastpath to donate all the ++ * pages to improve the pre-population performance. TLB flushing ++ * can be done in the caller after the pre-population is done but before ++ * setting up its shadow EPT. ++ */ ++ ret = __pkvm_host_donate_guest_fastpath(map_phys, pgt, vaddr, ++ level_size, data->prot); ++ else ++ ret = __pkvm_host_donate_guest(map_phys, pgt, vaddr, ++ level_size, data->prot); ++ break; ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (ret) { ++ pkvm_err("%s failed: ret %d vm_type %ld L2 GPA 0x%lx level %d HPA 0x%lx prot 0x%llx\n", ++ __func__, ret, vm->vm_type, vaddr, level, map_phys, data->prot); ++ return ret; ++ } ++ ++out: ++ /* Increase the physical address for the next mapping */ ++ data->phys += level_size; ++ ++ return 0; ++} ++ ++static int pkvm_pgstate_pgt_free_leaf(struct pkvm_pgtable *pgt, unsigned long vaddr, int level, ++ void *ptep, struct pgt_flush_data *flush_data, void *arg) ++{ ++ unsigned long phys = pgt->pgt_ops->pgt_entry_to_phys(ptep); ++ unsigned long size = pgt->pgt_ops->pgt_level_to_size(level); ++ struct pkvm_shadow_vm *vm = pgstate_pgt_to_shadow_vm(pgt); ++ int ret; ++ ++ if (!pgt->pgt_ops->pgt_entry_present(ptep)) ++ return 0; ++ ++ /* ++ * For normal VM, call __pkvm_host_unshare_guest() to unshare all previous ++ * shared pages. A page table entry with present bits indicates the page ++ * was shared before. ++ * ++ * For protected VM, call __pkvm_host_undonate_guest() to undonate all ++ * previous donated pages, the donated pages are indicated by their page ++ * table entries which state is present. ++ * ++ * Since the pgtable_free_cb in this current page walker is still ++ * walking the page state table, the __pkvm_host_unshare_guest() or ++ * __pkvm_host_undonate_guest() are not allowed to release page state ++ * table pages. So get_page() should be called before these APIs, then ++ * put_page() to allow pgtable_free_cb free table pages with correct ++ * refcount. ++ */ ++ switch(vm->vm_type) { ++ case KVM_X86_DEFAULT_VM: ++ pgt->mm_ops->get_page(ptep); ++ ret = __pkvm_host_unshare_guest(phys, pgt, vaddr, size); ++ pgt->mm_ops->put_page(ptep); ++ flush_data->flushtlb |= true; ++ break; ++ case KVM_X86_PROTECTED_VM: { ++ struct mem_range range; ++ /* ++ * before returning to host, the memory page previously owned by ++ * protected VM shall be memset to 0 to avoid secret leakage. 
++ */ ++ if (find_mem_range(phys, &range)) ++ memset(pgt->mm_ops->phys_to_virt(phys), 0, min(size, range.end - phys)); ++ pgt->mm_ops->get_page(ptep); ++ ret = __pkvm_host_undonate_guest(phys, pgt, vaddr, size); ++ pgt->mm_ops->put_page(ptep); ++ flush_data->flushtlb |= true; ++ break; ++ } ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (ret) ++ pkvm_err("%s failed: ret %d vm_type %ld phys 0x%lx GPA 0x%lx size 0x%lx\n", ++ __func__, ret, vm->vm_type, phys, vaddr, size); ++ return ret; ++} ++ ++static void __invalidate_shadow_ept_with_range(struct shadow_ept_desc *desc, ++ unsigned long vaddr, unsigned long size) ++{ ++ struct pkvm_shadow_vm *vm = sept_desc_to_shadow_vm(desc); ++ struct pkvm_pgtable *sept = &desc->sept; ++ ++ if (!size) ++ return; ++ ++ pkvm_spin_lock(&vm->lock); ++ ++ if (!is_valid_eptp(desc->shadow_eptp)) ++ goto out; ++ ++ pkvm_pgtable_unmap_nosplit(sept, vaddr, size, NULL); ++ ++ /* ++ * As for normal VM, its memory might need to be swapped out ++ * or other kinds of management from primary VM thus should ++ * unmap from pgstate pgt as well. ++ * ++ * As for protected VM, its memory is pinned thus no need to ++ * unmap from pgstate pgt. ++ */ ++ if (vm->vm_type == KVM_X86_DEFAULT_VM) ++ pkvm_pgtable_unmap_nosplit(&vm->pgstate_pgt, vaddr, size, ++ pkvm_pgstate_pgt_free_leaf); ++out: ++ pkvm_spin_unlock(&vm->lock); ++} ++ ++void pkvm_invalidate_shadow_ept(struct shadow_ept_desc *desc) ++{ ++ struct pkvm_pgtable *sept = &desc->sept; ++ unsigned long size = sept->pgt_ops->pgt_level_to_size(sept->level + 1); ++ ++ __invalidate_shadow_ept_with_range(desc, 0, size); ++} ++ ++void pkvm_invalidate_shadow_ept_with_range(struct shadow_ept_desc *desc, ++ unsigned long vaddr, unsigned long size) ++{ ++ __invalidate_shadow_ept_with_range(desc, vaddr, size); ++} ++ ++void pkvm_shadow_ept_deinit(struct shadow_ept_desc *desc) ++{ ++ struct pkvm_shadow_vm *vm = sept_desc_to_shadow_vm(desc); ++ ++ pkvm_spin_lock(&vm->lock); ++ ++ if (desc->shadow_eptp) ++ pkvm_pgtable_destroy(&desc->sept, NULL); ++ ++ memset(desc, 0, sizeof(struct shadow_ept_desc)); ++ ++ pkvm_spin_unlock(&vm->lock); ++} ++ ++int pkvm_shadow_ept_init(struct shadow_ept_desc *desc) ++{ ++ struct pkvm_pgtable_cap cap = { ++ .level = 4, ++ .allowed_pgsz = 1 << PG_LEVEL_4K, ++ .table_prot = VMX_EPT_RWX_MASK, ++ }; ++ int ret; ++ ++ if (vmx_ept_has_2m_page()) ++ cap.allowed_pgsz |= 1 << PG_LEVEL_2M; ++ if (vmx_ept_has_1g_page()) ++ cap.allowed_pgsz |= 1 << PG_LEVEL_1G; ++ ++ memset(desc, 0, sizeof(struct shadow_ept_desc)); ++ ++ ret = pkvm_pgtable_init(&desc->sept, &shadow_ept_mm_ops, &ept_ops, &cap, true); ++ if (ret) ++ return ret; ++ ++ desc->shadow_eptp = pkvm_construct_eptp(desc->sept.root_pa, cap.level); ++ flush_ept(desc->shadow_eptp); ++ ++ return 0; ++} ++ ++void pkvm_pgstate_pgt_deinit(struct pkvm_shadow_vm *vm) ++{ ++ pkvm_spin_lock(&vm->lock); ++ ++ pkvm_pgtable_destroy(&vm->pgstate_pgt, pkvm_pgstate_pgt_free_leaf); ++ ++ pkvm_spin_unlock(&vm->lock); ++} ++ ++int pkvm_pgstate_pgt_init(struct pkvm_shadow_vm *vm) ++{ ++ struct pkvm_pgtable *pgt = &vm->pgstate_pgt; ++ struct pkvm_pgtable_cap cap = { ++ .level = pkvm_hyp->ept_iommu_pgt_level, ++ .allowed_pgsz = pkvm_hyp->ept_iommu_pgsz_mask, ++ .table_prot = VMX_EPT_RWX_MASK, ++ }; ++ ++ return pkvm_pgtable_init(pgt, &shadow_sl_iommu_pgt_mm_ops, &ept_ops, &cap, true); ++} ++ ++struct pkvm_mm_ops *pkvm_shadow_sl_iommu_pgt_get_mm_ops(bool coherent) ++{ ++ return coherent ? 
&shadow_sl_iommu_pgt_mm_ops ++ : &shadow_sl_iommu_pgt_mm_ops_noncoherency; ++} ++ ++void pkvm_shadow_sl_iommu_pgt_update_coherency(struct pkvm_pgtable *pgt, bool coherent) ++{ ++ if (coherent) ++ pkvm_pgtable_set_mm_ops(pgt, &shadow_sl_iommu_pgt_mm_ops); ++ else ++ pkvm_pgtable_set_mm_ops(pgt, &shadow_sl_iommu_pgt_mm_ops_noncoherency); ++} ++ ++/* ++ * virtual_ept_mm_ops is used as the ops for the ept constructed by ++ * KVM high in host. ++ * The physical address in this ept is the host VM GPA, which is ++ * the same with HPA. ++ */ ++struct pkvm_mm_ops virtual_ept_mm_ops = { ++ .phys_to_virt = host_gpa2hva, ++}; ++ ++void pkvm_guest_ept_deinit(struct shadow_vcpu_state *shadow_vcpu) ++{ ++ struct pkvm_pgtable *vept = &shadow_vcpu->vept; ++ ++ memset(vept, 0, sizeof(struct pkvm_pgtable)); ++} ++ ++void pkvm_guest_ept_init(struct shadow_vcpu_state *shadow_vcpu, u64 guest_eptp) ++{ ++ struct pkvm_pgtable_cap cap = { ++ .level = 4, ++ .allowed_pgsz = 1 << PG_LEVEL_4K, ++ .table_prot = VMX_EPT_RWX_MASK, ++ }; ++ ++ /* ++ * TODO: we just assume guest will use page level the HW supported, ++ * it actually need align with KVM high ++ */ ++ if ((guest_eptp & VMX_EPTP_PWL_MASK) == VMX_EPTP_PWL_5) ++ cap.level = 5; ++ if (vmx_ept_has_2m_page()) ++ cap.allowed_pgsz |= 1 << PG_LEVEL_2M; ++ if (vmx_ept_has_1g_page()) ++ cap.allowed_pgsz |= 1 << PG_LEVEL_1G; ++ ++ pkvm_pgtable_init(&shadow_vcpu->vept, &virtual_ept_mm_ops, &ept_ops, &cap, false); ++ shadow_vcpu->vept.root_pa = host_gpa2hpa(guest_eptp & PT64_BASE_ADDR_MASK); ++} ++ ++static bool is_access_violation(u64 ept_entry, u64 exit_qual) ++{ ++ bool access_violation = false; ++ ++ if (/* Caused by data read */ ++ (((exit_qual & 0x1UL) != 0UL) && ((ept_entry & VMX_EPT_READABLE_MASK) == 0)) || ++ /* Caused by data write */ ++ (((exit_qual & 0x2UL) != 0UL) && ((ept_entry & VMX_EPT_WRITABLE_MASK) == 0)) || ++ /* Caused by instruction fetch */ ++ (((exit_qual & 0x4UL) != 0UL) && ((ept_entry & VMX_EPT_EXECUTABLE_MASK) == 0))) { ++ access_violation = true; ++ } ++ ++ return access_violation; ++} ++ ++static int populate_pgstate_pgt(struct pkvm_pgtable *pgt) ++{ ++ struct pkvm_shadow_vm *vm = pgstate_pgt_to_shadow_vm(pgt); ++ struct list_head *ptdev_head = &vm->ptdev_head; ++ struct pkvm_ptdev *ptdev, *tmp; ++ u64 *prot_override; ++ bool populated; ++ u64 prot; ++ int ret; ++ ++ list_for_each_entry(ptdev, ptdev_head, vm_node) { ++ /* No need to populate if vpgt.root_pa doesn't exist */ ++ if (!ptdev->vpgt.root_pa) ++ continue; ++ ++ populated = false; ++ list_for_each_entry(tmp, ptdev_head, vm_node) { ++ if (tmp == ptdev) ++ break; ++ if (tmp->vpgt.root_pa == ptdev->vpgt.root_pa) { ++ populated = true; ++ break; ++ } ++ } ++ ++ if (populated) ++ continue; ++ ++ if (ptdev->vpgt.pgt_ops != pgt->pgt_ops) { ++ /* Populate with EPT format */ ++ if (is_pgt_ops_ept(pgt)) { ++ prot = VMX_EPT_RWX_MASK; ++ } else { ++ pkvm_err("pkvm: not supported populating\n"); ++ return -EOPNOTSUPP; ++ } ++ prot_override = &prot; ++ } else { ++ prot_override = NULL; ++ } ++ ++ ret = pkvm_pgtable_sync_map(&ptdev->vpgt, pgt, prot_override, ++ pkvm_pgstate_pgt_map_leaf); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static bool allow_shadow_ept_mapping(struct pkvm_shadow_vm *vm, ++ u64 gpa, unsigned long hpa, ++ unsigned long size) ++{ ++ struct pkvm_pgtable *pgstate_pgt = &vm->pgstate_pgt; ++ unsigned long mapped_hpa; ++ int level; ++ ++ /* ++ * VM will be marked as need_prepopulation when a passthrough device is ++ * attached. 
With this flag being set, VM's pgstate_pgt will be pre-populated ++ * before handling EPT violation. After the population is done, this flag ++ * can be cleared. ++ */ ++ if (vm->need_prepopulation) { ++ unsigned long size; ++ ++ if (populate_pgstate_pgt(pgstate_pgt)) ++ return false; ++ /* ++ * Explicitly flush TLB of the host EPT after populating the page ++ * state pgt. ++ * ++ * During the population, some pages are donated from primary VM to ++ * this VM with the fastpath interface to avoid doing TLB flushing ++ * during each iteration of the page donation so that to have a fast ++ * population performance. So still need to do TLB flushing in the ++ * end after finishing all the donations. ++ */ ++ size = host_ept.pgt_ops->pgt_level_to_size(host_ept.level + 1); ++ host_ept_flush_tlb(&host_ept, 0, size); ++ vm->need_prepopulation = false; ++ } ++ ++ /* ++ * Lookup the page state pgt to check if the mapping is already created ++ * or not. ++ */ ++ pkvm_pgtable_lookup(pgstate_pgt, gpa, &mapped_hpa, NULL, &level); ++ ++ if ((pgstate_pgt->pgt_ops->pgt_level_to_size(level) < size) || ++ mapped_hpa == INVALID_ADDR) { ++ u64 prot; ++ /* ++ * Page state pgt doesn't have mapping yet, or it has mapping ++ * but with a smaller size, so try to map with the desired size ++ * in page state pgt first. Although page state pgt may already ++ * have all the desired mappings with smaller size, map_leaf ++ * can help to check if the mapped phys matches with the desired ++ * hpa to guarantee shadow EPT maps GPA to the right HPA. ++ */ ++ if (is_pgt_ops_ept(pgstate_pgt)) { ++ prot = VMX_EPT_RWX_MASK; ++ } else { ++ pkvm_err("%s: pgstate_pgt format not supported\n", __func__); ++ return false; ++ } ++ ++ if (pkvm_pgtable_map(pgstate_pgt, gpa, hpa, size, ++ 0, prot, pkvm_pgstate_pgt_map_leaf)) { ++ pkvm_err("%s: pgstate_pgt map gpa 0x%llx hpa 0x%lx size 0x%lx failed\n", ++ __func__, gpa, hpa, size); ++ return false; ++ } ++ } else if (mapped_hpa != hpa) { ++ /* ++ * Page state pgt has mapping already, so check if the mapped ++ * phys matches with the hpa, and report an error if doesn't ++ * match. 
++ */
++		pkvm_err("pgstate_pgt mismatch: mapped_hpa 0x%lx != 0x%lx for gpa 0x%llx\n",
++			 mapped_hpa, hpa, gpa);
++		return false;
++	}
++
++	return true;
++}
++
++enum sept_handle_ret
++pkvm_handle_shadow_ept_violation(struct shadow_vcpu_state *shadow_vcpu, u64 l2_gpa, u64 exit_quali)
++{
++	struct pkvm_shadow_vm *vm = shadow_vcpu->vm;
++	struct shadow_ept_desc *desc = &vm->sept_desc;
++	struct pkvm_pgtable *sept = &desc->sept;
++	struct pkvm_pgtable_ops *pgt_ops = sept->pgt_ops;
++	struct pkvm_pgtable *vept = &shadow_vcpu->vept;
++	enum sept_handle_ret ret = PKVM_NOT_HANDLED;
++	unsigned long phys;
++	int level;
++	u64 gprot, rsvd_chk_gprot;
++
++	pkvm_spin_lock(&vm->lock);
++
++	pkvm_pgtable_lookup(vept, l2_gpa, &phys, &gprot, &level);
++	if (phys == INVALID_ADDR)
++		/* Guest EPT not valid, back to kvm-high */
++		goto out;
++
++	if (is_access_violation(gprot, exit_quali))
++		/* Guest EPT error, refuse to handle in shadow ept */
++		goto out;
++
++	rsvd_chk_gprot = gprot;
++	/* is_rsvd_spte() needs the PAGE_SIZE bit set for huge mappings */
++	if (level != PG_LEVEL_4K)
++		pgt_ops->pgt_entry_mkhuge(&rsvd_chk_gprot);
++
++	if (is_rsvd_spte(&ept_zero_check, rsvd_chk_gprot, level)) {
++		ret = PKVM_INJECT_EPT_MISC;
++	} else {
++		unsigned long level_size = pgt_ops->pgt_level_to_size(level);
++		unsigned long gpa = ALIGN_DOWN(l2_gpa, level_size);
++		unsigned long hpa = ALIGN_DOWN(host_gpa2hpa(phys), level_size);
++		/*
++		 * Still set the SUPPRESS_VE bit here as some mappings may still
++		 * cause an EPT_VIOLATION and we want these EPT_VIOLATIONs to
++		 * cause a vmexit.
++		 */
++		u64 prot = (gprot & EPT_PROT_MASK) | EPT_PROT_DEF;
++
++		if (allow_shadow_ept_mapping(vm, gpa, hpa, level_size) &&
++		    !pkvm_pgtable_map(sept, gpa, hpa, level_size, 0, prot, NULL))
++			ret = PKVM_HANDLED;
++	}
++out:
++	pkvm_spin_unlock(&vm->lock);
++	return ret;
++}
++
++void pkvm_flush_shadow_ept(struct shadow_ept_desc *desc)
++{
++	if (!is_valid_eptp(desc->shadow_eptp))
++		return;
++
++	flush_ept(desc->shadow_eptp);
++}
++
++void pkvm_shadow_clear_suppress_ve(struct kvm_vcpu *vcpu, unsigned long gfn)
++{
++	unsigned long gpa = gfn * PAGE_SIZE;
++	struct pkvm_host_vcpu *pkvm_hvcpu = to_pkvm_hvcpu(vcpu);
++	struct shadow_vcpu_state *shadow_vcpu = pkvm_hvcpu->current_shadow_vcpu;
++	struct pkvm_shadow_vm *vm = shadow_vcpu->vm;
++	struct shadow_ept_desc *desc = &vm->sept_desc;
++	struct pkvm_pgtable *sept = &desc->sept;
++
++	if (!shadow_vcpu_is_protected(shadow_vcpu))
++		return;
++
++	/*
++	 * Set the mmio_pte with prot 0, which means it is invalid and with the
++	 * "Suppress #VE" bit cleared. Accessing this pte will trigger #VE.
++ */ ++ pkvm_pgtable_annotate(sept, gpa, PAGE_SIZE, SHADOW_EPT_MMIO_ENTRY); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/ept.h b/arch/x86/kvm/vmx/pkvm/hyp/ept.h +new file mode 100644 +index 000000000000..a0b5e701fa31 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/ept.h +@@ -0,0 +1,70 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef __PKVM_EPT_H ++#define __PKVM_EPT_H ++ ++#include "pkvm_hyp.h" ++ ++#define HOST_EPT_DEF_MEM_PROT (VMX_EPT_RWX_MASK | \ ++ (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)) ++#define HOST_EPT_DEF_MMIO_PROT (VMX_EPT_RWX_MASK | \ ++ (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT)) ++#define EPT_PROT_MASK (VMX_EPT_RWX_MASK | VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT) ++#define EPT_PROT_DEF SUPPRESS_VE ++ ++#define SHADOW_EPT_MMIO_ENTRY 0 ++ ++enum sept_handle_ret { ++ PKVM_NOT_HANDLED, ++ PKVM_HANDLED, ++ PKVM_INJECT_EPT_MISC, ++}; ++ ++void host_ept_lock(void); ++void host_ept_unlock(void); ++int pkvm_host_ept_map(unsigned long vaddr_start, unsigned long phys_start, ++ unsigned long size, int pgsz_mask, u64 prot); ++int pkvm_host_ept_unmap(unsigned long vaddr_start, unsigned long phys_start, ++ unsigned long size); ++void pkvm_host_ept_lookup(unsigned long vaddr, unsigned long *pphys, ++ u64 *pprot, int *plevel); ++void pkvm_host_ept_destroy(void); ++int pkvm_host_ept_init(struct pkvm_pgtable_cap *cap, void *ept_pool_base, ++ unsigned long ept_pool_pages); ++int handle_host_ept_violation(struct kvm_vcpu *vcpu, bool *skip); ++void pkvm_flush_host_ept(void); ++int pkvm_shadow_ept_pool_init(void *ept_pool_base, unsigned long ept_pool_pages); ++int pkvm_shadow_ept_init(struct shadow_ept_desc *desc); ++void pkvm_shadow_ept_deinit(struct shadow_ept_desc *desc); ++void pkvm_guest_ept_init(struct shadow_vcpu_state *shadow_vcpu, u64 guest_eptp); ++void pkvm_guest_ept_deinit(struct shadow_vcpu_state *shadow_vcpu); ++enum sept_handle_ret ++pkvm_handle_shadow_ept_violation(struct shadow_vcpu_state *shadow_vcpu, u64 l2_gpa, u64 exit_quali); ++void pkvm_invalidate_shadow_ept(struct shadow_ept_desc *desc); ++void pkvm_invalidate_shadow_ept_with_range(struct shadow_ept_desc *desc, ++ unsigned long vaddr, unsigned long size); ++void pkvm_flush_shadow_ept(struct shadow_ept_desc *desc); ++void pkvm_shadow_clear_suppress_ve(struct kvm_vcpu *vcpu, unsigned long gfn); ++ ++int pkvm_pgstate_pgt_init(struct pkvm_shadow_vm *vm); ++void pkvm_pgstate_pgt_deinit(struct pkvm_shadow_vm *vm); ++ ++struct pkvm_mm_ops *pkvm_shadow_sl_iommu_pgt_get_mm_ops(bool coherent); ++void pkvm_shadow_sl_iommu_pgt_update_coherency(struct pkvm_pgtable *pgt, bool coherent); ++ ++bool is_pgt_ops_ept(struct pkvm_pgtable *pgt); ++ ++static inline bool is_valid_eptp(u64 eptp) ++{ ++ if (!eptp || (eptp == INVALID_GPA)) ++ return false; ++ ++ /* TODO: other bits check */ ++ return true; ++} ++ ++extern struct pkvm_pgtable_ops ept_ops; ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/idt.S b/arch/x86/kvm/vmx/pkvm/hyp/idt.S +new file mode 100644 +index 000000000000..87252724a501 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/idt.S +@@ -0,0 +1,67 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#include ++#include ++#include ++ ++.macro save_frame ++ push %r15 ++ push %r14 ++ push %r13 ++ push %r12 ++ push %r11 ++ push %r10 ++ push %r9 ++ push %r8 ++ push %_ASM_DI ++ push %_ASM_SI ++ push %_ASM_BP ++ push %_ASM_SP ++ push %_ASM_DX ++ push %_ASM_CX ++ push %_ASM_BX ++ push %_ASM_AX ++.endm ++ ++.macro restore_frame ++ pop 
%_ASM_AX ++ pop %_ASM_BX ++ pop %_ASM_CX ++ pop %_ASM_DX ++ pop %_ASM_SP ++ pop %_ASM_BP ++ pop %_ASM_SI ++ pop %_ASM_DI ++ pop %r8 ++ pop %r9 ++ pop %r10 ++ pop %r11 ++ pop %r12 ++ pop %r13 ++ pop %r14 ++ pop %r15 ++.endm ++ ++SYM_CODE_START(noop_handler) ++ UNWIND_HINT_EMPTY ++ save_frame ++ ++ call handle_noop ++ ++ restore_frame ++ ++ iretq ++SYM_CODE_END(noop_handler) ++ ++SYM_CODE_START(nmi_handler) ++ UNWIND_HINT_EMPTY ++ save_frame ++ ++ call handle_nmi ++ ++ restore_frame ++ ++ iretq ++SYM_CODE_END(nmi_handler) +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c b/arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c +new file mode 100644 +index 000000000000..035dc092917e +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c +@@ -0,0 +1,371 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include "pkvm_hyp.h" ++#include "early_alloc.h" ++#include "memory.h" ++#include "pgtable.h" ++#include "mmu.h" ++#include "ept.h" ++#include "vmx.h" ++#include "nested.h" ++#include "debug.h" ++#include "iommu.h" ++#include "iommu_internal.h" ++#include "mem_protect.h" ++#include "lapic.h" ++#include "pci.h" ++ ++void *pkvm_vmemmap_base; ++void *pkvm_mmu_pgt_base; ++void *host_ept_pgt_base; ++static void *iommu_mem_base; ++static void *shadow_ept_base; ++ ++static int divide_memory_pool(phys_addr_t phys, unsigned long size) ++{ ++ int data_struct_size = pkvm_data_struct_pages( ++ PKVM_PAGES + PKVM_EXTRA_PAGES, ++ PKVM_PCPU_PAGES + PKVM_HOST_VCPU_PAGES ++ + PKVM_HOST_VCPU_VMCS_PAGES, pkvm_hyp->num_cpus) << PAGE_SHIFT; ++ void *virt = __pkvm_va(phys + data_struct_size); ++ unsigned long nr_pages; ++ ++ pkvm_early_alloc_init(virt, size - data_struct_size); ++ ++ nr_pages = pkvm_vmemmap_pages(sizeof(struct pkvm_page)); ++ pkvm_vmemmap_base = pkvm_early_alloc_contig(nr_pages); ++ if (!pkvm_vmemmap_base) ++ return -ENOMEM; ++ ++ nr_pages = pkvm_mmu_pgtable_pages(); ++ pkvm_mmu_pgt_base = pkvm_early_alloc_contig(nr_pages); ++ if (!pkvm_mmu_pgt_base) ++ return -ENOMEM; ++ ++ nr_pages = host_ept_pgtable_pages(); ++ host_ept_pgt_base = pkvm_early_alloc_contig(nr_pages); ++ if (!host_ept_pgt_base) ++ return -ENOMEM; ++ ++ nr_pages = pkvm_iommu_pages(PKVM_MAX_PASID, PKVM_MAX_PASID_PDEV_NUM, ++ PKVM_MAX_PDEV_NUM, PKVM_MAX_IOMMU_NUM, ++ PKVM_QI_DESC_ALIGNED_SIZE, ++ PKVM_QI_DESC_STATUS_ALIGNED_SIZE, ++ pkvm_hyp->num_cpus); ++ iommu_mem_base = pkvm_early_alloc_contig(nr_pages); ++ if (!iommu_mem_base) ++ return -ENOMEM; ++ ++ nr_pages = pkvm_shadow_ept_pgtable_pages(PKVM_MAX_NORMAL_VM_NUM + ++ PKVM_MAX_SECURE_VM_NUM) + ++ pkvm_host_shadow_iommu_pgtable_pages(PKVM_MAX_PDEV_NUM); ++ shadow_ept_base = pkvm_early_alloc_contig(nr_pages); ++ if (!shadow_ept_base) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static int pkvm_back_vmemmap(phys_addr_t back_pa) ++{ ++ unsigned long i, start, start_va, size, end, end_va = 0; ++ struct memblock_region *reg; ++ int ret; ++ ++ /* vmemmap region map to virtual address 0 */ ++ __pkvm_vmemmap = 0; ++ ++ for (i = 0; i < pkvm_memblock_nr; i++) { ++ reg = &pkvm_memory[i]; ++ start = reg->base; ++ /* Translate a range of memory to vmemmap range */ ++ start_va = ALIGN_DOWN((unsigned long)pkvm_phys_to_page(start), ++ PAGE_SIZE); ++ /* ++ * The beginning of the pkvm_vmemmap region for the current ++ * memblock may already be backed by the page backing the end of ++ * the previous region, so avoid mapping it twice. 
++ */ ++ start_va = max(start_va, end_va); ++ ++ end = reg->base + reg->size; ++ end_va = ALIGN((unsigned long)pkvm_phys_to_page(end), PAGE_SIZE); ++ /* vmemmap va shall below PKVM_IOVA_OFFSET*/ ++ if (end_va >= PKVM_IOVA_OFFSET) ++ return -ENOMEM; ++ if (start_va >= end_va) ++ continue; ++ ++ size = end_va - start_va; ++ /* ++ * Create mapping for vmemmap virtual address ++ * [start, start+size) to physical address ++ * [back, back+size). ++ */ ++ ret = pkvm_mmu_map(start_va, back_pa, size, 0, ++ (u64)pgprot_val(PAGE_KERNEL)); ++ if (ret) ++ return ret; ++ ++ memset(__pkvm_va(back_pa), 0, size); ++ back_pa += size; ++ } ++ ++ return 0; ++} ++ ++static int create_mmu_mapping(const struct pkvm_section sections[], ++ int section_sz) ++{ ++ unsigned long nr_pages = pkvm_mmu_pgtable_pages(); ++ int ret; ++#ifndef CONFIG_PKVM_INTEL_DEBUG ++ struct memblock_region *reg; ++ int i; ++#endif ++ ++ ret = pkvm_early_mmu_init(&pkvm_hyp->mmu_cap, ++ pkvm_mmu_pgt_base, nr_pages); ++ if (ret) ++ return ret; ++ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++ /* ++ * clone host CR3 page mapping from __page_base_offset, it covers both ++ * direct mapping and symbol mapping for pkvm (same mapping as kernel) ++ */ ++ pkvm_mmu_clone_host(pkvm_hyp->mmu_cap.level, __page_base_offset); ++#else ++ /* ++ * Create mapping for the memory in memblocks. ++ * This will include all the memory host kernel can see, as well ++ * as the memory pkvm allocated during init. ++ * ++ * The virtual address for this mapping is the same with the kernel ++ * direct mapping. ++ */ ++ for (i = 0; i < pkvm_memblock_nr; i++) { ++ reg = &pkvm_memory[i]; ++ ret = pkvm_mmu_map((unsigned long)__pkvm_va(reg->base), ++ reg->base, reg->size, ++ 0, (u64)pgprot_val(PAGE_KERNEL)); ++ if (ret) ++ return ret; ++ } ++ ++ for (i = 0; i < section_sz; i++) { ++ if (sections[i].type != PKVM_RESERVED_MEMORY) { ++ ret = pkvm_mmu_map(sections[i].addr, ++ __pkvm_pa_symbol(sections[i].addr), ++ sections[i].size, ++ 0, sections[i].prot); ++ } ++ if (ret) ++ return ret; ++ } ++#endif ++ ++ ret = pkvm_back_vmemmap(__pkvm_pa(pkvm_vmemmap_base)); ++ if (ret) ++ return ret; ++ ++ /* Switch the mmu pgtable to enable pkvm_vmemmap */ ++ native_write_cr3(pkvm_hyp->mmu->root_pa); ++ ++ pkvm_later_mmu_init(pkvm_mmu_pgt_base, nr_pages); ++ ++ return 0; ++} ++ ++static int create_host_ept_mapping(void) ++{ ++ struct memblock_region *reg; ++ int ret, i; ++ unsigned long phys = 0; ++ u64 entry_prot; ++ ++ ret = pkvm_host_ept_init(&pkvm_hyp->ept_cap, ++ host_ept_pgt_base, host_ept_pgtable_pages()); ++ if (ret) ++ return ret; ++ ++ /* ++ * Create EPT mapping for memory with WB + RWX property ++ */ ++ entry_prot = pkvm_mkstate(HOST_EPT_DEF_MEM_PROT, PKVM_PAGE_OWNED); ++ for (i = 0; i < pkvm_memblock_nr; i++) { ++ reg = &pkvm_memory[i]; ++ ret = pkvm_host_ept_map((unsigned long)reg->base, ++ (unsigned long)reg->base, ++ (unsigned long)reg->size, ++ 0, entry_prot); ++ pkvm_info("create_host_ept_mapping(): mapped 0x%llx -> 0x%llx, sz %llu\n", reg->base, reg->base, reg->size); ++ if (ret) ++ return ret; ++ } ++ ++ /* ++ * The holes in memblocks are treated as MMIO with the ++ * mapping UC + RWX. 
++ */
++	entry_prot = pkvm_mkstate(HOST_EPT_DEF_MMIO_PROT, PKVM_PAGE_OWNED);
++	for (i = 0; i < pkvm_memblock_nr; i++, phys = reg->base + reg->size) {
++		reg = &pkvm_memory[i];
++		pkvm_info("create_host_ept_mapping(): mapped 0x%lx -> 0x%lx, sz %llu\n", phys, phys, reg->base - phys);
++		ret = pkvm_host_ept_map(phys, phys, (unsigned long)reg->base - phys,
++					0, entry_prot);
++		if (ret)
++			return ret;
++	}
++
++	return 0;
++}
++
++static int protect_pkvm_pages(const struct pkvm_section sections[],
++			      int section_sz, phys_addr_t phys, unsigned long size)
++{
++	int i, ret;
++
++	for (i = 0; i < section_sz; i++) {
++		u64 pa, size;
++
++		if (sections[i].type == PKVM_CODE_DATA_SECTIONS) {
++			pa = __pkvm_pa_symbol(sections[i].addr);
++			size = sections[i].size;
++			kvm_info("protect_pkvm_pages(): unmapping pkvm addr 0x%llx -> 0x%llx, sz %llu\n", pa, pa, size);
++			ret = pkvm_host_ept_unmap(pa, pa, size);
++			if (ret) {
++				pkvm_err("%s: failed to protect section\n", __func__);
++				return ret;
++			}
++		}
++	}
++
++	ret = pkvm_host_ept_unmap(phys, phys, size);
++	kvm_info("protect_pkvm_pages(): unmapping pkvm addr 0x%llx -> 0x%llx, sz %lu\n", phys, phys, size);
++	if (ret) {
++		pkvm_err("%s: failed to protect reserved memory\n", __func__);
++		return ret;
++	}
++
++	return 0;
++}
++
++static int create_iommu(void)
++{
++	int nr_pages = pkvm_iommu_pages(PKVM_MAX_PASID, PKVM_MAX_PASID_PDEV_NUM,
++					PKVM_MAX_PDEV_NUM, PKVM_MAX_IOMMU_NUM,
++					PKVM_QI_DESC_ALIGNED_SIZE,
++					PKVM_QI_DESC_STATUS_ALIGNED_SIZE,
++					pkvm_hyp->num_cpus);
++
++	return pkvm_init_iommu(pkvm_virt_to_phys(iommu_mem_base), nr_pages);
++}
++
++#define TMP_SECTION_SZ 16UL
++int __pkvm_init_finalise(struct kvm_vcpu *vcpu, struct pkvm_section sections[],
++			 int section_sz)
++{
++	int i, ret = 0;
++	static bool pkvm_init;
++	struct pkvm_host_vcpu *pkvm_host_vcpu = to_pkvm_hvcpu(vcpu);
++	struct pkvm_pcpu *pcpu = pkvm_host_vcpu->pcpu;
++	struct pkvm_section tmp_sections[TMP_SECTION_SZ];
++	phys_addr_t pkvm_mem_base;
++	unsigned long pkvm_mem_size = 0;
++	u64 eptp;
++
++	if (pkvm_init) {
++		/* Switch to pkvm mmu in root mode in case some setup may need this */
++		native_write_cr3(pkvm_hyp->mmu->root_pa);
++		goto switch_pgt;
++	}
++
++	if (section_sz > TMP_SECTION_SZ) {
++		pkvm_err("pkvm: not enough space to save the sections[] array parameters!");
++		goto out;
++	}
++
++	/* The kernel may use VMAP_STACK, which could make the parameter's vaddr
++	 * invalid after we switch to the new CR3 later, so copy the parameter
++	 * sections array from host space to pkvm space.
++	 */
++	for (i = 0; i < section_sz; i++) {
++		tmp_sections[i] = sections[i];
++		if (sections[i].type == PKVM_RESERVED_MEMORY) {
++			pkvm_mem_base = sections[i].addr;
++			pkvm_mem_size = sections[i].size;
++		}
++	}
++	if (pkvm_mem_size == 0) {
++		pkvm_err("pkvm: no pkvm reserved memory!");
++		goto out;
++	}
++
++	ret = divide_memory_pool(pkvm_mem_base, pkvm_mem_size);
++	if (ret) {
++		pkvm_err("pkvm: did not reserve enough memory!");
++		goto out;
++	}
++
++	ret = create_mmu_mapping(tmp_sections, section_sz);
++	if (ret)
++		goto out;
++
++	ret = create_host_ept_mapping();
++	if (ret)
++		goto out;
++
++	ret = protect_pkvm_pages(tmp_sections, section_sz,
++				 pkvm_mem_base, pkvm_mem_size);
++	if (ret)
++		goto out;
++
++	ret = init_finalize_pci(&pkvm_hyp->host_vm.pci_info);
++	if (ret)
++		goto out;
++
++	ret = create_iommu();
++	if (ret)
++		goto out;
++
++	pkvm_init_nest();
++
++	ret = pkvm_shadow_ept_pool_init(shadow_ept_base,
++			pkvm_shadow_ept_pgtable_pages(PKVM_MAX_NORMAL_VM_NUM +
++						      PKVM_MAX_SECURE_VM_NUM) +
++
pkvm_host_shadow_iommu_pgtable_pages(PKVM_MAX_PDEV_NUM)); ++ if (ret) ++ goto out; ++ ++ pkvm_init = true; ++ ++switch_pgt: ++ /* switch mmu */ ++ vmcs_writel(HOST_CR3, pkvm_hyp->mmu->root_pa); ++ pcpu->cr3 = pkvm_hyp->mmu->root_pa; ++ ++ /* enable ept */ ++ eptp = pkvm_construct_eptp(pkvm_hyp->host_vm.ept->root_pa, ++ pkvm_hyp->host_vm.ept->level); ++ secondary_exec_controls_setbit(&pkvm_host_vcpu->vmx, SECONDARY_EXEC_ENABLE_EPT); ++ vmcs_write64(EPT_POINTER, eptp); ++ ++ ept_sync_global(); ++ ++ ret = pkvm_setup_lapic(pcpu, vcpu->cpu); ++out: ++ return ret; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/io.h b/arch/x86/kvm/vmx/pkvm/hyp/io.h +new file mode 100644 +index 000000000000..bf62bbdc1697 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/io.h +@@ -0,0 +1,82 @@ ++/* SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2023 Intel Corporation ++ */ ++#ifndef _PKVM_IO_H_ ++#define _PKVM_IO_H_ ++ ++/* Size mask for I/O access */ ++#define IO_SIZE_1 1 ++#define IO_SIZE_2 2 ++#define IO_SIZE_4 4 ++#define IO_SIZE_FULL 7 ++ ++static inline void pkvm_pio_read(unsigned int port, int size, unsigned long *value) ++{ ++ switch (size) { ++ case IO_SIZE_1: ++ *(u8 *)value = inb(port); ++ break; ++ case IO_SIZE_2: ++ *(u16 *)value = inw(port); ++ break; ++ case IO_SIZE_4: ++ *(u32 *)value = inl(port); ++ break; ++ default: ++ break; ++ } ++} ++ ++static inline void pkvm_pio_write(unsigned int port, int size, unsigned long value) ++{ ++ switch (size) { ++ case IO_SIZE_1: ++ outb((u8)value, port); ++ break; ++ case IO_SIZE_2: ++ outw((u16)value, port); ++ break; ++ case IO_SIZE_4: ++ outl((u32)value, port); ++ break; ++ default: ++ break; ++ } ++} ++ ++ ++static inline void pkvm_mmio_read(u64 pos, int size, unsigned long *value) ++{ ++ switch (size) { ++ case IO_SIZE_1: ++ asm volatile("movb (%1),%%al" : "=a" (*(u8 *)value) : "r" (pos)); ++ break; ++ case IO_SIZE_2: ++ asm volatile("movw (%1),%%ax" : "=a" (*(u16 *)value) : "r" (pos)); ++ break; ++ case IO_SIZE_4: ++ asm volatile("movl (%1),%%eax" : "=a" (*(u32 *)value) : "r" (pos)); ++ break; ++ default: ++ break; ++ } ++} ++ ++static inline void pkvm_mmio_write(u64 pos, int size, unsigned long value) ++{ ++ switch (size) { ++ case IO_SIZE_1: ++ asm volatile("movb %%al,(%1)" : : "a" ((u8)value), "r" (pos) : "memory"); ++ break; ++ case IO_SIZE_2: ++ asm volatile("movw %%ax,(%1)" : : "a" ((u16)value), "r" (pos) : "memory"); ++ break; ++ case IO_SIZE_4: ++ asm volatile("movl %%eax,(%1)" : : "a" ((u32)value), "r" (pos) : "memory"); ++ break; ++ default: ++ break; ++ } ++} ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/io_emulate.c b/arch/x86/kvm/vmx/pkvm/hyp/io_emulate.c +new file mode 100644 +index 000000000000..d48d804aaf12 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/io_emulate.c +@@ -0,0 +1,374 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* Copyright(c) 2023 Intel Corporation. 
*/ ++#include ++#include "ept.h" ++#include "io.h" ++#include "io_emulate.h" ++ ++struct pkvm_pio_emul_table host_pio_emul_table; ++struct pkvm_mmio_emul_table host_mmio_emul_table; ++ ++static int pkvm_pio_default_in(struct kvm_vcpu *vcpu, struct pkvm_pio_req *req) ++{ ++ pkvm_pio_read(req->port, req->size, req->value); ++ return 0; ++} ++ ++static int pkvm_pio_default_out(struct kvm_vcpu *vcpu, struct pkvm_pio_req *req) ++{ ++ pkvm_pio_write(req->port, req->size, *req->value); ++ return 0; ++} ++ ++struct pkvm_pio_handler default_pio_handler = { ++ .read = pkvm_pio_default_in, ++ .write = pkvm_pio_default_out ++}; ++ ++/* ++ * Not thread safe and should hold a lock if called concurrently. ++ */ ++int register_host_pio_handler(struct pkvm_host_vm *host_vm, unsigned int port, ++ unsigned int size_mask, pio_handler_t read, pio_handler_t write) ++{ ++ struct pkvm_pio_emul_table *table; ++ struct pkvm_pio_handler *handler; ++ unsigned long index; ++ u8 bit; ++ ++ table = &host_pio_emul_table; ++ index = find_first_zero_bit(table->bitmap, PKVM_MAX_PIO_EMUL_NUM); ++ if (index >= PKVM_MAX_PIO_EMUL_NUM) ++ return -ENOSPC; ++ ++ __set_bit(index, table->bitmap); ++ ++ handler = &table->table[index]; ++ handler->port = port; ++ handler->size_mask = size_mask; ++ handler->read = read; ++ handler->write = write; ++ ++ index = port >> 3U; ++ bit = (u8)(1U << (port & 0x7U)); ++ host_vm->io_bitmap[index] |= bit; ++ ++ return 0; ++} ++ ++static bool pio_access_valid(int size) ++{ ++ return size == IO_SIZE_1 || size == IO_SIZE_2 || size == IO_SIZE_4; ++} ++ ++static struct pkvm_pio_handler *get_pio_handler(struct pkvm_pio_emul_table *table, ++ struct pkvm_pio_req *req) ++{ ++ struct pkvm_pio_handler *handler; ++ unsigned long index; ++ /* ++ * Port I/O access is expected to only based on their address and have a ++ * fixed access width. Note that they might overlap, for example PCI config ++ * space addr port 0xcf8 and ACPI reset port 0xcf9. So match the handler ++ * strictly based on their base address and access width here. ++ * ++ * There are two special situations to consider. One case is that the base ++ * address matches but the access width differs, this is regarded as an ++ * invalid access and thus return a NULL handler. Another case is no base ++ * address matches. This is due to an overlapped I/O access that triggered ++ * the IO VM exit, but we are not intended to handle the base address. So ++ * in this case choose the default handler to do plain pio. 
++ */
++	for_each_set_bit(index, table->bitmap, PKVM_MAX_PIO_EMUL_NUM) {
++		handler = &table->table[index];
++		if (req->port == handler->port) {
++			if (pio_access_valid(req->size) && (req->size & handler->size_mask))
++				return handler;
++
++			pkvm_err("pkvm: I/O port 0x%x mismatched access width %d",
++				 req->port, req->size);
++			return NULL;
++		}
++	}
++
++	return &default_pio_handler;
++}
++
++static int emulate_host_pio(struct kvm_vcpu *vcpu, struct pkvm_pio_req *req)
++{
++	struct pkvm_pio_emul_table *table;
++	struct pkvm_pio_handler *handler;
++	int ret = 0;
++
++	table = &host_pio_emul_table;
++	handler = get_pio_handler(table, req);
++	if (!handler)
++		return -EINVAL;
++
++	if (req->direction == PKVM_IO_READ && handler->read)
++		ret = handler->read(vcpu, req);
++	else if (req->direction == PKVM_IO_WRITE && handler->write)
++		ret = handler->write(vcpu, req);
++
++	return ret;
++}
++
++int handle_host_pio(struct kvm_vcpu *vcpu)
++{
++	struct vcpu_vmx *vmx = to_vmx(vcpu);
++	unsigned long exit_qual;
++	struct pkvm_pio_req req;
++	int string;
++
++	exit_qual = vmx->exit_qualification;
++
++	string = (exit_qual & 16) != 0;
++	if (string) {
++		pkvm_err("pkvm: unsupported string instruction\n");
++		return -EINVAL;
++	}
++
++	req.port = exit_qual >> 16;
++	req.size = (exit_qual & 7) + 1;
++	req.value = &vcpu->arch.regs[VCPU_REGS_RAX];
++	req.direction = (exit_qual & 8) == 0;
++
++	pkvm_dbg("pkvm: host %s I/O port 0x%x width %d value %lx", req.direction ?
++		 "write" : "read", req.port, req.size, *req.value);
++
++	return emulate_host_pio(vcpu, &req);
++}
++
++static int pkvm_mmio_default_read(struct kvm_vcpu *vcpu, struct pkvm_mmio_req *req)
++{
++	pkvm_mmio_read((u64)host_mmio2hva(req->address), req->size, req->value);
++	return 0;
++}
++
++static int pkvm_mmio_default_write(struct kvm_vcpu *vcpu, struct pkvm_mmio_req *req)
++{
++	pkvm_mmio_write((u64)host_mmio2hva(req->address), req->size, *req->value);
++	return 0;
++}
++
++struct pkvm_mmio_handler default_mmio_handler = {
++	.read = pkvm_mmio_default_read,
++	.write = pkvm_mmio_default_write
++};
++
++static struct pkvm_mmio_handler *emul_mmio_lookup(struct pkvm_mmio_emul_table *table,
++						  unsigned long start, unsigned long end)
++{
++	struct pkvm_mmio_handler *handler;
++	unsigned long index;
++
++	for_each_set_bit(index, table->bitmap, PKVM_MAX_MMIO_EMUL_NUM) {
++		handler = &table->table[index];
++		if (start <= handler->end && handler->start <= end)
++			return handler;
++	}
++
++	return NULL;
++}
++
++/*
++ * Not thread safe and should hold a lock if called concurrently.
++ */
++int register_host_mmio_handler(unsigned long start, unsigned long end,
++			       mmio_handler_t read, mmio_handler_t write)
++{
++	struct pkvm_mmio_emul_table *table;
++	struct pkvm_mmio_handler *handler;
++	unsigned long index;
++	int ret = 0;
++
++	if (start > end)
++		return -EINVAL;
++
++	table = &host_mmio_emul_table;
++
++	if (emul_mmio_lookup(table, start, end))
++		return -EINVAL;
++
++	index = find_first_zero_bit(table->bitmap, PKVM_MAX_MMIO_EMUL_NUM);
++	if (index >= PKVM_MAX_MMIO_EMUL_NUM)
++		return -ENOSPC;
++
++	__set_bit(index, table->bitmap);
++
++	handler = &table->table[index];
++	handler->start = start;
++	handler->end = end;
++	handler->read = read;
++	handler->write = write;
++
++	host_ept_lock();
++	ret = pkvm_host_ept_unmap(start, start, end - start + 1);
++	host_ept_unlock();
++
++	return ret;
++}
++
++/*
++ * mmcfg access in x86 only uses simple mov instructions, so keep the decoder
++ * simple for now.
++ * TODO: make the decoder complete ++ */ ++static int mmio_instruction_decode(struct kvm_vcpu *vcpu, unsigned long gpa, ++ struct pkvm_mmio_req *req) ++{ ++ struct x86_exception exception; ++ bool direction, zero_extend = false; ++ unsigned long rip; ++ u8 insn[3]; ++ int size; ++ ++ rip = vmcs_readl(GUEST_RIP); ++ ++ /* ++ * Read first three bytes is enough to determine the opcode. ++ * Check arch/x86/include/asm/pci_x86.h. ++ */ ++ if (read_gva(vcpu, rip, insn, 3, &exception) < 0) ++ return -EINVAL; ++ ++ /* ++ * In case the compiler adds the REX prefix ++ */ ++ if ((insn[0] & 0xf0) == 0x40) { ++ insn[0] = insn[1]; ++ insn[1] = insn[2]; ++ } ++ ++ if (insn[0] == 0x66 && (insn[1] & 0xf0) == 0x40) ++ insn[1] = insn[2]; ++ ++ switch (insn[0]) { ++ case 0x0f: ++ switch (insn[1]) { ++ case 0xb6: ++ zero_extend = true; ++ direction = PKVM_IO_READ; ++ size = 1; ++ break; ++ default: ++ return -EIO; ++ } ++ break; ++ case 0x66: ++ size = 2; ++ switch (insn[1]) { ++ case 0x89: ++ direction = PKVM_IO_WRITE; ++ break; ++ case 0x8b: ++ direction = PKVM_IO_READ; ++ break; ++ default: ++ return -EIO; ++ } ++ break; ++ case 0x88: ++ size = 1; ++ direction = PKVM_IO_WRITE; ++ break; ++ case 0x89: ++ size = 4; ++ direction = PKVM_IO_WRITE; ++ break; ++ case 0x8a: ++ size = 1; ++ direction = PKVM_IO_READ; ++ break; ++ case 0x8b: ++ size = 4; ++ direction = PKVM_IO_READ; ++ break; ++ default: ++ return -EIO; ++ } ++ ++ req->address = gpa; ++ req->size = size; ++ req->value = &vcpu->arch.regs[VCPU_REGS_RAX]; ++ req->direction = direction; ++ ++ if (zero_extend) ++ *req->value = 0; ++ ++ return 0; ++} ++ ++static struct pkvm_mmio_handler *get_mmio_handler(struct pkvm_mmio_emul_table *table, ++ struct pkvm_mmio_req *req) ++{ ++ struct pkvm_mmio_handler *handler; ++ unsigned long start, end; ++ ++ start = req->address; ++ end = req->address + req->size - 1; ++ ++ handler = emul_mmio_lookup(table, start, end); ++ ++ /* ++ * If handler is NULL, this is an access that does not touch the emulated ++ * MMIO range. Return the default handler. ++ */ ++ if (!handler) ++ return &default_mmio_handler; ++ ++ /* Do not allow the access to cross the boundary. */ ++ if ((start < handler->start && end >= handler->start) || ++ (start <= handler->end && end > handler->end)) ++ return NULL; ++ ++ return handler; ++} ++ ++static int emulate_host_mmio(struct kvm_vcpu *vcpu, struct pkvm_mmio_req *req) ++{ ++ struct pkvm_mmio_emul_table *table; ++ struct pkvm_mmio_handler *handler; ++ int ret = 0; ++ ++ table = &host_mmio_emul_table; ++ ++ handler = get_mmio_handler(table, req); ++ if (!handler) ++ return -EINVAL; ++ ++ if (req->direction == PKVM_IO_READ && handler->read) ++ ret = handler->read(vcpu, req); ++ else if (req->direction == PKVM_IO_WRITE && handler->write) ++ ret = handler->write(vcpu, req); ++ ++ return ret; ++} ++ ++static int handle_host_mmio(struct kvm_vcpu *vcpu, unsigned long gpa) ++{ ++ struct pkvm_mmio_req req; ++ ++ if (mmio_instruction_decode(vcpu, gpa, &req)) { ++ pkvm_dbg("pkvm: MMIO instruction decode failed"); ++ return -EINVAL; ++ } ++ ++ pkvm_dbg("pkvm: host %s MMIO gpa 0x%lx width %d value 0x%lx", req.direction ? 
++ "write" : "read", req.address, req.size, *req.value); ++ ++ return emulate_host_mmio(vcpu, &req); ++} ++ ++int try_emul_host_mmio(struct kvm_vcpu *vcpu, unsigned long gpa) ++{ ++ if (emul_mmio_lookup(&host_mmio_emul_table, gpa, gpa) == NULL) ++ return -EINVAL; ++ ++ if (handle_host_mmio(vcpu, gpa)) { ++ pkvm_err("%s: emulate MMIO failed for memory address 0x%lx\n", __func__, gpa); ++ return -EIO; ++ } ++ ++ return 0; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/io_emulate.h b/arch/x86/kvm/vmx/pkvm/hyp/io_emulate.h +new file mode 100644 +index 000000000000..d9303bd8bf20 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/io_emulate.h +@@ -0,0 +1,67 @@ ++/* SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2023 Intel Corporation ++ */ ++#ifndef _PKVM_IO_EMULATE_H_ ++#define _PKVM_IO_EMULATE_H_ ++ ++/* I/O direction */ ++#define PKVM_IO_READ 0 ++#define PKVM_IO_WRITE 1 ++ ++/* Max num of port I/O emulation handlers */ ++#define PKVM_MAX_PIO_EMUL_NUM 32 ++ ++struct pkvm_pio_req { ++ unsigned int port; ++ int size; ++ bool direction; ++ unsigned long *value; ++}; ++ ++typedef int (*pio_handler_t)(struct kvm_vcpu *, struct pkvm_pio_req *); ++ ++struct pkvm_pio_handler { ++ unsigned int port; ++ int size_mask; ++ pio_handler_t read; ++ pio_handler_t write; ++}; ++ ++struct pkvm_pio_emul_table { ++ struct pkvm_pio_handler table[PKVM_MAX_PIO_EMUL_NUM]; ++ DECLARE_BITMAP(bitmap, PKVM_MAX_PIO_EMUL_NUM); ++}; ++ ++/* Max num of memory mapped I/O emulation handlers */ ++#define PKVM_MAX_MMIO_EMUL_NUM 256 ++ ++struct pkvm_mmio_req { ++ unsigned long address; ++ int size; ++ bool direction; ++ unsigned long *value; ++}; ++ ++typedef int (*mmio_handler_t)(struct kvm_vcpu *, struct pkvm_mmio_req *); ++ ++struct pkvm_mmio_handler { ++ unsigned long start; ++ unsigned long end; ++ mmio_handler_t read; ++ mmio_handler_t write; ++}; ++ ++struct pkvm_mmio_emul_table { ++ struct pkvm_mmio_handler table[PKVM_MAX_MMIO_EMUL_NUM]; ++ DECLARE_BITMAP(bitmap, PKVM_MAX_MMIO_EMUL_NUM); ++}; ++ ++int register_host_pio_handler(struct pkvm_host_vm *host_vm, unsigned int port, ++ unsigned int size_mask, pio_handler_t read, pio_handler_t write); ++int handle_host_pio(struct kvm_vcpu *vcpu); ++ ++int register_host_mmio_handler(unsigned long start, unsigned long end, ++ mmio_handler_t read, mmio_handler_t write); ++int try_emul_host_mmio(struct kvm_vcpu *vcpu, unsigned long gpa); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/iommu.c b/arch/x86/kvm/vmx/pkvm/hyp/iommu.c +new file mode 100644 +index 000000000000..6556ee9f4884 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/iommu.c +@@ -0,0 +1,2372 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#include ++#include ++#include ++#include ++#include "pkvm_hyp.h" ++#include "memory.h" ++#include "mmu.h" ++#include "ept.h" ++#include "pgtable.h" ++#include "iommu_internal.h" ++#include "debug.h" ++#include "ptdev.h" ++#include "iommu_spgt.h" ++#include "bug.h" ++ ++#define for_each_valid_iommu(p) \ ++ for ((p) = iommus; (p) < iommus + PKVM_MAX_IOMMU_NUM; (p)++) \ ++ if (!(p) || !(p)->iommu.reg_phys) { \ ++ continue; \ ++ } else ++ ++static struct pkvm_iommu iommus[PKVM_MAX_IOMMU_NUM]; ++ ++static struct pkvm_pool iommu_pool; ++ ++/* Used in legacy mode only. */ ++struct shadow_pgt_sync_data { ++ unsigned long vaddr; ++ unsigned long vaddr_end; ++}; ++ ++/* ++ * Guest root/context/pasid table (hereinafter "id table") walking parameter. ++ * pkvm IOMMU driver walks the guest page table when syncing ++ * with the shadow id table. 
++ */ ++struct id_sync_walk_data { ++ struct pkvm_iommu *iommu; ++ /* ++ * Used to hold shadow id table physical address ++ * which is used for sync shadow entries at each ++ * id table level. ++ */ ++ u64 shadow_pa[IOMMU_SM_LEVEL_NUM]; ++ /* ++ * Used when just syncing a part of shadow ++ * id table entries which match with this did if ++ * it is set as a non-zero did value. ++ */ ++ u16 did; ++ /* ++ * Used in legacy mode when just syncing a specific ++ * range of pages in shadow page tables. ++ */ ++ struct shadow_pgt_sync_data *spgt_data; ++}; ++ ++#define DEFINE_ID_SYNC_WALK_DATA(name, _iommu, domain_id, _spgt_data) \ ++ struct id_sync_walk_data (name) = { \ ++ .iommu = (_iommu), \ ++ .shadow_pa = {0}, \ ++ .did = (domain_id), \ ++ .spgt_data = (_spgt_data), \ ++ } ++ ++/* ++ * Used to config a shadow id table entry in root/context/pasid ++ * level. ++ */ ++struct id_sync_data { ++ union { ++ u64 root_entry; ++ struct context_entry ct_entry; ++ struct pasid_dir_entry pd_entry; ++ struct pasid_entry p_entry; ++ }; ++ void *guest_ptep; ++ void *shadow_ptep; ++ int level; ++ u64 iommu_ecap; ++ u64 shadow_pa; ++ struct pkvm_pgtable *shadow_id; ++ unsigned long vaddr; ++ struct shadow_pgt_sync_data *spgt_data; ++}; ++ ++static inline void *iommu_zalloc_pages(size_t size) ++{ ++ return pkvm_alloc_pages(&iommu_pool, get_order(size)); ++} ++ ++static void *iommu_zalloc_page(void) ++{ ++ return pkvm_alloc_pages(&iommu_pool, 0); ++} ++ ++static void iommu_get_page(void *vaddr) ++{ ++ pkvm_get_page(&iommu_pool, vaddr); ++} ++ ++static void iommu_put_page(void *vaddr) ++{ ++ pkvm_put_page(&iommu_pool, vaddr); ++} ++ ++static void iommu_flush_cache(void *ptep, unsigned int size) ++{ ++ pkvm_clflush_cache_range(ptep, size); ++} ++ ++static struct pkvm_mm_ops viommu_mm_ops = { ++ .phys_to_virt = host_gpa2hva, ++}; ++ ++static struct pkvm_mm_ops iommu_pw_coherency_mm_ops = { ++ .phys_to_virt = pkvm_phys_to_virt, ++ .virt_to_phys = pkvm_virt_to_phys, ++ .zalloc_page = iommu_zalloc_page, ++ .get_page = iommu_get_page, ++ .put_page = iommu_put_page, ++ .page_count = pkvm_page_count, ++}; ++ ++static struct pkvm_mm_ops iommu_pw_noncoherency_mm_ops = { ++ .phys_to_virt = pkvm_phys_to_virt, ++ .virt_to_phys = pkvm_virt_to_phys, ++ .zalloc_page = iommu_zalloc_page, ++ .get_page = iommu_get_page, ++ .put_page = iommu_put_page, ++ .page_count = pkvm_page_count, ++ .flush_cache = iommu_flush_cache, ++}; ++ ++static bool iommu_id_entry_present(void *ptep) ++{ ++ u64 val; ++ ++ val = *(u64 *)ptep; ++ return !!(val & 1); ++} ++ ++static unsigned long iommu_id_entry_to_phys(void *ptep) ++{ ++ u64 val = *(u64 *)ptep; ++ ++ return val & VTD_PAGE_MASK; ++} ++ ++static int iommu_sm_id_entry_to_index(unsigned long vaddr, int level) ++{ ++ switch (level) { ++ case IOMMU_PASID_TABLE: ++ return vaddr & (BIT(PASIDDIR_BITS) - 1); ++ case IOMMU_PASID_DIR: ++ return (vaddr >> PASIDDIR_SHIFT) & (BIT(PASIDDIR_BITS) - 1); ++ case IOMMU_SM_CONTEXT: ++ return (vaddr >> DEVFN_SHIFT) & (BIT(SM_DEVFN_BITS) - 1); ++ case IOMMU_SM_ROOT: ++ return (vaddr >> SM_BUS_SHIFT) & (BIT(SM_BUS_BITS) - 1); ++ default: ++ break; ++ } ++ ++ return -EINVAL; ++} ++ ++static bool iommu_id_entry_is_leaf(void *ptep, int level) ++{ ++ if (LAST_LEVEL(level) || ++ !iommu_id_entry_present(ptep)) ++ return true; ++ ++ return false; ++} ++ ++static int iommu_sm_id_level_entry_size(int level) ++{ ++ switch (level) { ++ case IOMMU_PASID_TABLE: ++ return sizeof(struct pasid_entry); ++ case IOMMU_PASID_DIR: ++ return sizeof(struct pasid_dir_entry); ++ case 
IOMMU_SM_CONTEXT: ++ /* scalable mode requires 32bytes for context */ ++ return sizeof(struct context_entry) * 2; ++ case IOMMU_SM_ROOT: ++ return sizeof(u64); ++ default: ++ break; ++ } ++ ++ return -EINVAL; ++} ++ ++static int iommu_sm_id_level_to_entries(int level) ++{ ++ switch (level) { ++ case IOMMU_PASID_TABLE: ++ return 1 << PASIDTAB_BITS; ++ case IOMMU_PASID_DIR: ++ return 1 << PASIDDIR_BITS; ++ case IOMMU_SM_CONTEXT: ++ return 1 << SM_DEVFN_BITS; ++ case IOMMU_SM_ROOT: ++ return 1 << SM_BUS_BITS; ++ default: ++ break; ++ } ++ ++ return -EINVAL; ++} ++ ++static unsigned long iommu_sm_id_level_to_size(int level) ++{ ++ switch (level) { ++ case IOMMU_PASID_TABLE: ++ return 1; ++ case IOMMU_PASID_DIR: ++ return 1 << PASIDDIR_SHIFT; ++ case IOMMU_SM_CONTEXT: ++ return 1 << DEVFN_SHIFT; ++ case IOMMU_SM_ROOT: ++ return 1 << SM_BUS_SHIFT; ++ default: ++ break; ++ } ++ ++ return 0; ++} ++ ++struct pkvm_pgtable_ops iommu_sm_id_ops = { ++ .pgt_entry_present = iommu_id_entry_present, ++ .pgt_entry_to_phys = iommu_id_entry_to_phys, ++ .pgt_entry_to_index = iommu_sm_id_entry_to_index, ++ .pgt_entry_is_leaf = iommu_id_entry_is_leaf, ++ .pgt_level_entry_size = iommu_sm_id_level_entry_size, ++ .pgt_level_to_entries = iommu_sm_id_level_to_entries, ++ .pgt_level_to_size = iommu_sm_id_level_to_size, ++}; ++ ++static int iommu_lm_id_entry_to_index(unsigned long vaddr, int level) ++{ ++ switch (level) { ++ case IOMMU_LM_CONTEXT: ++ return (vaddr >> LM_DEVFN_SHIFT) & (BIT(LM_DEVFN_BITS) - 1); ++ case IOMMU_LM_ROOT: ++ return (vaddr >> LM_BUS_SHIFT) & (BIT(LM_BUS_BITS) - 1); ++ default: ++ break; ++ } ++ ++ return -EINVAL; ++} ++ ++static int iommu_lm_id_level_entry_size(int level) ++{ ++ switch (level) { ++ case IOMMU_LM_CONTEXT: ++ return sizeof(struct context_entry); ++ case IOMMU_LM_ROOT: ++ return sizeof(struct root_entry); ++ default: ++ break; ++ } ++ ++ return -EINVAL; ++} ++ ++static int iommu_lm_id_level_to_entries(int level) ++{ ++ switch (level) { ++ case IOMMU_LM_CONTEXT: ++ return 1 << LM_DEVFN_BITS; ++ case IOMMU_LM_ROOT: ++ return 1 << LM_BUS_BITS; ++ default: ++ break; ++ } ++ ++ return -EINVAL; ++} ++ ++static unsigned long iommu_lm_id_level_to_size(int level) ++{ ++ switch (level) { ++ case IOMMU_LM_CONTEXT: ++ return 1 << LM_DEVFN_SHIFT; ++ case IOMMU_LM_ROOT: ++ return 1 << LM_BUS_SHIFT; ++ default: ++ break; ++ } ++ ++ return 0; ++} ++ ++struct pkvm_pgtable_ops iommu_lm_id_ops = { ++ .pgt_entry_present = iommu_id_entry_present, ++ .pgt_entry_to_phys = iommu_id_entry_to_phys, ++ .pgt_entry_to_index = iommu_lm_id_entry_to_index, ++ .pgt_entry_is_leaf = iommu_id_entry_is_leaf, ++ .pgt_level_entry_size = iommu_lm_id_level_entry_size, ++ .pgt_level_to_entries = iommu_lm_id_level_to_entries, ++ .pgt_level_to_size = iommu_lm_id_level_to_size, ++}; ++ ++static int iommu_pgtable_walk(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long vaddr_end, struct pkvm_pgtable_walker *walker) ++{ ++ if (!pgt->root_pa) ++ return 0; ++ ++ return pgtable_walk(pgt, vaddr, vaddr_end - vaddr, false, walker); ++} ++ ++static struct pkvm_ptdev *iommu_find_ptdev(struct pkvm_iommu *iommu, u16 bdf, u32 pasid) ++{ ++ struct pkvm_ptdev *p; ++ ++ list_for_each_entry(p, &iommu->ptdev_head, iommu_node) { ++ if (match_ptdev(p, bdf, pasid)) ++ return p; ++ } ++ ++ return NULL; ++} ++ ++static inline bool iommu_coherency(u64 ecap) ++{ ++ return ecap_smts(ecap) ? 
ecap_smpwc(ecap) : ecap_coherent(ecap); ++} ++ ++static struct pkvm_ptdev *iommu_add_ptdev(struct pkvm_iommu *iommu, u16 bdf, u32 pasid) ++{ ++ struct pkvm_ptdev *ptdev = pkvm_get_ptdev(bdf, pasid); ++ ++ if (!ptdev) { ++ ptdev = pkvm_alloc_ptdev(bdf, pasid, iommu_coherency(iommu->iommu.ecap)); ++ if (!ptdev) ++ return NULL; ++ } ++ ++ list_add_tail(&ptdev->iommu_node, &iommu->ptdev_head); ++ return ptdev; ++} ++ ++static void iommu_del_ptdev(struct pkvm_iommu *iommu, struct pkvm_ptdev *ptdev) ++{ ++ list_del_init(&ptdev->iommu_node); ++ pkvm_put_ptdev(ptdev); ++} ++ ++static int iommu_audit_did(struct pkvm_iommu *iommu, u16 did, int shadow_vm_handle) ++{ ++ struct pkvm_ptdev *tmp; ++ int ret = 0; ++ ++ list_for_each_entry(tmp, &iommu->ptdev_head, iommu_node) { ++ if (tmp->shadow_vm_handle != shadow_vm_handle) { ++ if (tmp->did == did) { ++ /* ++ * The devices belong to different VMs but behind ++ * the same IOMMU, cannot use the same did. ++ */ ++ ret = -EPERM; ++ break; ++ } ++ } ++ } ++ ++ return ret; ++} ++ ++static int shadow_pgt_map_leaf(struct pkvm_pgtable *pgt, unsigned long vaddr, int level, ++ void *ptep, struct pgt_flush_data *flush_data, void *arg) ++{ ++ struct pkvm_pgtable_map_data *data = arg; ++ unsigned long map_phys; ++ int ret = 0; ++ ++ host_ept_lock(); ++ ++ pkvm_host_ept_lookup(data->phys, &map_phys, NULL, NULL); ++ if (map_phys == INVALID_ADDR) { ++ pkvm_err("pkvm: phys addr 0x%lx not mapped in host ept\n", data->phys); ++ goto out; ++ } ++ ++ ret = pgtable_map_leaf(pgt, vaddr, level, ptep, flush_data, arg); ++ ++out: ++ host_ept_unlock(); ++ return ret; ++} ++ ++/* used in legacy mode only */ ++static void sync_shadow_pgt(struct pkvm_ptdev *ptdev, struct shadow_pgt_sync_data *sdata) ++{ ++ struct pkvm_pgtable *spgt; ++ int ret; ++ ++ PKVM_ASSERT(is_pgt_ops_ept(&ptdev->vpgt)); ++ ++ /* ++ * ptdev->pgt should be already set to this shadow iommu pgtable. ++ * However, ptdev->pgt could change in the meantime due to ptdev ++ * attach to a VM. So to avoid race, do not use ptdev->pgt directly ++ * but get the same shadow iommu pgtable on our own. ++ */ ++ spgt = pkvm_get_host_iommu_spgt(ptdev->vpgt.root_pa, ptdev->iommu_coherency); ++ PKVM_ASSERT(spgt); ++ ++ if (sdata) ++ ret = pkvm_pgtable_sync_map_range(&ptdev->vpgt, spgt, ++ sdata->vaddr, ++ sdata->vaddr_end - sdata->vaddr, ++ NULL, shadow_pgt_map_leaf); ++ else ++ ret = pkvm_pgtable_sync_map(&ptdev->vpgt, spgt, ++ NULL, shadow_pgt_map_leaf); ++ PKVM_ASSERT(ret == 0); ++ ++ pkvm_put_host_iommu_spgt(spgt, ptdev->iommu_coherency); ++} ++ ++/* present root entry when shadow_pa valid, otherwise un-present it */ ++static bool sync_root_entry(struct id_sync_data *sdata) ++{ ++ u64 *sre = sdata->shadow_ptep; ++ u64 sre_val = sdata->shadow_pa ? 
(sdata->shadow_pa | 1) : 0; ++ ++ if (READ_ONCE(*sre) != sre_val) { ++ WRITE_ONCE(*sre, sre_val); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* sync context entry when guest_ptep & shadow_pa valid, otherwise un-present it */ ++static bool sync_shadow_context_entry(struct id_sync_data *sdata) ++{ ++ struct context_entry *shadow_ce = sdata->shadow_ptep, tmp = {0}; ++ struct context_entry *guest_ce = sdata->guest_ptep; ++ struct pkvm_iommu *iommu = pgt_to_pkvm_iommu(sdata->shadow_id); ++ struct pkvm_ptdev *ptdev; ++ struct pkvm_pgtable_cap cap; ++ bool updated = false; ++ u8 tt, aw; ++ u16 bdf, did; ++ ++ if (ecap_smts(sdata->iommu_ecap)) { ++ if (sdata->guest_ptep && sdata->shadow_pa) { ++ tmp.hi = guest_ce->hi; ++ tmp.lo = sdata->shadow_pa | (guest_ce->lo & 0xfff); ++ ++ /* Clear DTE to make sure device TLB is disabled for security */ ++ context_sm_clear_dte(&tmp); ++ } ++ } else { ++ /* ++ * In legacy mode, a context entry is a leaf entry responsible for ++ * configuring the actual address translation for the given ptdev, ++ * much like a PASID table entry in scalable mode. So the below logic ++ * is quite similar to the logic in sync_shadow_pasid_table_entry() ++ * for scalable mode. ++ */ ++ bdf = sdata->vaddr >> LM_DEVFN_SHIFT; ++ ptdev = iommu_find_ptdev(iommu, bdf, 0); ++ ++ if (!ptdev) { ++ ptdev = iommu_add_ptdev(iommu, bdf, 0); ++ if (!ptdev) ++ return false; ++ } ++ ++ if (!sdata->guest_ptep) { ++ if (context_lm_is_present(shadow_ce)) { ++ pkvm_setup_ptdev_vpgt(ptdev, 0, NULL, NULL, NULL, false); ++ pkvm_setup_ptdev_did(ptdev, 0); ++ iommu_del_ptdev(iommu, ptdev); ++ ++ goto update_shadow_ce; ++ } ++ return false; ++ } ++ ++ tt = context_lm_get_tt(guest_ce); ++ switch (tt) { ++ case CONTEXT_TT_MULTI_LEVEL: ++ case CONTEXT_TT_DEV_IOTLB: ++ aw = context_lm_get_aw(guest_ce); ++ if (aw != 1 && aw != 2 && aw != 3) { ++ pkvm_err("pkvm: unsupported address width %u\n", aw); ++ ++ pkvm_setup_ptdev_vpgt(ptdev, 0, NULL, NULL, NULL, false); ++ pkvm_setup_ptdev_did(ptdev, 0); ++ ++ /* ++ * TODO: our error reporting to the host for invalid ++ * values of aw or tt is not good: the host will see ++ * translation fault reason "present bit is clear" ++ * instead of "invalid entry". ++ */ ++ goto update_shadow_ce; ++ } ++ cap.level = (aw == 1) ? 3 : ++ (aw == 2) ? 4 : 5; ++ cap.allowed_pgsz = pkvm_hyp->ept_cap.allowed_pgsz; ++ pkvm_setup_ptdev_vpgt(ptdev, context_lm_get_slptr(guest_ce), ++ &viommu_mm_ops, &ept_ops, &cap, true); ++ ++ if (!ptdev_attached_to_vm(ptdev)) ++ sync_shadow_pgt(ptdev, sdata->spgt_data); ++ ++ break; ++ case CONTEXT_TT_PASS_THROUGH: ++ /* ++ * When host IOMMU driver is using pass-through mode, pkvm ++ * IOMMU will actually use the address translation ++ * (CONTEXT_TT_MULTI_LEVEL) with the primary VM's EPT ++ * to guarantee the protection. ++ */ ++ break; ++ default: ++ pkvm_err("pkvm: unsupported translation type %u\n", tt); ++ ++ pkvm_setup_ptdev_vpgt(ptdev, 0, NULL, NULL, NULL, false); ++ pkvm_setup_ptdev_did(ptdev, 0); ++ goto update_shadow_ce; ++ } ++ ++ did = context_lm_get_did(guest_ce); ++ if (iommu_audit_did(iommu, did, ptdev->shadow_vm_handle)) ++ return false; ++ ++ pkvm_setup_ptdev_did(ptdev, did); ++ ++ if (!is_pgt_ops_ept(ptdev->pgt)) ++ return false; ++ ++ tmp = *guest_ce; ++ ++ /* ++ * Always set translation type to MULTI_LEVEL to ensure address ++ * translation and to disable device TLB for security. ++ */ ++ context_lm_set_tt(&tmp, CONTEXT_TT_MULTI_LEVEL); ++ context_lm_set_slptr(&tmp, ptdev->pgt->root_pa); ++ aw = (ptdev->pgt->level == 3) ? 
1 : ++ (ptdev->pgt->level == 4) ? 2 : 3; ++ context_lm_set_aw(&tmp, aw); ++ } ++ ++update_shadow_ce: ++ if (READ_ONCE(shadow_ce->hi) != tmp.hi) { ++ WRITE_ONCE(shadow_ce->hi, tmp.hi); ++ updated = true; ++ } ++ ++ if (READ_ONCE(shadow_ce->lo) != tmp.lo) { ++ WRITE_ONCE(shadow_ce->lo, tmp.lo); ++ updated = true; ++ } ++ ++ return updated; ++} ++ ++/* sync pasid dir entry when guest_ptep & shadow_pa valid, otherwise un-present it */ ++static bool sync_shadow_pasid_dir_entry(struct id_sync_data *sdata) ++{ ++ struct pasid_dir_entry *shadow_pde = sdata->shadow_ptep; ++ u64 val = 0; ++ ++ if (sdata->guest_ptep && sdata->shadow_pa) { ++ struct pasid_dir_entry *guest_pde = sdata->guest_ptep; ++ ++ val = guest_pde->val & (PASID_PTE_FPD | PASID_PTE_PRESENT); ++ val |= sdata->shadow_pa; ++ } ++ ++ if (READ_ONCE(shadow_pde->val) != val) { ++ WRITE_ONCE(shadow_pde->val, val); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* sync pasid table entry when guest_ptep valid, otherwise un-present it */ ++static bool sync_shadow_pasid_table_entry(struct id_sync_data *sdata) ++{ ++ u16 bdf = sdata->vaddr >> DEVFN_SHIFT; ++ u32 pasid = sdata->vaddr & ((1UL << MAX_NR_PASID_BITS) - 1); ++ struct pkvm_iommu *iommu = pgt_to_pkvm_iommu(sdata->shadow_id); ++ struct pkvm_ptdev *ptdev = iommu_find_ptdev(iommu, bdf, pasid); ++ struct pasid_entry *shadow_pte = sdata->shadow_ptep, tmp_pte = {0}; ++ struct pasid_entry *guest_pte; ++ bool synced = false; ++ u64 type, aw; ++ ++ if (!ptdev) { ++ ptdev = iommu_add_ptdev(iommu, bdf, pasid); ++ if (!ptdev) ++ return false; ++ } ++ ++ if (!sdata->guest_ptep) { ++ if (pasid_pte_is_present(shadow_pte)) { ++ /* ++ * Making a pasid entry not present needs to remove ++ * the corresponding ptdev from IOMMU. It also means ++ * a ptdev's vpgt/did should be reset as well as ++ * deleting ptdev from this iommu. ++ */ ++ pkvm_setup_ptdev_vpgt(ptdev, 0, NULL, NULL, NULL, false); ++ pkvm_setup_ptdev_did(ptdev, 0); ++ iommu_del_ptdev(iommu, ptdev); ++ ++ synced = pasid_copy_entry(shadow_pte, &tmp_pte); ++ } ++ return synced; ++ } ++ ++ guest_pte = sdata->guest_ptep; ++ type = pasid_pte_get_pgtt(guest_pte); ++ if (type == PASID_ENTRY_PGTT_FL_ONLY) { ++ struct pkvm_pgtable_cap cap; ++ ++ if (ptdev_attached_to_vm(ptdev)) ++ /* ++ * For the attached ptdev, use SL Only mode with ++ * using ptdev->pgt so that the translation is ++ * totally controlled by pkvm. ++ */ ++ type = PASID_ENTRY_PGTT_SL_ONLY; ++ else ++ /* ++ * For the other ptdev, pkvm IOMMU will use nested ++ * translation to add one more layer translation to ++ * guarantee the protection. This one more layer is the ++ * primary VM's EPT. ++ */ ++ type = PASID_ENTRY_PGTT_NESTED; ++ ++ /* ptdev vpgt can be initialized with flptr */ ++ cap.level = pasid_get_flpm(guest_pte) == 0 ? 4 : 5; ++ cap.allowed_pgsz = pkvm_hyp->mmu_cap.allowed_pgsz; ++ pkvm_setup_ptdev_vpgt(ptdev, pasid_get_flptr(guest_pte), ++ &viommu_mm_ops, &mmu_ops, &cap, false); ++ } else if (type == PASID_ENTRY_PGTT_PT) { ++ /* ++ * When host IOMMU driver is using pass-through mode, pkvm ++ * IOMMU will actually use the second-level only translation ++ * to guarantee the protection. This second-level is als ++ * the EPT. ++ */ ++ type = PASID_ENTRY_PGTT_SL_ONLY; ++ } else { ++ /* ++ * As the host IOMMU driver in the pkvm enabled kernel has ++ * already been configured to use first-level only or ++ * pass-through mode, it will not use any other mode. 
But ++ * in case this happens, reset the ptdev vpgt/did, keep ptdev ++ * linked to this IOMMU, and clear the shadow entry in order ++ * not to support it. ++ */ ++ pkvm_setup_ptdev_vpgt(ptdev, 0, NULL, NULL, NULL, false); ++ pkvm_setup_ptdev_did(ptdev, 0); ++ ++ pkvm_err("pkvm: unsupported pasid type %lld\n", type); ++ ++ return pasid_copy_entry(shadow_pte, &tmp_pte); ++ } ++ ++ pkvm_setup_ptdev_did(ptdev, pasid_get_domain_id(guest_pte)); ++ ++ if (iommu_audit_did(iommu, ptdev->did, ptdev->shadow_vm_handle)) ++ /* ++ * It is possible that this ptdev will be attached to a protected ++ * VM so primary VM allocates the same did used by this protected ++ * VM and did a TLB flush. But at this moment, this ptdev is not ++ * attached yet so audit is failed. For this case, can skip the sync ++ * of this pasid table entry and it will be synced again when this ++ * ptdev is attached. ++ * ++ * It is also possible that this ptdev is just detached from a ++ * protected VM but still using the previous did due to primary VM ++ * has not configured this ptdev yet. In this case, the did of this ++ * ptdev is still the same as the did used by other ptdevs not ++ * detached yet. For this case, can skip the sync of this pasid ++ * table entry and it will be synced again when primary VM configures ++ * this ptdev. ++ * ++ * If not the above cases but primary VM does this by purpose, also ++ * not sync the pasid table entry to guarantee the isolation. ++ */ ++ return false; ++ ++ /* ++ * ptdev->pgt will be used as second-level translation table ++ * which should be EPT format. ++ */ ++ if (!is_pgt_ops_ept(ptdev->pgt)) ++ return false; ++ ++ /* ++ * Copy all the bits from guest_pte. As the translation type will ++ * be re-configured in below, even some bits inherit from guest_pte ++ * but hardware will ignore those bits according to the translation ++ * type. ++ */ ++ memcpy(&tmp_pte, guest_pte, sizeof(struct pasid_entry)); ++ ++ pasid_set_page_snoop(&tmp_pte, !!ecap_smpwc(sdata->iommu_ecap)); ++ if (ecap_sc_support(sdata->iommu_ecap)) ++ pasid_set_pgsnp(&tmp_pte); ++ ++ /* ++ * Modify the second-level related bits: ++ * Set PGTT/SLPTR/AW. ++ * Clear SLADE/SLEE ++ * Reuse FPD/P ++ */ ++ pasid_set_translation_type(&tmp_pte, type); ++ pasid_set_slptr(&tmp_pte, ptdev->pgt->root_pa); ++ aw = (ptdev->pgt->level == 4) ? 
2 : 3; ++ pasid_set_address_width(&tmp_pte, aw); ++ pasid_set_ssade(&tmp_pte, 0); ++ pasid_set_ssee(&tmp_pte, 0); ++ ++ return pasid_copy_entry(shadow_pte, &tmp_pte); ++} ++ ++static bool iommu_id_sync_entry(struct id_sync_data *sdata) ++{ ++ bool ret = false; ++ struct pkvm_pgtable *shadow_id = sdata->shadow_id; ++ ++ if (ecap_smts(sdata->iommu_ecap)) { ++ switch (sdata->level) { ++ case IOMMU_PASID_TABLE: ++ ret = sync_shadow_pasid_table_entry(sdata); ++ break; ++ case IOMMU_PASID_DIR: ++ ret = sync_shadow_pasid_dir_entry(sdata); ++ break; ++ case IOMMU_SM_CONTEXT: ++ ret = sync_shadow_context_entry(sdata); ++ break; ++ case IOMMU_SM_ROOT: ++ ret = sync_root_entry(sdata); ++ break; ++ default: ++ break; ++ } ++ } else { ++ switch (sdata->level) { ++ case IOMMU_LM_CONTEXT: ++ ret = sync_shadow_context_entry(sdata); ++ break; ++ case IOMMU_LM_ROOT: ++ ret = sync_root_entry(sdata); ++ break; ++ default: ++ break; ++ } ++ } ++ ++ if (ret) { ++ int entry_size = shadow_id->pgt_ops->pgt_level_entry_size(sdata->level); ++ ++ if (entry_size && shadow_id->mm_ops->flush_cache) ++ shadow_id->mm_ops->flush_cache(sdata->shadow_ptep, entry_size); ++ } ++ ++ return ret; ++} ++ ++static int initialize_iommu_pgt(struct pkvm_iommu *iommu) ++{ ++ struct pkvm_pgtable *pgt = &iommu->pgt; ++ struct pkvm_pgtable *vpgt = &iommu->viommu.pgt; ++ static struct pkvm_mm_ops *iommu_mm_ops; ++ struct pkvm_pgtable_ops *iommu_ops; ++ struct pkvm_pgtable_cap cap; ++ u64 grt_pa = readq(iommu->iommu.reg + DMAR_RTADDR_REG) & VTD_PAGE_MASK; ++ int ret; ++ ++ if (ecap_smts(iommu->iommu.ecap)) { ++ cap.level = IOMMU_SM_ROOT; ++ iommu_ops = &iommu_sm_id_ops; ++ } else { ++ cap.level = IOMMU_LM_ROOT; ++ iommu_ops = &iommu_lm_id_ops; ++ } ++ ++ vpgt->root_pa = grt_pa; ++ ret = pkvm_pgtable_init(vpgt, &viommu_mm_ops, iommu_ops, &cap, false); ++ if (ret) ++ return ret; ++ ++ /* ++ * For the IOMMU without Page-Walk Coherency, should use ++ * iommu_pw_noncoherency_mm_ops to flush CPU cache when ++ * modifying any remapping structure entry. ++ * ++ * For the IOMMU with Page-Walk Coherency, can use ++ * iommu_pw_coherency_mm_ops to skip the CPU cache flushing. 
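++ * (iommu_pw_coherency_mm_ops simply leaves .flush_cache unset, so no explicit cache flush is issued when its shadow entries are updated.)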
++ */ ++ if (!ecap_coherent(iommu->iommu.ecap)) ++ iommu_mm_ops = &iommu_pw_noncoherency_mm_ops; ++ else ++ iommu_mm_ops = &iommu_pw_coherency_mm_ops; ++ ++ ret = pkvm_pgtable_init(pgt, iommu_mm_ops, iommu_ops, &cap, true); ++ if (!ret) { ++ /* ++ * Hold additional reference count to make ++ * sure root page won't be freed ++ */ ++ void *root = pgt->mm_ops->phys_to_virt(pgt->root_pa); ++ ++ pgt->mm_ops->get_page(root); ++ } ++ return ret; ++} ++ ++int pkvm_init_iommu(unsigned long mem_base, unsigned long nr_pages) ++{ ++ struct pkvm_iommu_info *info = &pkvm_hyp->iommu_infos[0]; ++ struct pkvm_iommu *piommu = &iommus[0]; ++ int i, ret = pkvm_pool_init(&iommu_pool, mem_base >> PAGE_SHIFT, nr_pages, 0); ++ ++ if (ret) ++ return ret; ++ ++ for (i = 0; i < PKVM_MAX_IOMMU_NUM; piommu++, info++, i++) { ++ if (!info->reg_phys) ++ break; ++ ++ INIT_LIST_HEAD(&piommu->ptdev_head); ++ ++ pkvm_spinlock_init(&piommu->lock); ++ piommu->iommu.reg_phys = info->reg_phys; ++ piommu->iommu.reg_size = info->reg_size; ++ piommu->iommu.reg = pkvm_iophys_to_virt(info->reg_phys); ++ if ((unsigned long)piommu->iommu.reg == INVALID_ADDR) ++ return -ENOMEM; ++ piommu->iommu.seq_id = i; ++ ++ ret = pkvm_mmu_map((unsigned long)piommu->iommu.reg, ++ (unsigned long)info->reg_phys, ++ info->reg_size, 1 << PG_LEVEL_4K, ++ PKVM_PAGE_IO_NOCACHE); ++ if (ret) ++ return ret; ++ ++ piommu->iommu.cap = readq(piommu->iommu.reg + DMAR_CAP_REG); ++ piommu->iommu.ecap = readq(piommu->iommu.reg + DMAR_ECAP_REG); ++ /* cache the enabled features from Global Status register */ ++ piommu->iommu.gcmd = readl(piommu->iommu.reg + DMAR_GSTS_REG) & ++ DMAR_GSTS_EN_BITS; ++ ++ ret = pkvm_host_ept_unmap((unsigned long)info->reg_phys, ++ (unsigned long)info->reg_phys, ++ info->reg_size); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int free_shadow_id_cb(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long vaddr_end, int level, void *ptep, ++ unsigned long flags, struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ struct pkvm_mm_ops *mm_ops = pgt->mm_ops; ++ struct id_sync_data sync_data = {0}; ++ struct pkvm_iommu *iommu = pgt_to_pkvm_iommu(pgt); ++ void *child_ptep; ++ ++ /* Doesn't need to do anything if the shadow entry is not present */ ++ if (!pgt_ops->pgt_entry_present(ptep)) ++ return 0; ++ ++ sync_data.shadow_ptep = ptep; ++ sync_data.level = level; ++ sync_data.shadow_id = pgt; ++ sync_data.iommu_ecap = iommu->iommu.ecap; ++ sync_data.vaddr = vaddr; ++ ++ /* Un-present a present PASID Table entry */ ++ if (LAST_LEVEL(level)) { ++ if (iommu_id_sync_entry(&sync_data)) ++ mm_ops->put_page(ptep); ++ return 0; ++ } ++ ++ /* ++ * it's a present entry for PASID DIR, context or root. ++ * its child ptep shall already be freed (the refcnt == 1), if so, we ++ * can un-present itself as well now. 
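++ * (The walker runs with PKVM_PGTABLE_WALK_TABLE_POST, so child levels are always visited before this parent entry.)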
++ */ ++ child_ptep = mm_ops->phys_to_virt(pgt_ops->pgt_entry_to_phys(ptep)); ++ if (mm_ops->page_count(child_ptep) == 1) { ++ if (iommu_id_sync_entry(&sync_data)) { ++ mm_ops->put_page(ptep); ++ mm_ops->put_page(child_ptep); ++ } ++ } ++ ++ return 0; ++} ++ ++/* sync_data != NULL, data != NULL */ ++static int init_sync_id_data(struct id_sync_data *sync_data, ++ struct id_sync_walk_data *data, ++ struct pkvm_iommu *iommu, void *guest_ptep, ++ unsigned long vaddr, int level) ++{ ++ struct pkvm_pgtable *shadow_id = &iommu->pgt; ++ int idx = shadow_id->pgt_ops->pgt_entry_to_index(vaddr, level); ++ int entry_size = shadow_id->pgt_ops->pgt_level_entry_size(level); ++ ++ if (ecap_smts(iommu->iommu.ecap)) { ++ switch (level) { ++ case IOMMU_PASID_TABLE: ++ sync_data->p_entry = *((struct pasid_entry *)guest_ptep); ++ sync_data->guest_ptep = &sync_data->p_entry; ++ break; ++ case IOMMU_PASID_DIR: ++ sync_data->pd_entry = *((struct pasid_dir_entry *)guest_ptep); ++ sync_data->guest_ptep = &sync_data->pd_entry; ++ break; ++ case IOMMU_SM_CONTEXT: ++ sync_data->ct_entry = *((struct context_entry *)guest_ptep); ++ sync_data->guest_ptep = &sync_data->ct_entry; ++ break; ++ case IOMMU_SM_ROOT: ++ sync_data->root_entry = *((u64 *)guest_ptep); ++ sync_data->guest_ptep = &sync_data->root_entry; ++ break; ++ default: ++ return -EINVAL; ++ } ++ } else { ++ switch (level) { ++ case IOMMU_LM_CONTEXT: ++ sync_data->ct_entry = *((struct context_entry *)guest_ptep); ++ sync_data->guest_ptep = &sync_data->ct_entry; ++ break; ++ case IOMMU_LM_ROOT: ++ sync_data->root_entry = *((u64 *)guest_ptep); ++ sync_data->guest_ptep = &sync_data->root_entry; ++ break; ++ default: ++ return -EINVAL; ++ } ++ } ++ ++ /* shadow_pa of current level must be there */ ++ if (!data->shadow_pa[level]) ++ return -EINVAL; ++ ++ /* get current shadow_ptep */ ++ sync_data->shadow_ptep = shadow_id->mm_ops->phys_to_virt(data->shadow_pa[level]); ++ sync_data->shadow_ptep += idx * entry_size; ++ ++ sync_data->level = level; ++ sync_data->shadow_id = shadow_id; ++ sync_data->iommu_ecap = iommu->iommu.ecap; ++ sync_data->shadow_pa = 0; ++ sync_data->vaddr = vaddr; ++ sync_data->spgt_data = data->spgt_data; ++ ++ return 0; ++} ++ ++static int free_shadow_id(struct pkvm_iommu *iommu, unsigned long vaddr, ++ unsigned long vaddr_end); ++static int sync_shadow_id_cb(struct pkvm_pgtable *vpgt, unsigned long vaddr, ++ unsigned long vaddr_end, int level, void *ptep, ++ unsigned long flags, struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_pgtable_ops *vpgt_ops = vpgt->pgt_ops; ++ struct id_sync_walk_data *data = arg; ++ struct pkvm_iommu *iommu = data->iommu; ++ struct pkvm_pgtable *shadow_id = &iommu->pgt; ++ struct id_sync_data sync_data; ++ void *shadow_ptep, *guest_ptep; ++ bool shadow_p, guest_p; ++ int ret = init_sync_id_data(&sync_data, data, iommu, ptep, vaddr, level); ++ ++ if (ret < 0) ++ return ret; ++ ++ guest_ptep = sync_data.guest_ptep; ++ shadow_ptep = sync_data.shadow_ptep; ++ ++ /* ++ * WALK_TABLE_PRE is for non leaf, WALK_LEAF is for leaf ++ * if not match, it means guest changed it, return -EAGAIN ++ * to re-walk the page table. 
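++ * The caller, sync_shadow_id(), bounds this re-walk to a few retries.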
++ */ ++ if ((flags == PKVM_PGTABLE_WALK_TABLE_PRE && ++ vpgt_ops->pgt_entry_is_leaf(guest_ptep, level)) || ++ (flags == PKVM_PGTABLE_WALK_LEAF && ++ !vpgt_ops->pgt_entry_is_leaf(guest_ptep, level))) ++ return -EAGAIN; ++ ++ shadow_p = shadow_id->pgt_ops->pgt_entry_present(shadow_ptep); ++ guest_p = vpgt_ops->pgt_entry_present(guest_ptep); ++ if (!guest_p) { ++ if (shadow_p) { ++ /* ++ * For the case that guest not present but shadow present, just ++ * simply free the shadow to make them consistent. ++ */ ++ unsigned long new_vaddr_end = shadow_id->pgt_ops->pgt_level_to_size(level) + ++ vaddr; ++ /* ++ * Get a reference count before free to make sure the current page ++ * of this level and the pages of its parent levels won't be freed. ++ * As here we only want to free its specific sub-level. ++ */ ++ shadow_id->mm_ops->get_page(shadow_ptep); ++ free_shadow_id(iommu, vaddr, new_vaddr_end); ++ shadow_id->mm_ops->put_page(shadow_ptep); ++ } ++ /* ++ * As now both guest and shadow are not ++ * present, don't need to do anything more. ++ */ ++ return ret; ++ } ++ ++ if (LAST_LEVEL(level)) { ++ /* ++ * Cache invalidation may want to sync specific PASID entries ++ * (in scalable mode) or context entries (in legacy mode) with ++ * DID matched. In such case we only need to sync the entries ++ * with the matching DID. ++ * ++ * According to vt-d spec 6.2.2.1 and 6.2.3.1, software must ++ * not use domain-id value of 0 when programming entries on ++ * implementations reporting CM=1 in the Capability register. ++ * So non-zero DID means a real DID from host software. ++ */ ++ if (data->did) { ++ u16 did = ecap_smts(iommu->iommu.ecap) ++ ? pasid_get_domain_id(guest_ptep) ++ : context_lm_get_did(guest_ptep); ++ ++ if (did != data->did) ++ return ret; ++ } ++ ++ /* ++ * For a leaf entry, the physical address of its child level ++ * is determined by the pgt used by the corresponding ptdev. ++ * So no need to set sync_data.shadow_pa. ++ */ ++ } else if (!shadow_p) { ++ /* ++ * For a non-present non-leaf (which may be root/context/pasid ++ * dir) entry, needs to allocate a new page to make this entry ++ * present. Root and context page are always one page with 4K ++ * size. As we fixed the pasid to only support 15 bits, ++ * the pasid dir is also one page with 4K size. ++ */ ++ void *shadow = shadow_id->mm_ops->zalloc_page(); ++ ++ if (!shadow) ++ return -ENOMEM; ++ /* Get the shadow id physical address of the child level */ ++ sync_data.shadow_pa = shadow_id->mm_ops->virt_to_phys(shadow); ++ } else ++ /* ++ * For a present non-leaf (which is probably root/context/pasid dir) ++ * entry, get the shadow id physical address of its child level. ++ */ ++ sync_data.shadow_pa = shadow_id->pgt_ops->pgt_entry_to_phys(shadow_ptep); ++ ++ if (iommu_id_sync_entry(&sync_data)) { ++ if (!shadow_p) ++ /* ++ * A non-present to present changing needs to get ++ * a new reference count for the shadow id page. ++ */ ++ shadow_id->mm_ops->get_page(shadow_ptep); ++ } ++ ++ if ((flags == PKVM_PGTABLE_WALK_TABLE_PRE) && (!LAST_LEVEL(level))) { ++ /* ++ * As guest page table walking will go to the child level, pass ++ * the shadow id physical address of the child level to sync. 
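++ * init_sync_id_data() reads it back from shadow_pa[level] at the next level down.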
++ */ ++ data->shadow_pa[level - 1] = sync_data.shadow_pa; ++ } ++ ++ return ret; ++} ++ ++static int free_shadow_id(struct pkvm_iommu *iommu, unsigned long vaddr, ++ unsigned long vaddr_end) ++{ ++ struct pkvm_pgtable_walker walker = { ++ .cb = free_shadow_id_cb, ++ .flags = PKVM_PGTABLE_WALK_LEAF | ++ PKVM_PGTABLE_WALK_TABLE_POST, ++ }; ++ ++ /* ++ * To free the shadow IOMMU page table, walk the shadow IOMMU ++ * page table. ++ */ ++ if (!(iommu->viommu.vreg.gsts & DMA_GSTS_TES)) ++ return 0; ++ ++ return iommu_pgtable_walk(&iommu->pgt, vaddr, vaddr_end, &walker); ++} ++ ++static int sync_shadow_id(struct pkvm_iommu *iommu, unsigned long vaddr, ++ unsigned long vaddr_end, u16 did, ++ struct shadow_pgt_sync_data *spgt_data) ++{ ++ DEFINE_ID_SYNC_WALK_DATA(arg, iommu, did, spgt_data); ++ struct pkvm_pgtable_walker walker = { ++ .cb = sync_shadow_id_cb, ++ .flags = PKVM_PGTABLE_WALK_TABLE_PRE | ++ PKVM_PGTABLE_WALK_LEAF, ++ .arg = &arg, ++ }; ++ int ret, retry_cnt = 0; ++ ++ if (!(iommu->viommu.vreg.gsts & DMA_GSTS_TES)) ++ return 0; ++ ++retry: ++ if (ecap_smts(iommu->iommu.ecap)) ++ arg.shadow_pa[IOMMU_SM_ROOT] = iommu->pgt.root_pa; ++ else ++ arg.shadow_pa[IOMMU_LM_ROOT] = iommu->pgt.root_pa; ++ /* ++ * To sync the shadow IOMMU page table, walks the guest IOMMU ++ * page table ++ */ ++ ret = iommu_pgtable_walk(&iommu->viommu.pgt, vaddr, vaddr_end, &walker); ++ if ((ret == -EAGAIN) && (retry_cnt++ < 5)) ++ goto retry; ++ ++ return ret; ++} ++ ++static void enable_qi(struct pkvm_iommu *iommu) ++{ ++ void *desc = iommu->qi.desc; ++ int dw, qs; ++ u32 sts; ++ ++ dw = !!ecap_smts(iommu->iommu.ecap); ++ qs = fls(iommu->qi.free_cnt >> (7 + !dw)) - 1; ++ ++ /* Disable QI */ ++ sts = readl(iommu->iommu.reg + DMAR_GSTS_REG); ++ if (sts & DMA_GSTS_QIES) { ++ iommu->iommu.gcmd &= ~DMA_GCMD_QIE; ++ writel(iommu->iommu.gcmd, iommu->iommu.reg + DMAR_GCMD_REG); ++ PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG, ++ readl, !(sts & DMA_GSTS_QIES), sts); ++ } ++ ++ /* Set tail to 0 */ ++ writel(0, iommu->iommu.reg + DMAR_IQT_REG); ++ ++ /* Set IQA */ ++ iommu->piommu_iqa = pkvm_virt_to_phys(desc) | (dw << 11) | qs; ++ writeq(iommu->piommu_iqa, iommu->iommu.reg + DMAR_IQA_REG); ++ ++ /* Enable QI */ ++ iommu->iommu.gcmd |= DMA_GCMD_QIE; ++ writel(iommu->iommu.gcmd, iommu->iommu.reg + DMAR_GCMD_REG); ++ PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG, ++ readl, (sts & DMA_GSTS_QIES), sts); ++} ++ ++static int create_qi_desc(struct pkvm_iommu *iommu) ++{ ++ struct pkvm_viommu *viommu = &iommu->viommu; ++ struct q_inval *qi = &iommu->qi; ++ void __iomem *reg = iommu->iommu.reg; ++ ++ pkvm_spinlock_init(&iommu->qi_lock); ++ /* ++ * Before switching the descriptor, need to wait for any pending ++ * invalidation descriptor completed. According to spec 6.5.2, ++ * The invalidation queue is considered quiesced when the queue ++ * is empty (head and tail registers equal) and the last ++ * descriptor completed is an Invalidation Wait Descriptor ++ * (which indicates no invalidation requests are pending in hardware). 
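++ * Only after that is it safe to repoint IQA at the pkvm-owned descriptor queue.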
++ */ ++ while (readq(reg + DMAR_IQH_REG) != ++ readq(reg + DMAR_IQT_REG)) ++ cpu_relax(); ++ ++ viommu->vreg.iqa = viommu->iqa = readq(reg + DMAR_IQA_REG); ++ viommu->vreg.iq_head = readq(reg + DMAR_IQH_REG); ++ viommu->vreg.iq_tail = readq(reg + DMAR_IQT_REG); ++ ++ if (viommu->vreg.gsts & DMA_GSTS_QIES) { ++ struct qi_desc *wait_desc; ++ u64 iqa = viommu->iqa; ++ int shift = IQ_DESC_SHIFT(iqa); ++ int offset = ((viommu->vreg.iq_head >> shift) + ++ IQ_DESC_LEN(iqa) - 1) % IQ_DESC_LEN(iqa); ++ int *desc_status; ++ ++ /* Find out the last descriptor */ ++ wait_desc = pkvm_phys_to_virt(IQ_DESC_BASE_PHYS(iqa)) + (offset << shift); ++ ++ pkvm_dbg("pkvm: viommu iqa 0x%llx head 0x%llx tail 0x%llx qw0 0x%llx qw1 0x%llx", ++ viommu->vreg.iqa, viommu->vreg.iq_head, viommu->vreg.iq_tail, ++ wait_desc->qw0, wait_desc->qw1); ++ ++ if (QI_DESC_TYPE(wait_desc->qw0) != QI_IWD_TYPE) { ++ pkvm_err("pkvm: %s: expect wait desc but 0x%llx\n", ++ __func__, wait_desc->qw0); ++ return -EINVAL; ++ } ++ ++ desc_status = pkvm_phys_to_virt(wait_desc->qw1); ++ /* ++ * Wait until the wait descriptor is completed. ++ * ++ * The desc_status is from host. Checking this in pkvm ++ * relies on host IOMMU driver not to release the ++ * desc_status after it is completed, and this is guaranteed ++ * by the current Linux IOMMU driver. ++ */ ++ while (READ_ONCE(*desc_status) == QI_IN_USE) ++ cpu_relax(); ++ } ++ ++ qi->free_cnt = PKVM_QI_DESC_ALIGNED_SIZE / sizeof(struct qi_desc); ++ qi->desc = iommu_zalloc_pages(PKVM_QI_DESC_ALIGNED_SIZE); ++ if (!qi->desc) ++ return -ENOMEM; ++ ++ qi->desc_status = iommu_zalloc_pages(PKVM_QI_DESC_STATUS_ALIGNED_SIZE); ++ if (!qi->desc_status) { ++ iommu_put_page(qi->desc); ++ return -ENOMEM; ++ } ++ ++ enable_qi(iommu); ++ return 0; ++} ++ ++static int qi_check_fault(struct pkvm_iommu *iommu, int wait_index) ++{ ++ u32 fault; ++ struct q_inval *qi = &iommu->qi; ++ ++ if (qi->desc_status[wait_index] == QI_ABORT) ++ return -EAGAIN; ++ ++ fault = readl(iommu->iommu.reg + DMAR_FSTS_REG); ++ ++ /* ++ * If IQE happens, the head points to the descriptor associated ++ * with the error. No new descriptors are fetched until the IQE ++ * is cleared. ++ */ ++ if (fault & DMA_FSTS_IQE) { ++ writel(DMA_FSTS_IQE, iommu->iommu.reg + DMAR_FSTS_REG); ++ pkvm_dbg("pkvm: Invalidation Queue Error (IQE) cleared\n"); ++ } ++ ++ /* ++ * If ITE happens, all pending wait_desc commands are aborted. ++ * No new descriptors are fetched until the ITE is cleared. 
++ */ ++ if (fault & DMA_FSTS_ITE) { ++ writel(DMA_FSTS_ITE, iommu->iommu.reg + DMAR_FSTS_REG); ++ pkvm_dbg("pkvm: Invalidation Time-out Error (ITE) cleared\n"); ++ } ++ ++ if (fault & DMA_FSTS_ICE) { ++ writel(DMA_FSTS_ICE, iommu->iommu.reg + DMAR_FSTS_REG); ++ pkvm_dbg("pkvm: Invalidation Completion Error (ICE) cleared\n"); ++ } ++ ++ return 0; ++} ++ ++static void __submit_qi(struct pkvm_iommu *iommu, struct qi_desc *base, int count) ++{ ++ int len = IQ_DESC_LEN(iommu->piommu_iqa), i, wait_index; ++ int shift = IQ_DESC_SHIFT(iommu->piommu_iqa); ++ struct q_inval *qi = &iommu->qi; ++ struct qi_desc *to, *from; ++ int required_cnt = count + 2; ++ void *desc = qi->desc; ++ int *desc_status, rc; ++ ++ pkvm_spin_lock(&iommu->qi_lock); ++ /* ++ * Detect if the free descriptor count is enough or not ++ */ ++ while (qi->free_cnt < required_cnt) { ++ u64 head = readq(iommu->iommu.reg + DMAR_IQH_REG) >> shift; ++ int busy_cnt = (READ_ONCE(qi->free_head) + len - head) % len; ++ int free_cnt = len - busy_cnt; ++ ++ if (free_cnt >= required_cnt) { ++ qi->free_cnt = free_cnt; ++ break; ++ } ++ pkvm_spin_unlock(&iommu->qi_lock); ++ cpu_relax(); ++ pkvm_spin_lock(&iommu->qi_lock); ++ } ++ ++ for (i = 0; i < count; i++) { ++ from = base + i; ++ to = qi->desc + (((qi->free_head + i) % len) << shift); ++ to->qw0 = from->qw0; ++ to->qw1 = from->qw1; ++ } ++ ++ wait_index = (qi->free_head + count) % len; ++ /* setup wait descriptor */ ++ to = desc + (wait_index << shift); ++ to->qw0 = QI_IWD_STATUS_DATA(QI_DONE) | ++ QI_IWD_STATUS_WRITE | QI_IWD_TYPE; ++ ++ desc_status = &qi->desc_status[wait_index]; ++ WRITE_ONCE(*desc_status, QI_IN_USE); ++ to->qw1 = pkvm_virt_to_phys(desc_status); ++ ++ /* submit to hardware with wait descriptor */ ++ qi->free_cnt -= count + 1; ++ qi->free_head = (qi->free_head + count + 1) % len; ++ writel(qi->free_head << shift, iommu->iommu.reg + DMAR_IQT_REG); ++ ++ while (READ_ONCE(*desc_status) != QI_DONE) { ++ rc = qi_check_fault(iommu, wait_index); ++ if (rc) ++ break; ++ pkvm_spin_unlock(&iommu->qi_lock); ++ cpu_relax(); ++ pkvm_spin_lock(&iommu->qi_lock); ++ } ++ ++ if (*desc_status != QI_DONE) ++ pkvm_err("pkvm: %s: failed with status %d\n", ++ __func__, *desc_status); ++ ++ /* release the free_cnt */ ++ qi->free_cnt += count + 1; ++ ++ pkvm_spin_unlock(&iommu->qi_lock); ++} ++ ++static void submit_qi(struct pkvm_iommu *iommu, struct qi_desc *base, int count) ++{ ++ int max_len = IQ_DESC_LEN(iommu->piommu_iqa) - 2; ++ int submit_count; ++ ++ do { ++ submit_count = count > max_len ? 
max_len : count; ++ __submit_qi(iommu, base, submit_count); ++ ++ count -= submit_count; ++ base += submit_count; ++ } while (count > 0); ++} ++ ++static void flush_context_cache(struct pkvm_iommu *iommu, u16 did, ++ u16 sid, u8 fm, u64 type) ++{ ++ struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; ++ ++ desc.qw0 = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did) | ++ QI_CC_GRAN(type) | QI_CC_TYPE; ++ ++ submit_qi(iommu, &desc, 1); ++} ++ ++static void flush_pasid_cache(struct pkvm_iommu *iommu, u16 did, ++ u64 granu, u32 pasid) ++{ ++ struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; ++ ++ desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) | ++ QI_PC_GRAN(granu) | QI_PC_TYPE; ++ ++ submit_qi(iommu, &desc, 1); ++} ++ ++static void setup_iotlb_qi_desc(struct pkvm_iommu *iommu, ++ struct qi_desc *desc, u16 did, ++ u64 addr, unsigned int size_order, ++ u64 type) ++{ ++ u8 dw = 0, dr = 0; ++ ++ if (cap_write_drain(iommu->iommu.cap)) ++ dw = 1; ++ ++ if (cap_read_drain(iommu->iommu.cap)) ++ dr = 1; ++ ++ desc->qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw) | ++ QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE; ++ desc->qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_AM(size_order); ++ desc->qw2 = 0; ++ desc->qw3 = 0; ++} ++ ++static void flush_iotlb(struct pkvm_iommu *iommu, u16 did, u64 addr, ++ unsigned int size_order, u64 type) ++{ ++ struct qi_desc desc; ++ ++ setup_iotlb_qi_desc(iommu, &desc, did, addr, size_order, type); ++ submit_qi(iommu, &desc, 1); ++} ++ ++static void set_root_table(struct pkvm_iommu *iommu) ++{ ++ u64 val = iommu->pgt.root_pa; ++ void __iomem *reg = iommu->iommu.reg; ++ u32 sts; ++ ++ /* Set scalable mode */ ++ if (ecap_smts(iommu->iommu.ecap)) ++ val |= DMA_RTADDR_SMT; ++ ++ writeq(val, reg + DMAR_RTADDR_REG); ++ ++ /* ++ * The shadow root table provides identical remapping results comparing ++ * with the previous guest root table, so it is allowed to switch if ++ * Translation Enable Status is still 1 according to IOMMU spec 6.6: ++ * ++ * " ++ * If software sets the root-table pointer while remapping hardware is ++ * active (TES=1 in Global Status register), software must ensure the ++ * structures referenced by the new root-table pointer provide identical ++ * remapping results as the structures referenced by the previous root-table ++ * pointer so that inflight requests are properly translated. ++ * " ++ * ++ * So don't need to turn off TE first before switching. 
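++ * The global context/PASID/IOTLB flushes below then drop anything cached from the old root table.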
++ */ ++ writel(iommu->iommu.gcmd | DMA_GCMD_SRTP, reg + DMAR_GCMD_REG); ++ ++ PKVM_IOMMU_WAIT_OP(reg + DMAR_GSTS_REG, readl, (sts & DMA_GSTS_RTPS), sts); ++ ++ flush_context_cache(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); ++ if (ecap_smts(iommu->iommu.ecap)) ++ flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); ++ flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); ++} ++ ++static void enable_translation(struct pkvm_iommu *iommu) ++{ ++ void __iomem *reg = iommu->iommu.reg; ++ u32 sts; ++ ++ if (iommu->iommu.gcmd & DMA_GCMD_TE) ++ return; ++ ++ iommu->iommu.gcmd |= DMA_GCMD_TE; ++ ++ writel(iommu->iommu.gcmd, reg + DMAR_GCMD_REG); ++ ++ PKVM_IOMMU_WAIT_OP(reg + DMAR_GSTS_REG, readl, (sts & DMA_GSTS_TES), sts); ++} ++ ++static void initialize_viommu_reg(struct pkvm_iommu *iommu) ++{ ++ struct viommu_reg *vreg = &iommu->viommu.vreg; ++ void __iomem *reg_base = iommu->iommu.reg; ++ ++ vreg->cap = readq(reg_base + DMAR_CAP_REG); ++ vreg->ecap = readq(reg_base + DMAR_ECAP_REG); ++ pkvm_update_iommu_virtual_caps(&vreg->cap, &vreg->ecap); ++ ++ vreg->gsts = readl(reg_base + DMAR_GSTS_REG); ++ vreg->rta = readq(reg_base + DMAR_RTADDR_REG); ++ ++ pkvm_dbg("%s: iommu phys reg 0x%llx cap 0x%llx ecap 0x%llx gsts 0x%x rta 0x%llx\n", ++ __func__, iommu->iommu.reg_phys, vreg->cap, vreg->ecap, vreg->gsts, vreg->rta); ++ ++ /* Invalidate Queue regs are updated when create descriptor */ ++} ++ ++static int activate_iommu(struct pkvm_iommu *iommu) ++{ ++ unsigned long vaddr = 0, vaddr_end = IOMMU_MAX_VADDR; ++ int ret; ++ ++ pkvm_dbg("%s: iommu%d\n", __func__, iommu->iommu.seq_id); ++ ++ pkvm_spin_lock(&iommu->lock); ++ ++ ret = initialize_iommu_pgt(iommu); ++ if (ret) ++ goto out; ++ ++ initialize_viommu_reg(iommu); ++ ++ ret = sync_shadow_id(iommu, vaddr, vaddr_end, 0, NULL); ++ if (ret) ++ goto out; ++ ++ ret = create_qi_desc(iommu); ++ if (ret) ++ goto free_shadow; ++ ++ set_root_table(iommu); ++ ++ /* ++ * It is possible that some IOMMU devices do not have memory ++ * remapping translation enabled by the host IOMMU driver during boot ++ * time, so pkvm IOMMU driver needs to make sure this enabled to ++ * guarantee the IO isolation from the devices behind this IOMMU. ++ * ++ */ ++ enable_translation(iommu); ++ ++ iommu->activated = true; ++ root_tbl_walk(iommu); ++ ++ pkvm_spin_unlock(&iommu->lock); ++ return 0; ++ ++free_shadow: ++ free_shadow_id(iommu, vaddr, vaddr_end); ++out: ++ pkvm_spin_unlock(&iommu->lock); ++ return ret; ++} ++ ++static int context_cache_invalidate(struct pkvm_iommu *iommu, struct qi_desc *desc) ++{ ++ u16 sid = QI_DESC_CC_SID(desc->qw0); ++ u16 did = ecap_smts(iommu->iommu.ecap) ? 
0 : QI_DESC_CC_DID(desc->qw0); ++ u64 granu = QI_DESC_CC_GRANU(desc->qw0) << DMA_CCMD_INVL_GRANU_OFFSET; ++ unsigned long start, end; ++ int ret; ++ ++ switch (granu) { ++ case DMA_CCMD_GLOBAL_INVL: ++ start = 0; ++ end = MAX_NUM_OF_ADDRESS_SPACE(iommu); ++ pkvm_dbg("pkvm: %s: iommu%d: global\n", __func__, iommu->iommu.seq_id); ++ ret = sync_shadow_id(iommu, start, end, 0, NULL); ++ break; ++ case DMA_CCMD_DOMAIN_INVL: ++ /* ++ * Domain selective invalidation which is processed by ++ * hardware as global invalidations for scalable mode ++ * according to spec 6.5.2.1 ++ */ ++ start = 0; ++ end = MAX_NUM_OF_ADDRESS_SPACE(iommu); ++ pkvm_dbg("pkvm: %s: iommu%d: domain selective\n", ++ __func__, iommu->iommu.seq_id); ++ ret = sync_shadow_id(iommu, start, end, did, NULL); ++ break; ++ case DMA_CCMD_DEVICE_INVL: ++ if (ecap_smts(iommu->iommu.ecap)) { ++ start = (unsigned long)sid << DEVFN_SHIFT; ++ end = ((unsigned long)sid + 1) << DEVFN_SHIFT; ++ } else { ++ start = (unsigned long)sid << LM_DEVFN_SHIFT; ++ end = ((unsigned long)sid + 1) << LM_DEVFN_SHIFT; ++ } ++ pkvm_dbg("pkvm: %s: iommu%d: device selective sid 0x%x\n", ++ __func__, iommu->iommu.seq_id, sid); ++ ret = sync_shadow_id(iommu, start, end, did, NULL); ++ break; ++ default: ++ pkvm_err("pkvm: %s: iommu%d: invalidate granu %lld\n", ++ __func__, iommu->iommu.seq_id, granu >> DMA_CCMD_INVL_GRANU_OFFSET); ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (ret) ++ pkvm_err("pkvm: %s: iommu%d: granularity %lld failed with ret %d\n", ++ __func__, iommu->iommu.seq_id, granu >> DMA_CCMD_INVL_GRANU_OFFSET, ret); ++ return ret; ++} ++ ++static int pasid_cache_invalidate(struct pkvm_iommu *iommu, struct qi_desc *desc) ++{ ++ int pasid = QI_DESC_PC_PASID(desc->qw0); ++ u16 did = QI_DESC_PC_DID(desc->qw0); ++ int granu = QI_DESC_PC_GRANU(desc->qw0); ++ unsigned long start, end; ++ int ret; ++ ++ switch (granu) { ++ case QI_PC_ALL_PASIDS: ++ /* ++ * This is more like a global invalidation but to check ++ * if matching with a specific DID. 
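++ * sync_shadow_id_cb() skips leaf entries whose domain-id does not match.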
++ */ ++ pkvm_dbg("pkvm: %s: iommu%d: ALL_PASID did %d\n", ++ __func__, iommu->iommu.seq_id, did); ++ start = 0; ++ end = IOMMU_MAX_VADDR; ++ ret = sync_shadow_id(iommu, start, end, did, NULL); ++ break; ++ case QI_PC_PASID_SEL: { ++ /* ++ * Sync specific PASID entry for all contexts ++ */ ++ u64 bdf, end_bdf = 0x10000; ++ ++ pkvm_dbg("pkvm: %s: iommu%d: PASID_SEL did %d pasid 0x%x\n", ++ __func__, iommu->iommu.seq_id, did, pasid); ++ for (bdf = 0; bdf < end_bdf; bdf++) { ++ start = (bdf << DEVFN_SHIFT) + pasid; ++ end = start + 1; ++ ret = sync_shadow_id(iommu, start, end, did, NULL); ++ if (ret) ++ break; ++ } ++ break; ++ } ++ case QI_PC_GLOBAL: ++ start = 0; ++ end = IOMMU_MAX_VADDR; ++ pkvm_dbg("pkvm: %s: iommu%d: global\n", __func__, iommu->iommu.seq_id); ++ ret = sync_shadow_id(iommu, start, end, 0, NULL); ++ break; ++ default: ++ pkvm_err("pkvm: %s: iommu%d: invalid granularity %d 0x%llx\n", ++ __func__, iommu->iommu.seq_id, granu, desc->qw0); ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (ret) ++ pkvm_err("pkvm: %s: iommu%d: granularity %d failed with ret %d\n", ++ __func__, iommu->iommu.seq_id, granu, ret); ++ ++ return ret; ++} ++ ++static int iotlb_lm_invalidate(struct pkvm_iommu *iommu, struct qi_desc *desc) ++{ ++ u16 did = QI_DESC_IOTLB_DID(desc->qw0); ++ u64 granu = QI_DESC_IOTLB_GRANU(desc->qw0) << DMA_TLB_FLUSH_GRANU_OFFSET; ++ u64 addr = QI_DESC_IOTLB_ADDR(desc->qw1); ++ u64 mask = ((u64)-1) << (VTD_PAGE_SHIFT + QI_DESC_IOTLB_AM(desc->qw1)); ++ struct shadow_pgt_sync_data data; ++ struct pkvm_ptdev *p; ++ int ret; ++ ++ switch (granu) { ++ case DMA_TLB_GLOBAL_FLUSH: ++ pkvm_dbg("pkvm: %s: iommu%d: global\n", __func__, iommu->iommu.seq_id); ++ ret = sync_shadow_id(iommu, 0, IOMMU_LM_MAX_VADDR, 0, NULL); ++ break; ++ case DMA_TLB_DSI_FLUSH: ++ pkvm_dbg("pkvm: %s: iommu%d: domain selective did %u\n", ++ __func__, iommu->iommu.seq_id, did); ++ ++ /* optimization: walk just the needed devices, not the entire bdf space */ ++ list_for_each_entry(p, &iommu->ptdev_head, iommu_node) ++ if (p->did == did) ++ ret = sync_shadow_id(iommu, p->bdf, p->bdf + 1, did, NULL); ++ break; ++ case DMA_TLB_PSI_FLUSH: ++ data.vaddr = addr & mask; ++ data.vaddr_end = (addr | ~mask) + 1; ++ pkvm_dbg("pkvm: %s: iommu%d: page selective did %u start 0x%lx end 0x%lx\n", ++ __func__, iommu->iommu.seq_id, did, data.vaddr, data.vaddr_end); ++ ++ /* optimization: walk just the needed devices, not the entire bdf space */ ++ list_for_each_entry(p, &iommu->ptdev_head, iommu_node) ++ if (p->did == did) ++ ret = sync_shadow_id(iommu, p->bdf, p->bdf + 1, did, &data); ++ break; ++ default: ++ pkvm_err("pkvm: %s: iommu%d: invalid granularity %lld\n", ++ __func__, iommu->iommu.seq_id, granu >> DMA_TLB_FLUSH_GRANU_OFFSET); ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (ret) ++ pkvm_err("pkvm: %s: iommu%d: granularity %lld failed with ret %d\n", ++ __func__, iommu->iommu.seq_id, granu >> DMA_TLB_FLUSH_GRANU_OFFSET, ret); ++ ++ return ret; ++} ++ ++static int handle_descriptor(struct pkvm_iommu *iommu, struct qi_desc *desc) ++{ ++ int type = QI_DESC_TYPE(desc->qw0); ++ int ret = 0; ++ ++ switch (type) { ++ /* ++ * TODO: is it necessary to intercept the ++ * PGRP_RESP & PSTRM_RESP? 
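++ * For now these descriptor types are passed through to hardware unmodified.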
++ */ ++ case QI_PGRP_RESP_TYPE: ++ case QI_PSTRM_RESP_TYPE: ++ case QI_DIOTLB_TYPE: ++ case QI_DEIOTLB_TYPE: ++ case QI_IEC_TYPE: ++ case QI_IWD_TYPE: ++ case QI_EIOTLB_TYPE: ++ break; ++ case QI_CC_TYPE: ++ ret = context_cache_invalidate(iommu, desc); ++ break; ++ case QI_PC_TYPE: ++ ret = pasid_cache_invalidate(iommu, desc); ++ break; ++ case QI_IOTLB_TYPE: ++ if (!ecap_smts(iommu->iommu.ecap)) ++ ret = iotlb_lm_invalidate(iommu, desc); ++ break; ++ default: ++ pkvm_err("pkvm: %s: iommu%d: invalid type %d desc addr 0x%llx val 0x%llx\n", ++ __func__, iommu->iommu.seq_id, type, (u64)desc, desc->qw0); ++ ret = -EINVAL; ++ break; ++ } ++ ++ return ret; ++} ++ ++static void handle_qi_submit(struct pkvm_iommu *iommu, void *vdesc, int vhead, int count) ++{ ++ struct pkvm_viommu *viommu = &iommu->viommu; ++ int vlen = IQ_DESC_LEN(viommu->iqa); ++ int vshift = IQ_DESC_SHIFT(viommu->iqa); ++ int len = IQ_DESC_LEN(iommu->piommu_iqa); ++ int shift = IQ_DESC_SHIFT(iommu->piommu_iqa); ++ struct q_inval *qi = &iommu->qi; ++ struct qi_desc *to, *from; ++ int required_cnt = count + 1, i; ++ ++ pkvm_spin_lock(&iommu->qi_lock); ++ /* ++ * Detect if the free descriptor count is enough or not ++ */ ++ while (qi->free_cnt < required_cnt) { ++ u64 head = readq(iommu->iommu.reg + DMAR_IQH_REG) >> shift; ++ int busy_cnt = (READ_ONCE(qi->free_head) + len - head) % len; ++ int free_cnt = len - busy_cnt; ++ ++ if (free_cnt >= required_cnt) { ++ qi->free_cnt = free_cnt; ++ break; ++ } ++ pkvm_spin_unlock(&iommu->qi_lock); ++ cpu_relax(); ++ pkvm_spin_lock(&iommu->qi_lock); ++ } ++ ++ for (i = 0; i < count; i++) { ++ from = vdesc + (((vhead + i) % vlen) << vshift); ++ to = qi->desc + (((qi->free_head + i) % len) << shift); ++ ++ to->qw0 = from->qw0; ++ to->qw1 = from->qw1; ++ } ++ ++ /* ++ * Reuse the desc_status from host so that host can poll ++ * the desc_status itself instead of waiting in pkvm. ++ */ ++ qi->free_cnt -= count; ++ qi->free_head = (qi->free_head + count) % len; ++ writel(qi->free_head << shift, iommu->iommu.reg + DMAR_IQT_REG); ++ ++ pkvm_spin_unlock(&iommu->qi_lock); ++} ++ ++static int handle_qi_invalidation(struct pkvm_iommu *iommu, unsigned long val) ++{ ++ struct pkvm_viommu *viommu = &iommu->viommu; ++ u64 viommu_iqa = viommu->iqa; ++ struct qi_desc *wait_desc; ++ int len = IQ_DESC_LEN(viommu_iqa); ++ int shift = IQ_DESC_SHIFT(viommu_iqa); ++ int head = viommu->vreg.iq_head >> shift; ++ int count, i, ret = 0; ++ int *desc_status; ++ void *desc; ++ ++ viommu->vreg.iq_tail = val; ++ desc = pkvm_phys_to_virt(IQ_DESC_BASE_PHYS(viommu_iqa)); ++ count = ((val >> shift) + len - head) % len; ++ ++ for (i = 0; i < count; i++) { ++ viommu->vreg.iq_head = ((head + i) % len) << shift; ++ ret = handle_descriptor(iommu, desc + viommu->vreg.iq_head); ++ if (ret) ++ break; ++ } ++ /* update iq_head */ ++ viommu->vreg.iq_head = val; ++ ++ if (likely(!ret)) { ++ /* ++ * Submit the descriptor to hardware. The desc_status ++ * will be taken care by hardware. ++ */ ++ handle_qi_submit(iommu, desc, head, count); ++ } else { ++ pkvm_err("pkvm: %s: failed with ret %d\n", __func__, ret); ++ /* ++ * The descriptor seems invalid. Mark the desc_status as ++ * QI_ABORT to make sure host driver won't be blocked. 
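++ * (The host polls this status word itself; see handle_qi_submit().)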
++ */ ++ wait_desc = desc + (((head + count - 1) % len) << shift); ++ if (QI_DESC_TYPE(wait_desc->qw0) == QI_IWD_TYPE) { ++ desc_status = pkvm_phys_to_virt(wait_desc->qw1); ++ WRITE_ONCE(*desc_status, QI_ABORT); ++ } ++ } ++ ++ return ret; ++} ++ ++static void handle_gcmd_te(struct pkvm_iommu *iommu, bool en) ++{ ++ unsigned long vaddr = 0, vaddr_end = MAX_NUM_OF_ADDRESS_SPACE(iommu); ++ struct pkvm_viommu *viommu = &iommu->viommu; ++ ++ if (en) { ++ viommu->vreg.gsts |= DMA_GSTS_TES; ++ /* ++ * Sync shadow id table to emulate Translation enable. ++ */ ++ if (sync_shadow_id(iommu, vaddr, vaddr_end, 0, NULL)) ++ return; ++ pkvm_dbg("pkvm: %s: enable TE\n", __func__); ++ goto out; ++ } ++ ++ /* ++ * Free shadow to emulate Translation disable. ++ * ++ * Not really disable translation as still ++ * need to protect against the device. ++ */ ++ free_shadow_id(iommu, vaddr, vaddr_end); ++ viommu->vreg.gsts &= ~DMA_GSTS_TES; ++ pkvm_dbg("pkvm: %s: disable TE\n", __func__); ++out: ++ flush_context_cache(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); ++ if (ecap_smts(iommu->iommu.ecap)) ++ flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); ++ flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); ++ ++ root_tbl_walk(iommu); ++} ++ ++static void handle_gcmd_srtp(struct pkvm_iommu *iommu) ++{ ++ struct viommu_reg *vreg = &iommu->viommu.vreg; ++ struct pkvm_pgtable *vpgt = &iommu->viommu.pgt; ++ ++ vreg->gsts &= ~DMA_GSTS_RTPS; ++ ++ /* Set the root table phys address from vreg */ ++ vpgt->root_pa = vreg->rta & VTD_PAGE_MASK; ++ ++ pkvm_dbg("pkvm: %s: set SRTP val 0x%llx\n", __func__, vreg->rta); ++ ++ if (vreg->gsts & DMA_GSTS_TES) { ++ unsigned long vaddr = 0, vaddr_end = MAX_NUM_OF_ADDRESS_SPACE(iommu); ++ ++ /* TE is already enabled, sync shadow */ ++ if (sync_shadow_id(iommu, vaddr, vaddr_end, 0, NULL)) ++ return; ++ ++ flush_context_cache(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); ++ if (ecap_smts(iommu->iommu.ecap)) ++ flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); ++ flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); ++ } ++ ++ vreg->gsts |= DMA_GSTS_RTPS; ++ ++ root_tbl_walk(iommu); ++} ++ ++static void handle_gcmd_qie(struct pkvm_iommu *iommu, bool en) ++{ ++ struct viommu_reg *vreg = &iommu->viommu.vreg; ++ ++ if (en) { ++ if (vreg->iq_tail != 0) { ++ pkvm_err("pkvm: Queue invalidation descriptor tail is not zero\n"); ++ return; ++ } ++ ++ /* Update the iqa from vreg */ ++ iommu->viommu.iqa = vreg->iqa; ++ vreg->iq_head = 0; ++ vreg->gsts |= DMA_GSTS_QIES; ++ pkvm_dbg("pkvm: %s: enabled QI\n", __func__); ++ return; ++ } ++ ++ if (vreg->iq_head != vreg->iq_tail) { ++ pkvm_err("pkvm: Queue invalidation descriptor is not empty yet\n"); ++ return; ++ } ++ ++ vreg->iq_head = 0; ++ vreg->gsts &= ~DMA_GSTS_QIES; ++ pkvm_dbg("pkvm: %s: disabled QI\n", __func__); ++} ++ ++static void handle_gcmd_direct(struct pkvm_iommu *iommu, u32 val) ++{ ++ struct viommu_reg *vreg = &iommu->viommu.vreg; ++ unsigned long changed = ((vreg->gsts ^ val) & DMAR_GCMD_DIRECT) & ++ DMAR_GSTS_EN_BITS; ++ unsigned long set = (val & DMAR_GCMD_DIRECT) & ~DMAR_GSTS_EN_BITS; ++ u32 cmd, gcmd, sts; ++ int bit; ++ ++ if ((changed | set) & DMAR_GCMD_PROTECTED) { ++ pkvm_dbg("pkvm:%s touching protected bits changed 0x%lx set 0x%lx\n", ++ __func__, changed, set); ++ return; ++ } ++ ++ if (changed) { ++ pkvm_dbg("pkvm: %s: changed 0x%lx\n", __func__, changed); ++ gcmd = READ_ONCE(iommu->iommu.gcmd); ++ for_each_set_bit(bit, &changed, BITS_PER_BYTE * sizeof(vreg->gsts)) { ++ cmd = 1 << bit; ++ if (val & cmd) { ++ /* enable */ ++ gcmd |= cmd; ++ 
writel(gcmd, iommu->iommu.reg + DMAR_GCMD_REG); ++ PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG, ++ readl, (sts & cmd), sts); ++ vreg->gsts |= cmd; ++ pkvm_dbg("pkvm: %s: enable cmd bit %d\n", __func__, bit); ++ } else { ++ /* disable */ ++ gcmd &= ~cmd; ++ writel(gcmd, iommu->iommu.reg + DMAR_GCMD_REG); ++ PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG, ++ readl, !(sts & cmd), sts); ++ vreg->gsts &= ~cmd; ++ pkvm_dbg("pkvm: %s: disable cmd bit %d\n", __func__, bit); ++ } ++ } ++ WRITE_ONCE(iommu->iommu.gcmd, gcmd); ++ } ++ ++ if (set) { ++ pkvm_dbg("pkvm: %s: set 0x%lx\n", __func__, set); ++ gcmd = READ_ONCE(iommu->iommu.gcmd); ++ for_each_set_bit(bit, &set, BITS_PER_BYTE * sizeof(vreg->gsts)) { ++ cmd = 1 << bit; ++ vreg->gsts &= ~cmd; ++ writel(gcmd | cmd, iommu->iommu.reg + DMAR_GCMD_REG); ++ PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG, ++ readl, (sts & cmd), sts); ++ vreg->gsts |= cmd; ++ pkvm_dbg("pkvm: %s: set cmd bit %d\n", __func__, bit); ++ } ++ } ++} ++ ++static void handle_global_cmd(struct pkvm_iommu *iommu, u32 val) ++{ ++ u32 changed = iommu->viommu.vreg.gsts ^ val; ++ ++ pkvm_dbg("pkvm: iommu%d: handle gcmd val 0x%x gsts 0x%x changed 0x%x\n", ++ iommu->iommu.seq_id, val, iommu->viommu.vreg.gsts, changed); ++ ++ if (changed & DMA_GCMD_TE) ++ handle_gcmd_te(iommu, !!(val & DMA_GCMD_TE)); ++ ++ if (val & DMA_GCMD_SRTP) ++ handle_gcmd_srtp(iommu); ++ ++ if (changed & DMA_GCMD_QIE) ++ handle_gcmd_qie(iommu, !!(val & DMA_GCMD_QIE)); ++ ++ handle_gcmd_direct(iommu, val); ++} ++ ++static struct pkvm_iommu *find_iommu_by_reg_phys(unsigned long phys) ++{ ++ struct pkvm_iommu *iommu; ++ ++ for_each_valid_iommu(iommu) { ++ if ((phys >= iommu->iommu.reg_phys) && ++ (phys < (iommu->iommu.reg_phys + iommu->iommu.reg_size))) ++ return iommu; ++ } ++ ++ return NULL; ++} ++ ++static unsigned long direct_access_iommu_mmio(struct pkvm_iommu *iommu, ++ bool is_read, int len, ++ unsigned long phys, ++ unsigned long val) ++{ ++ unsigned long offset = phys - iommu->iommu.reg_phys; ++ void *reg = iommu->iommu.reg + offset; ++ unsigned long ret = 0; ++ ++ switch (len) { ++ case 4: ++ if (is_read) ++ ret = (unsigned long)readl(reg); ++ else ++ writel((u32)val, reg); ++ break; ++ case 8: ++ if (is_read) ++ ret = (unsigned long)readq(reg); ++ else ++ writeq((u64)val, reg); ++ break; ++ default: ++ pkvm_err("%s: %s: unsupported len %d\n", __func__, ++ is_read ? 
"read" : "write", len); ++ break; ++ } ++ ++ return ret; ++} ++ ++static unsigned long access_iommu_mmio(struct pkvm_iommu *iommu, bool is_read, ++ int len, unsigned long phys, ++ unsigned long val) ++{ ++ struct pkvm_viommu *viommu = &iommu->viommu; ++ unsigned long offset = phys - iommu->iommu.reg_phys; ++ unsigned long ret = 0; ++ ++ /* pkvm IOMMU driver is not activated yet, so directly access MMIO */ ++ if (unlikely(!iommu->activated)) ++ return direct_access_iommu_mmio(iommu, is_read, len, phys, val); ++ ++ /* Only need to emulate part of the MMIO */ ++ switch (offset) { ++ case DMAR_CAP_REG: ++ if (is_read) ++ ret = viommu->vreg.cap; ++ break; ++ case DMAR_ECAP_REG: ++ if (is_read) ++ ret = viommu->vreg.ecap; ++ break; ++ case DMAR_GCMD_REG: ++ if (is_read) ++ ret = 0; ++ else ++ handle_global_cmd(iommu, val); ++ break; ++ case DMAR_GSTS_REG: ++ if (is_read) ++ ret = viommu->vreg.gsts; ++ break; ++ case DMAR_RTADDR_REG: ++ if (is_read) ++ ret = viommu->vreg.rta; ++ else ++ viommu->vreg.rta = val; ++ break; ++ case DMAR_IQA_REG: ++ if (is_read) ++ ret = viommu->vreg.iqa; ++ else ++ viommu->vreg.iqa = val; ++ break; ++ case DMAR_IQH_REG: ++ if (is_read) ++ ret = viommu->vreg.iq_head; ++ break; ++ case DMAR_IQT_REG: ++ if (is_read) ++ ret = viommu->vreg.iq_tail; ++ else { ++ if (viommu->vreg.gsts & DMA_GSTS_QIES) ++ ret = handle_qi_invalidation(iommu, val); ++ else ++ viommu->vreg.iq_tail = val; ++ } ++ break; ++ default: ++ /* Not emulated MMIO can directly goes to hardware */ ++ ret = direct_access_iommu_mmio(iommu, is_read, len, phys, val); ++ break; ++ } ++ ++ return ret; ++} ++ ++unsigned long pkvm_access_iommu(bool is_read, int len, unsigned long phys, unsigned long val) ++{ ++ struct pkvm_iommu *pkvm_iommu = find_iommu_by_reg_phys(phys); ++ unsigned long ret; ++ ++ if (!pkvm_iommu) { ++ pkvm_err("%s: cannot find pkvm iommu for reg 0x%lx\n", ++ __func__, phys); ++ return 0; ++ } ++ ++ pkvm_spin_lock(&pkvm_iommu->lock); ++ ret = access_iommu_mmio(pkvm_iommu, is_read, len, phys, val); ++ pkvm_spin_unlock(&pkvm_iommu->lock); ++ ++ return ret; ++} ++ ++int pkvm_activate_iommu(void) ++{ ++ struct pkvm_iommu *iommu; ++ int ret = 0; ++ ++ for_each_valid_iommu(iommu) { ++ ret = activate_iommu(iommu); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++bool is_mem_range_overlap_iommu(unsigned long start, unsigned long end) ++{ ++ struct pkvm_iommu *iommu; ++ ++ for_each_valid_iommu(iommu) { ++ if (end < iommu->iommu.reg_phys || ++ start > (iommu->iommu.reg_phys + iommu->iommu.reg_size - 1)) ++ continue; ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * TODO: ++ * Currently assume that the bdf/pasid has ever been synced ++ * so that the IOMMU can be found. If not synced, then cannot ++ * get a valid IOMMU by calling this function. ++ * ++ * To handle this case, pKVM IOMMU driver needs to check the ++ * DMAR to know which IOMMU should be used for this bdf/pasid. ++ */ ++static struct pkvm_iommu *bdf_pasid_to_iommu(u16 bdf, u32 pasid) ++{ ++ struct pkvm_iommu *iommu, *find = NULL; ++ struct pkvm_ptdev *p; ++ ++ for_each_valid_iommu(iommu) { ++ pkvm_spin_lock(&iommu->lock); ++ list_for_each_entry(p, &iommu->ptdev_head, iommu_node) { ++ if (match_ptdev(p, bdf, pasid)) { ++ find = iommu; ++ break; ++ } ++ } ++ pkvm_spin_unlock(&iommu->lock); ++ if (find) ++ break; ++ } ++ ++ return find; ++} ++ ++/* ++ * pkvm_iommu_sync() - Sync IOMMU context/pasid entry according to a ptdev ++ * ++ * @bdf/pasid: The corresponding IOMMU page table entry needs to sync. 
++ */ ++int pkvm_iommu_sync(u16 bdf, u32 pasid) ++{ ++ struct pkvm_iommu *iommu = bdf_pasid_to_iommu(bdf, pasid); ++ unsigned long id_addr, id_addr_end; ++ struct pkvm_ptdev *ptdev; ++ u16 old_did; ++ int ret; ++ ++ if (!iommu) ++ return -ENODEV; ++ ++ ptdev = pkvm_get_ptdev(bdf, pasid); ++ if (!ptdev) ++ return -ENODEV; ++ ++ old_did = ptdev->did; ++ ++ if (ecap_smts(iommu->iommu.ecap)) { ++ id_addr = ((unsigned long)bdf << DEVFN_SHIFT) | ++ ((unsigned long)pasid & ((1UL << MAX_NR_PASID_BITS) - 1)); ++ id_addr_end = id_addr + 1; ++ } else { ++ id_addr = (unsigned long)bdf << LM_DEVFN_SHIFT; ++ id_addr_end = ((unsigned long)bdf + 1) << LM_DEVFN_SHIFT; ++ } ++ ++ pkvm_spin_lock(&iommu->lock); ++ ret = sync_shadow_id(iommu, id_addr, id_addr_end, 0, NULL); ++ if (!ret) { ++ if (old_did != ptdev->did) { ++ /* Flush pasid cache and IOTLB for the valid old_did */ ++ if (ecap_smts(iommu->iommu.ecap)) ++ flush_pasid_cache(iommu, old_did, QI_PC_PASID_SEL, pasid); ++ else ++ flush_context_cache(iommu, old_did, 0, 0, DMA_CCMD_DOMAIN_INVL); ++ flush_iotlb(iommu, old_did, 0, 0, DMA_TLB_DSI_FLUSH); ++ } ++ ++ /* Flush pasid cache and IOTLB to make sure no stale TLB for the new did */ ++ if (ecap_smts(iommu->iommu.ecap)) ++ flush_pasid_cache(iommu, ptdev->did, QI_PC_PASID_SEL, pasid); ++ else ++ flush_context_cache(iommu, ptdev->did, 0, 0, DMA_CCMD_DOMAIN_INVL); ++ flush_iotlb(iommu, ptdev->did, 0, 0, DMA_TLB_DSI_FLUSH); ++ } ++ pkvm_spin_unlock(&iommu->lock); ++ ++ pkvm_put_ptdev(ptdev); ++ return ret; ++} ++ ++bool pkvm_iommu_coherency(u16 bdf, u32 pasid) ++{ ++ struct pkvm_iommu *iommu = bdf_pasid_to_iommu(bdf, pasid); ++ ++ /* ++ * If cannot find a valid IOMMU by bdf/pasid, return ++ * false to present noncoherent, so that can guarantee ++ * the coherency through flushing cache by pkvm itself. ++ */ ++ if (!iommu) ++ return false; ++ ++ return iommu_coherency(iommu->iommu.ecap); ++} ++ ++struct iotlb_flush_data { ++ unsigned long desired_root_pa; ++ unsigned long addr; ++ int size_order; ++ struct qi_desc *desc; ++ int desc_max_index; ++}; ++ ++static void iommu_flush_iotlb(struct pkvm_iommu *iommu, struct iotlb_flush_data *data) ++{ ++ struct pkvm_ptdev *ptdev; ++ struct qi_desc *desc = data->desc; ++ int qi_desc_index = 0; ++ ++ pkvm_spin_lock(&iommu->lock); ++ ++ /* No need to flush IOTLB if there is no device on this IOMMU */ ++ if (list_empty(&iommu->ptdev_head)) ++ goto out; ++ ++ /* ++ * If the descriptor buffer is NULL, pKVM has to submit the QI ++ * request one by one which may be slow if there are a lot of ++ * devices connected to this IOMMU unit. So in this case, choose ++ * to submit one single global flush request to flush the IOTLB ++ * for all the devices. ++ */ ++ if (!desc) { ++ flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); ++ goto out; ++ } ++ ++ /* Flush per domain */ ++ list_for_each_entry(ptdev, &iommu->ptdev_head, iommu_node) { ++ struct qi_desc *tmp = desc; ++ bool did_exist = false; ++ int i; ++ ++ if (!ptdev->pgt || ptdev->pgt->root_pa != data->desired_root_pa) ++ continue; ++ ++ for (i = 0; i < qi_desc_index; i++, tmp++) { ++ /* The same did is already in descriptor page */ ++ if (ptdev->did == QI_DESC_IOTLB_DID(tmp->qw0)) { ++ did_exist = true; ++ break; ++ } ++ } ++ ++ if (did_exist) ++ continue; ++ /* ++ * Setup the page-selective or domain-selective qi descriptor ++ * based on IOMMU capability, and submit to HW when qi descriptor ++ * number reaches to the maximum count. 
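++ * Duplicate DIDs were filtered out above, so each domain is flushed at most once per call.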
++ */ ++ if (cap_pgsel_inv(iommu->iommu.cap) && ++ data->size_order <= cap_max_amask_val(iommu->iommu.cap)) ++ setup_iotlb_qi_desc(iommu, desc + qi_desc_index++, ++ ptdev->did, data->addr, data->size_order, ++ DMA_TLB_PSI_FLUSH); ++ else ++ setup_iotlb_qi_desc(iommu, desc + qi_desc_index++, ++ ptdev->did, 0, 0, ++ DMA_TLB_DSI_FLUSH); ++ ++ if (qi_desc_index == data->desc_max_index) { ++ submit_qi(iommu, desc, qi_desc_index); ++ qi_desc_index = 0; ++ } ++ } ++ ++ if (qi_desc_index) ++ submit_qi(iommu, desc, qi_desc_index); ++out: ++ pkvm_spin_unlock(&iommu->lock); ++} ++ ++void pkvm_iommu_flush_iotlb(struct pkvm_pgtable *pgt, unsigned long addr, unsigned long size) ++{ ++ int size_order = ilog2(__roundup_pow_of_two(size >> VTD_PAGE_SHIFT)); ++ struct iotlb_flush_data data = { ++ .desired_root_pa = pgt->root_pa, ++ .addr = ALIGN_DOWN(addr, (1ULL << (VTD_PAGE_SHIFT + size_order))), ++ .size_order = size_order, ++ }; ++ struct pkvm_iommu *iommu; ++ ++ data.desc = iommu_zalloc_pages(PKVM_QI_DESC_ALIGNED_SIZE); ++ if (data.desc) ++ /* Reserve space for one wait desc and one desc between head and tail */ ++ data.desc_max_index = PKVM_QI_DESC_ALIGNED_SIZE / sizeof(struct qi_desc) - 2; ++ ++ for_each_valid_iommu(iommu) ++ iommu_flush_iotlb(iommu, &data); ++ ++ if (data.desc) ++ iommu_put_page(data.desc); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/iommu.h b/arch/x86/kvm/vmx/pkvm/hyp/iommu.h +new file mode 100644 +index 000000000000..dd7fc31373e0 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/iommu.h +@@ -0,0 +1,16 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_IOMMU_H_ ++#define _PKVM_IOMMU_H_ ++ ++int pkvm_init_iommu(unsigned long mem_base, unsigned long nr_pages); ++unsigned long pkvm_access_iommu(bool is_read, int len, unsigned long reg, unsigned long val); ++bool is_mem_range_overlap_iommu(unsigned long start, unsigned long end); ++int pkvm_activate_iommu(void); ++int pkvm_iommu_sync(u16 bdf, u32 pasid); ++bool pkvm_iommu_coherency(u16 bdf, u32 pasid); ++void pkvm_iommu_flush_iotlb(struct pkvm_pgtable *pgt, unsigned long addr, unsigned long size); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/iommu_debug.c b/arch/x86/kvm/vmx/pkvm/hyp/iommu_debug.c +new file mode 100644 +index 000000000000..9dfadadc2b74 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/iommu_debug.c +@@ -0,0 +1,199 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include "debug.h" ++#include "memory.h" ++#include "pgtable.h" ++#include "ept.h" ++#include "pkvm_hyp.h" ++#include "iommu_internal.h" ++ ++struct tbl_walk { ++ u16 bus; ++ u16 devfn; ++ u32 pasid; ++ struct root_entry *rt_entry; ++ struct context_entry *ctx_entry; ++ struct pasid_entry *pasid_tbl_entry; ++}; ++ ++#define PASID_PDE_SHIFT 6 ++#define PASID_TBL_ENTRIES BIT(PASID_PDE_SHIFT) ++#define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7)) ++ ++static inline struct pasid_dir_entry *context_entry_present(struct context_entry *ce) ++{ ++ if (!(READ_ONCE(ce->lo) & 1)) ++ return NULL; ++ ++ return pkvm_phys_to_virt(READ_ONCE(ce->lo) & VTD_PAGE_MASK); ++} ++ ++/* Get PRESENT bit of a PASID directory entry. */ ++static inline bool pasid_pde_is_present(struct pasid_dir_entry *pde) ++{ ++ return READ_ONCE(pde->val) & 1; ++} ++ ++/* Get PASID table from a PASID directory entry. 
*/ ++static inline struct pasid_entry * ++get_pasid_table_from_pde(struct pasid_dir_entry *pde) ++{ ++ if (!pasid_pde_is_present(pde)) ++ return NULL; ++ ++ return pkvm_phys_to_virt(READ_ONCE(pde->val) & VTD_PAGE_MASK); ++} ++ ++static struct context_entry *context_addr(struct pkvm_iommu *iommu, u8 bus, u8 devfn) ++{ ++ struct root_entry *root_entry = pkvm_phys_to_virt(iommu->pgt.root_pa); ++ struct root_entry *root = &root_entry[bus]; ++ struct context_entry *context; ++ u64 *entry; ++ ++ entry = &root->lo; ++ if (ecap_smts(iommu->iommu.ecap)) { ++ if (devfn >= 0x80) { ++ devfn -= 0x80; ++ entry = &root->hi; ++ } ++ devfn *= 2; ++ } ++ ++ if (*entry & 1) ++ context = pkvm_phys_to_virt(*entry & VTD_PAGE_MASK); ++ else ++ return NULL; ++ ++ return &context[devfn]; ++} ++ ++static inline void print_tbl_walk(struct tbl_walk *tbl_wlk) ++{ ++ /* ++ * A legacy mode DMAR doesn't support PASID, hence default it to -1 ++ * indicating that it's invalid. Also, default all PASID related fields ++ * to 0. ++ */ ++ if (!tbl_wlk->pasid_tbl_entry) ++ pkvm_dbg("%02x:%02x.%x\t0x%016llx:0x%016llx\t0x%016llx:0x%016llx\t%-6d\t0x%016llx:0x%016llx:0x%016llx\n", ++ tbl_wlk->bus, PCI_SLOT(tbl_wlk->devfn), ++ PCI_FUNC(tbl_wlk->devfn), tbl_wlk->rt_entry->hi, ++ tbl_wlk->rt_entry->lo, tbl_wlk->ctx_entry->hi, ++ tbl_wlk->ctx_entry->lo, -1, ++ (u64)0, (u64)0, (u64)0); ++ else ++ pkvm_dbg("%02x:%02x.%x\t0x%016llx:0x%016llx\t0x%016llx:0x%016llx\t%-6d\t0x%016llx:0x%016llx:0x%016llx\n", ++ tbl_wlk->bus, PCI_SLOT(tbl_wlk->devfn), ++ PCI_FUNC(tbl_wlk->devfn), tbl_wlk->rt_entry->hi, ++ tbl_wlk->rt_entry->lo, tbl_wlk->ctx_entry->hi, ++ tbl_wlk->ctx_entry->lo, tbl_wlk->pasid, ++ tbl_wlk->pasid_tbl_entry->val[2], ++ tbl_wlk->pasid_tbl_entry->val[1], ++ tbl_wlk->pasid_tbl_entry->val[0]); ++} ++ ++static void pasid_tbl_walk(struct tbl_walk *tbl_wlk, struct pasid_entry *tbl_entry, u16 dir_idx) ++{ ++ u8 tbl_idx; ++ ++ for (tbl_idx = 0; tbl_idx < PASID_TBL_ENTRIES; tbl_idx++) { ++ if (pasid_pte_is_present(tbl_entry)) { ++ tbl_wlk->pasid_tbl_entry = tbl_entry; ++ tbl_wlk->pasid = (dir_idx << PASID_PDE_SHIFT) + tbl_idx; ++ print_tbl_walk(tbl_wlk); ++ } ++ ++ tbl_entry++; ++ } ++} ++ ++static void pasid_dir_walk(struct tbl_walk *tbl_wlk, u64 pasid_dir_ptr, ++ u16 pasid_dir_size) ++{ ++ struct pasid_dir_entry *dir_entry = pkvm_phys_to_virt(pasid_dir_ptr); ++ struct pasid_entry *pasid_tbl; ++ u16 dir_idx; ++ ++ for (dir_idx = 0; dir_idx < pasid_dir_size; dir_idx++) { ++ pasid_tbl = get_pasid_table_from_pde(dir_entry); ++ if (pasid_tbl) ++ pasid_tbl_walk(tbl_wlk, pasid_tbl, dir_idx); ++ ++ dir_entry++; ++ } ++} ++ ++static void ctx_tbl_walk(struct pkvm_iommu *iommu, u16 bus) ++{ ++ struct root_entry *root_entry = pkvm_phys_to_virt(iommu->pgt.root_pa); ++ struct context_entry *context; ++ u16 devfn, pasid_dir_size; ++ u64 pasid_dir_ptr; ++ ++ for (devfn = 0; devfn < 256; devfn++) { ++ struct tbl_walk tbl_wlk = {0}; ++ ++ /* ++ * Scalable mode root entry points to upper scalable mode ++ * context table and lower scalable mode context table. Each ++ * scalable mode context table has 128 context entries whereas ++ * legacy mode context table has 256 context entries. So in ++ * scalable mode, the context entries for the former 128 devices are ++ * in the lower scalable mode context table, while the latter ++ * 128 devices are in the upper scalable mode context table. 
++ * In scalable mode, when devfn > 127, iommu_context_addr() ++ * automatically refers to the upper scalable mode context table and ++ * hence the caller doesn't have to worry about differences ++ * between scalable mode and non scalable mode. ++ */ ++ context = context_addr(iommu, bus, devfn); ++ if (!context) ++ return; ++ ++ if (!context_entry_present(context)) ++ continue; ++ ++ tbl_wlk.bus = bus; ++ tbl_wlk.devfn = devfn; ++ tbl_wlk.rt_entry = &root_entry[bus]; ++ tbl_wlk.ctx_entry = context; ++ ++ if (ecap_smts(iommu->iommu.ecap)) { ++ pasid_dir_ptr = context->lo & VTD_PAGE_MASK; ++ pasid_dir_size = get_pasid_dir_size(context); ++ pasid_dir_walk(&tbl_wlk, pasid_dir_ptr, pasid_dir_size); ++ continue; ++ } ++ ++ print_tbl_walk(&tbl_wlk); ++ } ++} ++ ++void root_tbl_walk(struct pkvm_iommu *iommu) ++{ ++ u16 bus; ++ ++ pkvm_dbg("IOMMU %d: Root Table Address: 0x%llx\n", ++ iommu->iommu.seq_id, (u64)iommu->pgt.root_pa); ++ pkvm_dbg("B.D.F\tRoot_entry\t\t\t\tContext_entry\t\t\t\tPASID\tPASID_table_entry\n"); ++ ++ /* ++ * No need to check if the root entry is present or not because ++ * iommu_context_addr() performs the same check before returning ++ * context entry. ++ */ ++ for (bus = 0; bus < 256; bus++) ++ ctx_tbl_walk(iommu, bus); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/iommu_internal.h b/arch/x86/kvm/vmx/pkvm/hyp/iommu_internal.h +new file mode 100644 +index 000000000000..35b78fe21d48 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/iommu_internal.h +@@ -0,0 +1,347 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef __PKVM_IOMMU_INTERNAL_H ++#define __PKVM_IOMMU_INTERNAL_H ++ ++#include ++#include ++#include ++#include "pgtable.h" ++ ++#define PKVM_QI_DESC_ALIGNED_SIZE ALIGN(QI_LENGTH * sizeof(struct qi_desc), PAGE_SIZE) ++#define PKVM_QI_DESC_STATUS_ALIGNED_SIZE ALIGN(QI_LENGTH * sizeof(int), PAGE_SIZE) ++ ++struct viommu_reg { ++ u64 cap; ++ u64 ecap; ++ u32 gsts; ++ u64 rta; ++ u64 iq_head; ++ u64 iq_tail; ++ u64 iqa; ++}; ++ ++struct pkvm_viommu { ++ struct pkvm_pgtable pgt; ++ struct viommu_reg vreg; ++ u64 iqa; ++}; ++ ++struct pkvm_iommu { ++ struct intel_iommu iommu; ++ pkvm_spinlock_t lock; ++ bool activated; ++ struct pkvm_pgtable pgt; ++ struct pkvm_viommu viommu; ++ ++ struct q_inval qi; ++ pkvm_spinlock_t qi_lock; ++ u64 piommu_iqa; ++ ++ /* Link ptdev information of this IOMMU */ ++ struct list_head ptdev_head; ++}; ++ ++enum lm_level { ++ IOMMU_LM_CONTEXT = 1, ++ IOMMU_LM_ROOT, ++}; ++ ++enum sm_level { ++ IOMMU_PASID_TABLE = 1, ++ IOMMU_PASID_DIR, ++ IOMMU_SM_CONTEXT, ++ IOMMU_SM_ROOT, ++ IOMMU_SM_LEVEL_NUM, ++}; ++ ++#define LAST_LEVEL(level) \ ++ (((level) == 1) ? 
true : false) ++ ++#define LM_DEVFN_BITS 8 ++#define LM_DEVFN_SHIFT 0 ++ ++#define LM_BUS_BITS 8 ++#define LM_BUS_SHIFT 8 ++#define IOMMU_LM_MAX_VADDR BIT(16) ++ ++#define PASID_PTE_PRESENT 1 ++#define PASID_PTE_FPD 2 ++#define MAX_NR_PASID_BITS PKVM_MAX_PASID_BITS ++ ++#define PASIDTAB_BITS 6 ++#define PASIDTAB_SHIFT 0 ++ ++#define PASIDDIR_BITS (MAX_NR_PASID_BITS - PASIDTAB_BITS) ++#define PASIDDIR_SHIFT PASIDTAB_BITS ++ ++#define DEVFN_BITS 8 ++#define DEVFN_SHIFT (PASIDDIR_SHIFT + PASIDDIR_BITS) ++ ++#define BUS_BITS 8 ++#define BUS_SHIFT (DEVFN_SHIFT + DEVFN_BITS) ++ ++/* Used to calculate the level-to-index */ ++#define SM_DEVFN_BITS 7 ++#define SM_BUS_BITS 9 ++#define SM_BUS_SHIFT (DEVFN_SHIFT + SM_DEVFN_BITS) ++ ++#define IOMMU_MAX_VADDR_LEN (BUS_SHIFT + BUS_BITS) ++#define IOMMU_MAX_VADDR BIT(IOMMU_MAX_VADDR_LEN) ++ ++#define MAX_NUM_OF_ADDRESS_SPACE(_iommu) \ ++ (ecap_smts((_iommu)->iommu.ecap) ? \ ++ IOMMU_MAX_VADDR : IOMMU_LM_MAX_VADDR) ++ ++#define DMAR_GSTS_EN_BITS (DMA_GCMD_TE | DMA_GCMD_EAFL | \ ++ DMA_GCMD_QIE | DMA_GCMD_IRE | \ ++ DMA_GCMD_CFI) ++#define DMAR_GCMD_PROTECTED (DMA_GCMD_TE | DMA_GCMD_SRTP | \ ++ DMA_GCMD_QIE) ++#define DMAR_GCMD_DIRECT (DMA_GCMD_SFL | DMA_GCMD_EAFL | \ ++ DMA_GCMD_WBF | DMA_GCMD_IRE | \ ++ DMA_GCMD_SIRTP | DMA_GCMD_CFI) ++ ++#define PKVM_IOMMU_WAIT_OP(offset, op, cond, sts) \ ++do { \ ++ while (1) { \ ++ (sts) = op(offset); \ ++ if (cond) \ ++ break; \ ++ cpu_relax(); \ ++ } \ ++} while (0) ++ ++#define IQ_DESC_BASE_PHYS(reg) ((reg) & ~0xfff) ++#define IQ_DESC_DW(reg) (((reg) >> 11) & 1) ++#define IQ_DESC_QS(reg) ((reg) & GENMASK_ULL(2, 0)) ++#define IQ_DESC_LEN(reg) (1 << (7 + IQ_DESC_QS(reg) + !IQ_DESC_DW(reg))) ++#define IQ_DESC_SHIFT(reg) (4 + IQ_DESC_DW(reg)) ++ ++#define QI_DESC_TYPE(qw) ((qw) & GENMASK_ULL(3, 0)) ++#define QI_DESC_CC_GRANU(qw) (((qw) & GENMASK_ULL(5, 4)) >> 4) ++#define QI_DESC_CC_DID(qw) (((qw) & GENMASK_ULL(31, 16)) >> 16) ++#define QI_DESC_CC_SID(qw) (((qw) & GENMASK_ULL(47, 32)) >> 32) ++ ++#define QI_DESC_PC_GRANU(qw) (((qw) & GENMASK_ULL(5, 4)) >> 4) ++#define QI_DESC_PC_DID(qw) (((qw) & GENMASK_ULL(31, 16)) >> 16) ++#define QI_DESC_PC_PASID(qw) (((qw) & GENMASK_ULL(51, 32)) >> 32) ++ ++#define QI_DESC_IOTLB_GRANU(qw) (((qw) & GENMASK_ULL(5, 4)) >> 4) ++#define QI_DESC_IOTLB_DID(qw) (((qw) & GENMASK_ULL(31, 16)) >> 16) ++#define QI_DESC_IOTLB_ADDR(qw) ((qw) & VTD_PAGE_MASK) ++#define QI_DESC_IOTLB_AM(qw) ((qw) & GENMASK_ULL(5, 0)) ++ ++#define pgt_to_pkvm_iommu(_pgt) container_of(_pgt, struct pkvm_iommu, pgt) ++ ++struct pasid_dir_entry { ++ u64 val; ++}; ++ ++struct pasid_entry { ++ u64 val[8]; ++}; ++ ++static inline void entry_set_bits(u64 *ptr, u64 mask, u64 bits) ++{ ++ u64 old; ++ ++ old = READ_ONCE(*ptr); ++ WRITE_ONCE(*ptr, (old & ~mask) | bits); ++} ++ ++static inline void context_sm_clear_dte(struct context_entry *ce) ++{ ++ entry_set_bits(&ce->lo, 1 << 2, 0); ++} ++ ++static inline bool context_lm_is_present(struct context_entry *ce) ++{ ++ return READ_ONCE(ce->lo) & 1; ++} ++ ++static inline u8 context_lm_get_tt(struct context_entry *ce) ++{ ++ return (READ_ONCE(ce->lo) >> 2) & 3; ++} ++ ++static inline u64 context_lm_get_slptr(struct context_entry *ce) ++{ ++ return READ_ONCE(ce->lo) & VTD_PAGE_MASK; ++} ++ ++static inline u8 context_lm_get_aw(struct context_entry *ce) ++{ ++ return READ_ONCE(ce->hi) & 0x7; ++} ++ ++static inline u16 context_lm_get_did(struct context_entry *ce) ++{ ++ return (READ_ONCE(ce->hi) >> 8) & 0xffff; ++} ++ ++static inline void context_lm_set_tt(struct context_entry *ce, u8 
value) ++{ ++ entry_set_bits(&ce->lo, 3 << 2, value << 2); ++} ++ ++static inline void context_lm_set_slptr(struct context_entry *ce, u64 value) ++{ ++ entry_set_bits(&ce->lo, VTD_PAGE_MASK, value); ++} ++ ++static inline void context_lm_set_aw(struct context_entry *ce, u8 value) ++{ ++ entry_set_bits(&ce->hi, 0x7, value); ++} ++ ++/* Get PRESENT bit of a PASID table entry. */ ++static inline bool pasid_pte_is_present(struct pasid_entry *pte) ++{ ++ return READ_ONCE(pte->val[0]) & PASID_PTE_PRESENT; ++} ++ ++/* Get PGTT field of a PASID table entry */ ++static inline u16 pasid_pte_get_pgtt(struct pasid_entry *pte) ++{ ++ return (u16)((READ_ONCE(pte->val[0]) >> 6) & 0x7); ++} ++ ++/* ++ * Interfaces for PASID table entry manipulation: ++ */ ++static inline void pasid_clear_entry(struct pasid_entry *pe) ++{ ++ WRITE_ONCE(pe->val[0], 0); ++ WRITE_ONCE(pe->val[1], 0); ++ WRITE_ONCE(pe->val[2], 0); ++ WRITE_ONCE(pe->val[3], 0); ++ WRITE_ONCE(pe->val[4], 0); ++ WRITE_ONCE(pe->val[5], 0); ++ WRITE_ONCE(pe->val[6], 0); ++ WRITE_ONCE(pe->val[7], 0); ++} ++ ++/* ++ * Get domain ID value of a scalable mode PASID entry. ++ */ ++static inline u16 ++pasid_get_domain_id(struct pasid_entry *pe) ++{ ++ return (u16)(READ_ONCE(pe->val[1]) & GENMASK_ULL(15, 0)); ++} ++ ++/* ++ * Get the FLPTPTR(First Level Page Table Pointer) field (Bit 140 ~ 191) ++ * of a scalable mode PASID entry. ++ */ ++static inline u64 ++pasid_get_flptr(struct pasid_entry *pe) ++{ ++ return (u64)(READ_ONCE(pe->val[2]) & VTD_PAGE_MASK); ++} ++ ++/* ++ * Get the First Level Paging Mode field (Bit 130~131) of a ++ * scalable mode PASID entry. ++ */ ++static inline u8 ++pasid_get_flpm(struct pasid_entry *pe) ++{ ++ return (u8)((READ_ONCE(pe->val[2]) & GENMASK_ULL(3, 2)) >> 2); ++} ++ ++/* ++ * Setup the SLPTPTR(Second Level Page Table Pointer) field (Bit 12~63) ++ * of a scalable mode PASID entry. ++ */ ++static inline void ++pasid_set_slptr(struct pasid_entry *pe, u64 value) ++{ ++ entry_set_bits(&pe->val[0], VTD_PAGE_MASK, value); ++} ++ ++/* ++ * Setup the AW(Address Width) field (Bit 2~4) of a scalable mode PASID ++ * entry. ++ */ ++static inline void ++pasid_set_address_width(struct pasid_entry *pe, u64 value) ++{ ++ entry_set_bits(&pe->val[0], GENMASK_ULL(4, 2), value << 2); ++} ++ ++/* ++ * Setup the PGTT(PASID Granular Translation Type) field (Bit 6~8) ++ * of a scalable mode PASID entry. ++ */ ++static inline void ++pasid_set_translation_type(struct pasid_entry *pe, u64 value) ++{ ++ entry_set_bits(&pe->val[0], GENMASK_ULL(8, 6), value << 6); ++} ++ ++/* ++ * Setup Page Walk Snoop bit (Bit 87) of a scalable mode PASID ++ * entry. ++ */ ++static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value) ++{ ++ entry_set_bits(&pe->val[1], 1 << 23, value << 23); ++} ++ ++/* ++ * Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode ++ * PASID entry. ++ */ ++static inline void ++pasid_set_pgsnp(struct pasid_entry *pe) ++{ ++ entry_set_bits(&pe->val[1], 1ULL << 24, 1ULL << 24); ++} ++ ++#define PASID_ENTRY_PGTT_FL_ONLY (1) ++#define PASID_ENTRY_PGTT_SL_ONLY (2) ++#define PASID_ENTRY_PGTT_NESTED (3) ++#define PASID_ENTRY_PGTT_PT (4) ++ ++/* ++ * Set the Second Stage Execute Enable field (Bit 5) of a scalable mode ++ * PASID entry. ++ */ ++static inline void pasid_set_ssee(struct pasid_entry *pe, bool value) ++{ ++ entry_set_bits(&pe->val[0], 1 << 5, value << 5); ++} ++ ++/* ++ * Set the Second Stage Access/Dirty bit Enable field (Bit 9) of a scalable mode ++ * PASID entry. 
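++ * When enabled, the hardware may set the accessed/dirty bits in the
++ * second-stage paging entries referenced through this PASID entry.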
++ */ ++static inline void pasid_set_ssade(struct pasid_entry *pe, bool value) ++{ ++ entry_set_bits(&pe->val[0], 1 << 9, value << 9); ++} ++ ++static inline bool pasid_copy_entry(struct pasid_entry *to, struct pasid_entry *from) ++{ ++ bool updated = false; ++ int i; ++ ++ for (i = 0; i < 8; i++) { ++ u64 new = READ_ONCE(from->val[i]); ++ ++ if (READ_ONCE(to->val[i]) != new) { ++ WRITE_ONCE(to->val[i], new); ++ updated = true; ++ } ++ } ++ ++ return updated; ++} ++ ++extern void root_tbl_walk(struct pkvm_iommu *iommu); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.c b/arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.c +new file mode 100644 +index 000000000000..1da2fca89e5d +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.c +@@ -0,0 +1,106 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright(c) 2022 Intel Corporation. ++ * Copyright(c) 2023 Semihalf. ++ */ ++ ++#include ++#include ++#include ++#include ++#include "pkvm_hyp.h" ++#include "iommu_spgt.h" ++#include "ept.h" ++#include "bug.h" ++ ++static DEFINE_HASHTABLE(iommu_spgt_hasht, 8); ++static DECLARE_BITMAP(iommu_spgt_bitmap, PKVM_MAX_PDEV_NUM); ++static struct pkvm_iommu_spgt pkvm_iommu_spgt[PKVM_MAX_PDEV_NUM]; ++static pkvm_spinlock_t iommu_spgt_lock = { __ARCH_PKVM_SPINLOCK_UNLOCKED }; ++ ++struct pkvm_pgtable *pkvm_get_host_iommu_spgt(unsigned long root_gpa, bool coherency) ++{ ++ struct pkvm_iommu_spgt *spgt = NULL, *tmp; ++ unsigned long index; ++ int ret; ++ ++ pkvm_spin_lock(&iommu_spgt_lock); ++ ++ hash_for_each_possible(iommu_spgt_hasht, tmp, hnode, root_gpa) { ++ if (tmp->root_gpa == root_gpa) { ++ if (tmp->refcount > 0) { ++ spgt = tmp; ++ break; ++ } ++ } ++ } ++ ++ if (spgt) { ++ spgt->refcount++; ++ spgt->noncoherent_count += !coherency; ++ pkvm_shadow_sl_iommu_pgt_update_coherency(&spgt->pgt, ++ !spgt->noncoherent_count); ++ goto out; ++ } ++ ++ index = find_first_zero_bit(iommu_spgt_bitmap, PKVM_MAX_PDEV_NUM); ++ if (index < PKVM_MAX_PDEV_NUM) { ++ spgt = &pkvm_iommu_spgt[index]; ++ ++ ret = pkvm_pgtable_init(&spgt->pgt, ++ pkvm_shadow_sl_iommu_pgt_get_mm_ops(coherency), ++ &ept_ops, &pkvm_hyp->ept_cap, true); ++ if (ret) { ++ pkvm_err("%s: pgtable init failed err=%d\n", __func__, ret); ++ spgt = NULL; ++ goto out; ++ } ++ ++ __set_bit(index, iommu_spgt_bitmap); ++ spgt->root_gpa = root_gpa; ++ spgt->index = index; ++ spgt->refcount = 1; ++ spgt->noncoherent_count = !coherency; ++ hash_add(iommu_spgt_hasht, &spgt->hnode, root_gpa); ++ } ++out: ++ pkvm_spin_unlock(&iommu_spgt_lock); ++ ++ return spgt ? 
&spgt->pgt : NULL; ++} ++ ++void pkvm_put_host_iommu_spgt(struct pkvm_pgtable *pgt, bool coherency) ++{ ++ struct pkvm_iommu_spgt *spgt = NULL, *tmp; ++ int bkt; ++ ++ pkvm_spin_lock(&iommu_spgt_lock); ++ ++ hash_for_each(iommu_spgt_hasht, bkt, tmp, hnode) { ++ if (&tmp->pgt == pgt) { ++ spgt = tmp; ++ break; ++ } ++ } ++ PKVM_ASSERT(spgt); ++ PKVM_ASSERT(spgt->refcount > 0); ++ ++ if (--spgt->refcount > 0) { ++ spgt->noncoherent_count -= !coherency; ++ PKVM_ASSERT(spgt->noncoherent_count >= 0); ++ pkvm_shadow_sl_iommu_pgt_update_coherency(&spgt->pgt, ++ !spgt->noncoherent_count); ++ goto out; ++ } ++ ++ hash_del(&spgt->hnode); ++ ++ __clear_bit(spgt->index, iommu_spgt_bitmap); ++ ++ pkvm_pgtable_destroy(&spgt->pgt, NULL); ++ ++ memset(spgt, 0, sizeof(struct pkvm_iommu_spgt)); ++ ++out: ++ pkvm_spin_unlock(&iommu_spgt_lock); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.h b/arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.h +new file mode 100644 +index 000000000000..9fb4667318b3 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.h +@@ -0,0 +1,19 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright(c) 2022 Intel Corporation. ++ * Copyright(c) 2023 Semihalf. ++ */ ++ ++#include "pgtable.h" ++ ++struct pkvm_iommu_spgt { ++ int refcount; ++ int noncoherent_count; ++ struct hlist_node hnode; ++ unsigned long root_gpa; ++ unsigned long index; ++ struct pkvm_pgtable pgt; ++}; ++ ++struct pkvm_pgtable *pkvm_get_host_iommu_spgt(unsigned long root_gpa, bool coherency); ++void pkvm_put_host_iommu_spgt(struct pkvm_pgtable *spgt, bool coherency); +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/irq.c b/arch/x86/kvm/vmx/pkvm/hyp/irq.c +new file mode 100644 +index 000000000000..0580edb21313 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/irq.c +@@ -0,0 +1,60 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++#include "cpu.h" ++#include "pkvm_hyp.h" ++#include "debug.h" ++ ++void handle_noop(void) ++{ ++ pkvm_err("%s: unexpected exception\n", __func__); ++} ++ ++void handle_nmi(void) ++{ ++ int cpu_id = get_pcpu_id(); ++ struct pkvm_host_vcpu *pkvm_host_vcpu = ++ pkvm_hyp->host_vm.host_vcpus[cpu_id]; ++ struct vcpu_vmx *vmx = &pkvm_host_vcpu->vmx; ++ ++ if (!pkvm_host_vcpu || !vmx) ++ return; ++ ++ if (pkvm_host_vcpu->pending_nmi) { ++ pkvm_dbg("%s: CPU%d already has a pending NMI\n", ++ __func__, cpu_id); ++ return; ++ } ++ ++ /* load host vcpu vmcs for sure */ ++ vmcs_load(vmx->loaded_vmcs->vmcs); ++ ++ /* ++ * This NMI could happen either before executing ++ * the injection code or after. ++ * For the before case, should record a pending NMI. ++ * For the after case, if no NMI is injected in guest ++ * we also need to record a pending NMI. If NMI is ++ * injected already, it is not necessary to inject ++ * again but injecting it in the next round should also ++ * be fine. So simply record a pending NMI here. ++ */ ++ pkvm_host_vcpu->pending_nmi = true; ++ ++ pkvm_dbg("%s: CPU%d pending NMI\n", __func__, cpu_id); ++ ++ /* For case that when NMI happens the injection code is ++ * already executed, open the NMI window. For the case ++ * happens before, opening NMI window doesn't cause trouble. 
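++ * The resulting NMI-window VM exit gives pkvm another chance to
++ * inject the pending NMI recorded above.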
++ */ ++ _vmx_enable_nmi_window(vmx, false); ++ ++ /* switch if the current one is not host vcpu vmcs */ ++ if (pkvm_host_vcpu->current_vmcs && ++ (pkvm_host_vcpu->current_vmcs != vmx->loaded_vmcs->vmcs)) ++ vmcs_load(pkvm_host_vcpu->current_vmcs); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/lapic.c b/arch/x86/kvm/vmx/pkvm/hyp/lapic.c +new file mode 100644 +index 000000000000..19bd45f2d394 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/lapic.c +@@ -0,0 +1,222 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#include "pkvm.h" ++#include "cpu.h" ++#include "memory.h" ++#include "mmu.h" ++#include "pgtable.h" ++#include "bug.h" ++#include "pkvm_hyp.h" ++ ++struct pkvm_lapic { ++ bool x2apic; ++ u32 apic_id; ++ unsigned long apic_base_phys; ++ void *apic_base_va; ++}; ++ ++static struct pkvm_lapic pkvm_lapic[CONFIG_NR_CPUS]; ++ ++#define APIC_BASE_PHYS_MASK GENMASK_ULL(get_max_physaddr_bits(), 12) ++ ++static u32 __pkvm_lapic_read(struct pkvm_lapic *lapic, u32 reg) ++{ ++ u64 val; ++ ++ if (lapic->x2apic) ++ pkvm_rdmsrl(APIC_BASE_MSR + (reg >> 4), val); ++ else ++ val = readl(lapic->apic_base_va + reg); ++ ++ return (u32)val; ++} ++ ++static u64 __pkvm_lapic_icr_read(struct pkvm_lapic *lapic) ++{ ++ u64 val; ++ ++ if (lapic->x2apic) ++ pkvm_rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); ++ else { ++ u64 icr2; ++ ++ icr2 = readl(lapic->apic_base_va + APIC_ICR2); ++ val = readl(lapic->apic_base_va + APIC_ICR); ++ val |= icr2 << 32; ++ } ++ ++ return val; ++} ++ ++static void __pkvm_wait_icr_idle(struct pkvm_lapic *lapic) ++{ ++ /* x2apic mode doesn't have delivery status bit */ ++ if (lapic->x2apic) ++ return; ++ ++ while (__pkvm_lapic_icr_read(lapic) & APIC_ICR_BUSY) ++ cpu_relax(); ++} ++ ++static void __pkvm_lapic_icr_write(struct pkvm_lapic *lapic, u32 low, u32 id) ++{ ++ if (lapic->x2apic) ++ pkvm_wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ++ low | ((u64)id << 32)); ++ else { ++ writel(id, lapic->apic_base_va + APIC_ICR2); ++ writel(low, lapic->apic_base_va + APIC_ICR); ++ __pkvm_wait_icr_idle(lapic); ++ } ++} ++ ++static int __pkvm_setup_lapic(struct pkvm_lapic *lapic, u64 apicbase) ++{ ++ /* Not allow lapic to be disabled as it will be used for kick */ ++ PKVM_ASSERT(apicbase & (X2APIC_ENABLE | XAPIC_ENABLE)); ++ ++ if (!(apicbase & X2APIC_ENABLE)) { ++ unsigned long base_phys = apicbase & APIC_BASE_PHYS_MASK; ++ void *vaddr = pkvm_iophys_to_virt(base_phys); ++ ++ if ((unsigned long)vaddr == INVALID_ADDR) ++ return -EINVAL; ++ ++ if ((lapic->apic_base_phys == base_phys) && ++ (lapic->apic_base_va == vaddr)) ++ goto done; ++ ++ /* unmap the previous MMIO mapping then map the new one */ ++ if (lapic->apic_base_va) { ++ pkvm_mmu_unmap((unsigned long)lapic->apic_base_va, ++ PAGE_SIZE); ++ lapic->apic_base_phys = 0; ++ lapic->apic_base_va = NULL; ++ } ++ ++ if (pkvm_mmu_map((unsigned long)vaddr, base_phys, PAGE_SIZE, ++ 0, PKVM_PAGE_IO_NOCACHE)) ++ return -ENOMEM; ++ ++ lapic->apic_base_phys = base_phys; ++ lapic->apic_base_va = vaddr; ++ lapic->x2apic = false; ++ } else ++ lapic->x2apic = true; ++done: ++ /* ++ * APIC_ID reg is writable for primary VM so it is ++ * possible for primary VM to change the APIC_ID. ++ * So pkvm should have a way to intercept the APIC_ID ++ * changing. For x2apic mode, this can be done through ++ * intercepting the APIC_ID msr write. ++ * ++ * TODO: handling the APIC_ID changing for xapic mode. 
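++ * The apic_id cached below is what pkvm_lapic_send_init() uses as the
++ * IPI destination; pkvm_x2apic_msr_write() asserts that primary VM
++ * writes do not change it.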
++ */ ++ lapic->apic_id = __pkvm_lapic_read(lapic, APIC_ID); ++ ++ return 0; ++} ++ ++static inline bool is_lapic_setup(struct pkvm_pcpu *pcpu) ++{ ++ return !!pcpu->lapic; ++} ++ ++int pkvm_setup_lapic(struct pkvm_pcpu *pcpu, int cpu) ++{ ++ struct pkvm_lapic *lapic = &pkvm_lapic[cpu]; ++ u64 apicbase; ++ ++ /* Nothing needs to be done if already setup */ ++ if (is_lapic_setup(pcpu)) ++ return 0; ++ ++ pkvm_rdmsrl(MSR_IA32_APICBASE, apicbase); ++ ++ pcpu->lapic = lapic; ++ ++ return __pkvm_setup_lapic(lapic, apicbase); ++} ++ ++void pkvm_apic_base_msr_write(struct kvm_vcpu *vcpu, u64 apicbase) ++{ ++ struct pkvm_pcpu *pcpu = to_pkvm_hvcpu(vcpu)->pcpu; ++ struct pkvm_lapic *lapic = pcpu->lapic; ++ ++ /* ++ * MSR is accessed before the init finalizing phase ++ * that pkvm has not setup lapic yet. In this case, let the ++ * wrmsr directly go to the hardware. ++ */ ++ if (!is_lapic_setup(pcpu)) { ++ pkvm_wrmsrl(MSR_IA32_APICBASE, apicbase); ++ return; ++ } ++ ++ /* A fatal error when is running at runtime */ ++ PKVM_ASSERT(__pkvm_setup_lapic(lapic, apicbase) == 0); ++ ++ pkvm_wrmsrl(MSR_IA32_APICBASE, apicbase); ++} ++ ++int pkvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 val) ++{ ++ struct pkvm_pcpu *pcpu = to_pkvm_hvcpu(vcpu)->pcpu; ++ struct pkvm_lapic *lapic = pcpu->lapic; ++ u32 reg = (msr - APIC_BASE_MSR) << 4; ++ ++ /* ++ * MSR is accessed before the init finalizing phase ++ * that pkvm has not setup lapic yet. In this case, let the ++ * wrmsr directly go to the hardware. ++ */ ++ if (!is_lapic_setup(pcpu)) { ++ pkvm_wrmsrl(msr, val); ++ return 0; ++ } ++ ++ /* Ensure lapic is in x2apic mode */ ++ if (!lapic->x2apic) ++ return -EINVAL; ++ ++ switch (reg) { ++ case APIC_ID: ++ /* ++ * Not allow primary VM to modify the lapic ID which ++ * can result in the failure of pkvm to kick. ++ */ ++ PKVM_ASSERT(lapic->apic_id == (u32)val); ++ break; ++ default: ++ break; ++ } ++ ++ pkvm_wrmsrl(msr, val); ++ return 0; ++} ++ ++void pkvm_lapic_send_init(struct pkvm_pcpu *dst_pcpu) ++{ ++ u32 icrlow = APIC_INT_ASSERT | APIC_DM_INIT; ++ int cpu_id = get_pcpu_id(); ++ struct pkvm_pcpu *pcpu = pkvm_hyp->pcpus[cpu_id]; ++ struct pkvm_lapic *dst_lapic = dst_pcpu->lapic; ++ ++ /* Not to send INIT to self */ ++ if (pcpu == dst_pcpu) ++ return; ++ /* ++ * If the lapic is not setup yet, which is during the finalizing ++ * phase, cannot send INIT. Also not necessary to use INIT for tlb ++ * shoot down as when isolating some memory from the primary VM in ++ * the finalizing phase, as we can flush ept tlbs at the end of ++ * finalizing for each CPU. 
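++ * __pkvm_lapic_icr_write() below handles both xAPIC (ICR/ICR2 MMIO)
++ * and x2APIC (ICR MSR) destinations.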
++ */ ++ if (unlikely(!is_lapic_setup(pcpu) || !is_lapic_setup(dst_pcpu))) ++ return; ++ ++ __pkvm_lapic_icr_write(pcpu->lapic, icrlow, dst_lapic->apic_id); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/lapic.h b/arch/x86/kvm/vmx/pkvm/hyp/lapic.h +new file mode 100644 +index 000000000000..d4513afe5c80 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/lapic.h +@@ -0,0 +1,12 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_LAPIC_H_ ++#define _PKVM_LAPIC_H_ ++ ++int pkvm_setup_lapic(struct pkvm_pcpu *pcpu, int cpu); ++void pkvm_apic_base_msr_write(struct kvm_vcpu *vcpu, u64 apicbase); ++int pkvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 val); ++void pkvm_lapic_send_init(struct pkvm_pcpu *dst_pcpu); ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/lib/list_debug.c b/arch/x86/kvm/vmx/pkvm/hyp/lib/list_debug.c +new file mode 100644 +index 000000000000..67f295a4668b +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/lib/list_debug.c +@@ -0,0 +1,16 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ */ ++ ++#include ++ ++bool __list_add_valid(struct list_head *new, struct list_head *prev, ++ struct list_head *next) ++{ ++ return true; ++} ++ ++bool __list_del_entry_valid(struct list_head *entry) ++{ ++ return true; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/lib/memcpy_64.S b/arch/x86/kvm/vmx/pkvm/hyp/lib/memcpy_64.S +new file mode 100644 +index 000000000000..b976f646d352 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/lib/memcpy_64.S +@@ -0,0 +1,26 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* Copyright 2002 Andi Kleen */ ++ ++#include ++ ++/* ++ * memcpy - Copy a memory block. ++ * ++ * Input: ++ * rdi destination ++ * rsi source ++ * rdx count ++ * ++ * Output: ++ * rax original destination ++ * ++ * This is enhanced fast string memcpy. It is faster and ++ * simpler than old memcpy. ++ */ ++ ++SYM_FUNC_START(memcpy) ++ movq %rdi, %rax ++ movq %rdx, %rcx ++ rep movsb ++ RET ++SYM_FUNC_END(memcpy) +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/lib/memset_64.S b/arch/x86/kvm/vmx/pkvm/hyp/lib/memset_64.S +new file mode 100644 +index 000000000000..8c30d2f5f925 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/lib/memset_64.S +@@ -0,0 +1,24 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* Copyright 2002 Andi Kleen, SuSE Labs */ ++ ++#include ++ ++/* ++ * ISO C memset - set a memory block to a byte value. This function uses ++ * enhanced rep stosb to override the fast string function. ++ * The code is simpler and shorter than the fast string function as well. 
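++ * Clobbers rdi and rcx; the original destination pointer is saved in
++ * r9 and returned in rax.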
++ * ++ * rdi destination ++ * rsi value (char) ++ * rdx count (bytes) ++ * ++ * rax original destination ++ */ ++SYM_FUNC_START(memset) ++ movq %rdi,%r9 ++ movb %sil,%al ++ movq %rdx,%rcx ++ rep stosb ++ movq %r9,%rax ++ RET ++SYM_FUNC_END(memset) +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/lib/retpoline.S b/arch/x86/kvm/vmx/pkvm/hyp/lib/retpoline.S +new file mode 100644 +index 000000000000..7758ec40fe7c +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/lib/retpoline.S +@@ -0,0 +1,115 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++ .section .text.__x86.indirect_thunk ++ ++.macro RETPOLINE reg ++ ANNOTATE_INTRA_FUNCTION_CALL ++ call .Ldo_rop_\@ ++.Lspec_trap_\@: ++ UNWIND_HINT_EMPTY ++ pause ++ lfence ++ jmp .Lspec_trap_\@ ++.Ldo_rop_\@: ++ mov %\reg, (%_ASM_SP) ++ UNWIND_HINT_FUNC ++ RET ++.endm ++ ++.macro THUNK reg ++ ++ .align RETPOLINE_THUNK_SIZE ++SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL) ++ UNWIND_HINT_EMPTY ++ ++ RETPOLINE \reg ++ ++.endm ++ ++ .align RETPOLINE_THUNK_SIZE ++SYM_CODE_START(__x86_indirect_thunk_array) ++ ++#define GEN(reg) THUNK reg ++#include ++#undef GEN ++ ++ .align RETPOLINE_THUNK_SIZE ++SYM_CODE_END(__x86_indirect_thunk_array) ++ ++/* ++ * This function name is magical and is used by -mfunction-return=thunk-extern ++ * for the compiler to generate JMPs to it. ++ */ ++#ifdef CONFIG_RETHUNK ++ ++ .section .text.__x86.return_thunk ++ ++/* ++ * Safety details here pertain to the AMD Zen{1,2} microarchitecture: ++ * 1) The RET at __x86_return_thunk must be on a 64 byte boundary, for ++ * alignment within the BTB. ++ * 2) The instruction at zen_untrain_ret must contain, and not ++ * end with, the 0xc3 byte of the RET. ++ * 3) STIBP must be enabled, or SMT disabled, to prevent the sibling thread ++ * from re-poisoning the BTB prediction. ++ */ ++ .align 64 ++ .skip 63, 0xcc ++SYM_FUNC_START_NOALIGN(zen_untrain_ret); ++ ++ /* ++ * As executed from zen_untrain_ret, this is: ++ * ++ * TEST $0xcc, %bl ++ * LFENCE ++ * JMP __x86_return_thunk ++ * ++ * Executing the TEST instruction has a side effect of evicting any BTB ++ * prediction (potentially attacker controlled) attached to the RET, as ++ * __x86_return_thunk + 1 isn't an instruction boundary at the moment. ++ */ ++ .byte 0xf6 ++ ++ /* ++ * As executed from __x86_return_thunk, this is a plain RET. ++ * ++ * As part of the TEST above, RET is the ModRM byte, and INT3 the imm8. ++ * ++ * We subsequently jump backwards and architecturally execute the RET. ++ * This creates a correct BTB prediction (type=ret), but in the ++ * meantime we suffer Straight Line Speculation (because the type was ++ * no branch) which is halted by the INT3. ++ * ++ * With SMT enabled and STIBP active, a sibling thread cannot poison ++ * RET's prediction to a type of its choice, but can evict the ++ * prediction due to competitive sharing. If the prediction is ++ * evicted, __x86_return_thunk will suffer Straight Line Speculation ++ * which will be contained safely by the INT3. ++ */ ++SYM_INNER_LABEL(__x86_return_thunk, SYM_L_GLOBAL) ++ ret ++ int3 ++SYM_CODE_END(__x86_return_thunk) ++ ++ /* ++ * Ensure the TEST decoding / BTB invalidation is complete. ++ */ ++ lfence ++ ++ /* ++ * Jump back and execute the RET in the middle of the TEST instruction. ++ * INT3 is for SLS protection. 
++ */ ++ jmp __x86_return_thunk ++ int3 ++SYM_FUNC_END(zen_untrain_ret) ++ ++#endif /* CONFIG_RETHUNK */ +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/mem_protect.c b/arch/x86/kvm/vmx/pkvm/hyp/mem_protect.c +new file mode 100644 +index 000000000000..5e6ee262fbe8 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/mem_protect.c +@@ -0,0 +1,1013 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#include ++#include ++#include ++#include "pkvm_hyp.h" ++#include "mem_protect.h" ++#include "pgtable.h" ++#include "ept.h" ++ ++struct check_walk_data { ++ int nstate; ++ enum pkvm_page_state *desired; ++}; ++ ++enum pkvm_component_id { ++ PKVM_ID_HOST, ++ PKVM_ID_HYP, ++ PKVM_ID_GUEST, ++}; ++ ++struct pkvm_mem_trans_desc { ++ enum pkvm_component_id id; ++ union { ++ struct { ++ struct pkvm_pgtable *pgt_override; ++ u64 addr; ++ } host; ++ ++ struct { ++ u64 addr; ++ } hyp; ++ ++ struct { ++ struct pkvm_pgtable *pgt; ++ u64 addr; ++ u64 phys; ++ } guest; ++ }; ++ u64 prot; ++}; ++ ++struct pkvm_mem_transition { ++ u64 size; ++ struct pkvm_mem_trans_desc initiator; ++ struct pkvm_mem_trans_desc completer; ++}; ++ ++static void guest_pgstate_pgt_lock(struct pkvm_pgtable *pgt) ++{ ++ pkvm_spin_lock(&pgstate_pgt_to_shadow_vm(pgt)->lock); ++} ++ ++static void guest_pgstate_pgt_unlock(struct pkvm_pgtable *pgt) ++{ ++ pkvm_spin_unlock(&pgstate_pgt_to_shadow_vm(pgt)->lock); ++} ++ ++static u64 pkvm_init_invalid_leaf_owner(pkvm_id owner_id) ++{ ++ /* the page owned by others also means NOPAGE in page state */ ++ return FIELD_PREP(PKVM_INVALID_PTE_OWNER_MASK, owner_id) | ++ FIELD_PREP(PKVM_PAGE_STATE_PROT_MASK, PKVM_NOPAGE); ++} ++ ++static int host_ept_set_owner_locked(struct pkvm_pgtable *pgt_override, phys_addr_t addr, ++ u64 size, pkvm_id owner_id) ++{ ++ u64 annotation = pkvm_init_invalid_leaf_owner(owner_id); ++ ++ /* ++ * The memory [addr, addr + size) will be unmapped from host ept. At the ++ * same time, the annotation with a NOPAGE flag will be put in the ++ * invalid pte that has been unmapped. And the information shows that ++ * the page has been used by some guest and its id can be read from ++ * annotation. Also when later these pages are back to host, the annotation ++ * will be helpful to check the right page transition. ++ */ ++ return pkvm_pgtable_annotate(pgt_override ? pgt_override : pkvm_hyp->host_vm.ept, ++ addr, size, annotation); ++} ++ ++static int host_ept_create_idmap_locked(struct pkvm_pgtable *pgt_override, u64 addr, ++ u64 size, int pgsz_mask, u64 prot) ++{ ++ return pkvm_pgtable_map(pgt_override ? 
pgt_override : pkvm_hyp->host_vm.ept, ++ addr, addr, size, pgsz_mask, prot, NULL); ++} ++ ++static int ++__check_page_state_walker(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long vaddr_end, int level, void *ptep, ++ unsigned long flags, struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct check_walk_data *data = arg; ++ int i; ++ ++ for (i = 0; i < data->nstate; i++) ++ if (pkvm_getstate(*(u64 *)ptep) == data->desired[i]) ++ return 0; ++ ++ return -EPERM; ++} ++ ++static int check_page_state_range(struct pkvm_pgtable *pgt, u64 addr, u64 size, ++ enum pkvm_page_state *states, int nstate) ++{ ++ struct check_walk_data data = { ++ .nstate = nstate, ++ .desired = states, ++ }; ++ struct pkvm_pgtable_walker walker = { ++ .cb = __check_page_state_walker, ++ .flags = PKVM_PGTABLE_WALK_LEAF, ++ .arg = &data, ++ }; ++ ++ return pgtable_walk(pgt, addr, size, true, &walker); ++} ++ ++static int __host_check_page_state_range(struct pkvm_pgtable *pgt_override, u64 addr, ++ u64 size, enum pkvm_page_state state) ++{ ++ struct pkvm_pgtable *host_ept = pgt_override ? pgt_override : pkvm_hyp->host_vm.ept; ++ ++ return check_page_state_range(host_ept, addr, size, &state, 1); ++} ++ ++static int __guest_check_page_state_range(struct pkvm_pgtable *pgt, ++ u64 addr, u64 size, ++ enum pkvm_page_state state) ++{ ++ return check_page_state_range(pgt, addr, size, &state, 1); ++} ++ ++static pkvm_id pkvm_guest_id(struct pkvm_pgtable *pgt) ++{ ++ /* Using the shadow_vm_handle as guest_id. */ ++ return pgstate_pgt_to_shadow_vm(pgt)->shadow_vm_handle; ++} ++ ++static pkvm_id __pkvm_owner_id(const struct pkvm_mem_trans_desc *desc) ++{ ++ switch (desc->id) { ++ case PKVM_ID_HYP: ++ return pkvm_hyp_id; ++ case PKVM_ID_GUEST: ++ return pkvm_guest_id(desc->guest.pgt); ++ default: ++ WARN_ON(1); ++ return -1; ++ } ++} ++ ++static pkvm_id initiator_owner_id(const struct pkvm_mem_transition *tx) ++{ ++ return __pkvm_owner_id(&tx->initiator); ++} ++ ++static pkvm_id completer_owner_id(const struct pkvm_mem_transition *tx) ++{ ++ return __pkvm_owner_id(&tx->completer); ++} ++ ++static int host_request_donation(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.host.addr; ++ u64 size = tx->size; ++ ++ return __host_check_page_state_range(tx->initiator.host.pgt_override, ++ addr, size, PKVM_PAGE_OWNED); ++} ++ ++static int guest_request_donation(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.guest.addr; ++ u64 size = tx->size; ++ enum pkvm_page_state states[] = { PKVM_PAGE_OWNED, ++ PKVM_PAGE_SHARED_OWNED, ++ }; ++ ++ /* ++ * When destroying vm, there may be multiple page states in the guest ++ * pgstate ept. In such case, both page states are ok to be reclaimed ++ * back by host. ++ */ ++ return check_page_state_range(tx->initiator.guest.pgt, ++ addr, size, states, ARRAY_SIZE(states)); ++} ++ ++static int host_ack_donation(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.host.addr; ++ u64 size = tx->size; ++ enum pkvm_page_state states[] = { PKVM_NOPAGE, ++ PKVM_PAGE_SHARED_BORROWED, ++ }; ++ struct pkvm_pgtable *host_ept = tx->completer.host.pgt_override ? ++ tx->completer.host.pgt_override : ++ pkvm_hyp->host_vm.ept; ++ ++ /* Same as guest_request_donation. 
*/ ++ return check_page_state_range(host_ept, addr, size, states, ARRAY_SIZE(states)); ++} ++ ++static int guest_ack_donation(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.guest.addr; ++ u64 size = tx->size; ++ ++ return __guest_check_page_state_range(tx->completer.guest.pgt, addr, ++ size, PKVM_NOPAGE); ++} ++ ++static int check_donation(const struct pkvm_mem_transition *tx) ++{ ++ int ret; ++ ++ switch (tx->initiator.id) { ++ case PKVM_ID_HOST: ++ ret = host_request_donation(tx); ++ break; ++ case PKVM_ID_HYP: ++ ret = 0; ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_request_donation(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret) ++ return ret; ++ ++ switch (tx->completer.id) { ++ case PKVM_ID_HOST: ++ ret = host_ack_donation(tx); ++ break; ++ case PKVM_ID_HYP: ++ ret = 0; ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_ack_donation(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++ ++static int host_initiate_donation(const struct pkvm_mem_transition *tx) ++{ ++ pkvm_id owner_id = completer_owner_id(tx); ++ u64 addr = tx->initiator.host.addr; ++ u64 size = tx->size; ++ ++ return host_ept_set_owner_locked(tx->initiator.host.pgt_override, addr, size, owner_id); ++} ++ ++static int guest_initiate_donation(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.guest.addr; ++ u64 phys = tx->initiator.guest.phys; ++ u64 size = tx->size; ++ ++ return pkvm_pgtable_unmap_safe(tx->initiator.guest.pgt, addr, phys, size, NULL); ++} ++ ++static int host_complete_donation(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.host.addr; ++ u64 size = tx->size; ++ u64 prot = pkvm_mkstate(tx->completer.prot, PKVM_PAGE_OWNED); ++ ++ return host_ept_create_idmap_locked(tx->completer.host.pgt_override, addr, size, 0, prot); ++} ++ ++static int guest_complete_donation(const struct pkvm_mem_transition *tx) ++{ ++ struct pkvm_pgtable *pgt = tx->completer.guest.pgt; ++ u64 addr = tx->completer.guest.addr; ++ u64 size = tx->size; ++ u64 phys = tx->completer.guest.phys; ++ u64 prot = tx->completer.prot; ++ ++ prot = pkvm_mkstate(prot, PKVM_PAGE_OWNED); ++ return pkvm_pgtable_map(pgt, addr, phys, size, 0, prot, NULL); ++} ++ ++static int __do_donate(const struct pkvm_mem_transition *tx) ++{ ++ int ret; ++ ++ switch (tx->initiator.id) { ++ case PKVM_ID_HOST: ++ ret = host_initiate_donation(tx); ++ break; ++ case PKVM_ID_HYP: ++ ret = 0; ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_initiate_donation(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret) ++ return ret; ++ ++ switch (tx->completer.id) { ++ case PKVM_ID_HOST: ++ ret = host_complete_donation(tx); ++ break; ++ case PKVM_ID_HYP: ++ ret = 0; ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_complete_donation(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++ ++/* ++ * do_donate - the page owner transfer ownership to another component. ++ * ++ * Initiator: OWNED => NO_PAGE ++ * Completer: NO_APGE => OWNED ++ * ++ * The special component is pkvm_hyp. Since pkvm_hyp can access all the ++ * memory, nothing needs to be done if the page owner is transferred to hyp or ++ * hyp transfers the ownership to other entities. 
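++ *
++ * check_donation() validates the page state on both ends before
++ * __do_donate() touches any mapping, so a failed check leaves both
++ * page tables unchanged.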
++ */ ++static int do_donate(const struct pkvm_mem_transition *donation) ++{ ++ int ret; ++ ++ ret = check_donation(donation); ++ if (ret) ++ return ret; ++ ++ return WARN_ON(__do_donate(donation)); ++} ++ ++int __pkvm_host_donate_hyp(u64 hpa, u64 size) ++{ ++ int ret; ++ u64 hyp_addr = (u64)__pkvm_va(hpa); ++ struct pkvm_mem_transition donation = { ++ .size = size, ++ .initiator = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .addr = hpa, ++ }, ++ }, ++ .completer = { ++ .id = PKVM_ID_HYP, ++ .hyp = { ++ .addr = hyp_addr, ++ }, ++ }, ++ }; ++ ++ host_ept_lock(); ++ ++ ret = do_donate(&donation); ++ ++ host_ept_unlock(); ++ ++ return ret; ++} ++ ++int __pkvm_hyp_donate_host(u64 hpa, u64 size) ++{ ++ int ret; ++ u64 hyp_addr = (u64)__pkvm_va(hpa); ++ struct pkvm_mem_transition donation = { ++ .size = size, ++ .initiator = { ++ .id = PKVM_ID_HYP, ++ .hyp = { ++ .addr = hyp_addr, ++ }, ++ }, ++ .completer = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .addr = hpa, ++ }, ++ .prot = HOST_EPT_DEF_MEM_PROT, ++ }, ++ }; ++ ++ host_ept_lock(); ++ ++ ret = do_donate(&donation); ++ ++ host_ept_unlock(); ++ ++ return ret; ++} ++ ++int __pkvm_host_donate_guest(u64 hpa, struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 size, u64 prot) ++{ ++ int ret; ++ struct pkvm_mem_transition donation = { ++ .size = size, ++ .initiator = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .addr = hpa, ++ }, ++ }, ++ .completer = { ++ .id = PKVM_ID_GUEST, ++ .guest = { ++ .pgt = guest_pgt, ++ .addr = gpa, ++ .phys = hpa, ++ }, ++ .prot = prot, ++ }, ++ }; ++ ++ host_ept_lock(); ++ ++ ret = do_donate(&donation); ++ ++ host_ept_unlock(); ++ ++ return ret; ++} ++ ++/* ++ * Fastpath interface will use the host EPT instance without doing tlbflushing ++ * to have a better performance. It is usually used in the scenario that caller ++ * needs to change a bunch of pages' state without having the TLB flushing ++ * overhead in the each iteration, but caller still needs to do TLB flushing ++ * after completing all the iterations. 
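++ *
++ * The only difference from __pkvm_host_donate_guest() is that the
++ * initiator overrides the host EPT with the ept_notlbflush instance.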
++ */ ++int __pkvm_host_donate_guest_fastpath(u64 hpa, struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 size, u64 prot) ++{ ++ int ret; ++ struct pkvm_mem_transition donation = { ++ .size = size, ++ .initiator = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .pgt_override = pkvm_hyp->host_vm.ept_notlbflush, ++ .addr = hpa, ++ }, ++ }, ++ .completer = { ++ .id = PKVM_ID_GUEST, ++ .guest = { ++ .pgt = guest_pgt, ++ .addr = gpa, ++ .phys = hpa, ++ }, ++ .prot = prot, ++ }, ++ }; ++ ++ host_ept_lock(); ++ ++ ret = do_donate(&donation); ++ ++ host_ept_unlock(); ++ ++ return ret; ++} ++ ++int __pkvm_host_undonate_guest(u64 hpa, struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 size) ++{ ++ int ret; ++ struct pkvm_mem_transition donation = { ++ .size = size, ++ .initiator = { ++ .id = PKVM_ID_GUEST, ++ .guest = { ++ .addr = gpa, ++ .phys = hpa, ++ .pgt = guest_pgt, ++ }, ++ }, ++ .completer = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .addr = hpa, ++ }, ++ .prot = HOST_EPT_DEF_MEM_PROT, ++ }, ++ }; ++ ++ host_ept_lock(); ++ ++ ret = do_donate(&donation); ++ ++ host_ept_unlock(); ++ ++ return ret; ++} ++ ++static int host_request_share(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.host.addr; ++ u64 size = tx->size; ++ ++ return __host_check_page_state_range(tx->initiator.host.pgt_override, ++ addr, size, PKVM_PAGE_OWNED); ++} ++ ++static int guest_request_share(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.guest.addr; ++ u64 size = tx->size; ++ ++ return __guest_check_page_state_range(tx->initiator.guest.pgt, ++ addr, size, PKVM_PAGE_OWNED); ++} ++ ++static int host_ack_share(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.host.addr; ++ u64 size = tx->size; ++ ++ return __host_check_page_state_range(tx->completer.host.pgt_override, ++ addr, size, PKVM_NOPAGE); ++} ++ ++static int guest_ack_share(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.guest.addr; ++ u64 size = tx->size; ++ ++ return __guest_check_page_state_range(tx->completer.guest.pgt, addr, ++ size, PKVM_NOPAGE); ++} ++ ++static int check_share(const struct pkvm_mem_transition *tx) ++{ ++ int ret; ++ ++ switch (tx->initiator.id) { ++ case PKVM_ID_HOST: ++ ret = host_request_share(tx); ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_request_share(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret) ++ return ret; ++ ++ switch (tx->completer.id) { ++ case PKVM_ID_HOST: ++ ret = host_ack_share(tx); ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_ack_share(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++ ++static int host_initiate_share(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.host.addr; ++ u64 size = tx->size; ++ u64 prot = pkvm_mkstate(tx->initiator.prot, PKVM_PAGE_SHARED_OWNED); ++ ++ return host_ept_create_idmap_locked(tx->initiator.host.pgt_override, addr, size, 0, prot); ++} ++ ++static int guest_initiate_share(const struct pkvm_mem_transition *tx) ++{ ++ struct pkvm_pgtable *pgt = tx->initiator.guest.pgt; ++ u64 addr = tx->initiator.guest.addr; ++ u64 phys = tx->initiator.guest.phys; ++ u64 size = tx->size; ++ u64 prot = pkvm_mkstate(tx->initiator.prot, PKVM_PAGE_SHARED_OWNED); ++ ++ return pkvm_pgtable_map(pgt, addr, phys, size, 0, prot, NULL); ++} ++ ++static int host_complete_share(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.host.addr; ++ u64 size = tx->size; ++ u64 prot = pkvm_mkstate(tx->completer.prot, PKVM_PAGE_SHARED_BORROWED); ++ ++ return 
host_ept_create_idmap_locked(tx->completer.host.pgt_override, addr, size, 0, prot); ++} ++ ++static int guest_complete_share(const struct pkvm_mem_transition *tx) ++{ ++ struct pkvm_pgtable *pgt = tx->completer.guest.pgt; ++ u64 addr = tx->completer.guest.addr; ++ u64 size = tx->size; ++ u64 phys = tx->completer.guest.phys; ++ u64 prot = tx->completer.prot; ++ ++ prot = pkvm_mkstate(prot, PKVM_PAGE_SHARED_BORROWED); ++ return pkvm_pgtable_map(pgt, addr, phys, size, 0, prot, NULL); ++} ++ ++static int __do_share(const struct pkvm_mem_transition *tx) ++{ ++ int ret; ++ ++ switch (tx->initiator.id) { ++ case PKVM_ID_HOST: ++ ret = host_initiate_share(tx); ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_initiate_share(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret) ++ return ret; ++ ++ switch (tx->completer.id) { ++ case PKVM_ID_HOST: ++ ret = host_complete_share(tx); ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_complete_share(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++ ++/* ++ * do_share() - The page owner grants access to another component with a given ++ * set of permissions. ++ * ++ * Initiator: OWNED => SHARED_OWNED ++ * Completer: NOPAGE => SHARED_BORROWED ++ */ ++static int do_share(const struct pkvm_mem_transition *share) ++{ ++ int ret; ++ ++ ret = check_share(share); ++ if (ret) ++ return ret; ++ ++ return WARN_ON(__do_share(share)); ++} ++ ++int __pkvm_host_share_guest(u64 hpa, struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 size, u64 prot) ++{ ++ int ret; ++ struct pkvm_mem_transition share = { ++ .size = size, ++ .initiator = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .addr = hpa, ++ }, ++ .prot = HOST_EPT_DEF_MEM_PROT, ++ }, ++ .completer = { ++ .id = PKVM_ID_GUEST, ++ .guest = { ++ .pgt = guest_pgt, ++ .addr = gpa, ++ .phys = hpa, ++ }, ++ .prot = prot, ++ }, ++ }; ++ ++ host_ept_lock(); ++ ++ ret = do_share(&share); ++ ++ host_ept_unlock(); ++ ++ return ret; ++} ++ ++static int __pkvm_guest_share_host_page(struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 hpa, u64 guest_prot) ++{ ++ struct pkvm_mem_transition share = { ++ .size = PAGE_SIZE, ++ .initiator = { ++ .id = PKVM_ID_GUEST, ++ .guest = { ++ .pgt = guest_pgt, ++ .addr = gpa, ++ .phys = hpa, ++ }, ++ .prot = guest_prot, ++ }, ++ .completer = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .addr = hpa, ++ }, ++ .prot = HOST_EPT_DEF_MEM_PROT, ++ }, ++ }; ++ ++ return do_share(&share); ++} ++ ++int __pkvm_guest_share_host(struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 size) ++{ ++ unsigned long hpa; ++ u64 prot; ++ int ret = 0; ++ ++ if (!PAGE_ALIGNED(size)) ++ return -EINVAL; ++ ++ guest_pgstate_pgt_lock(guest_pgt); ++ host_ept_lock(); ++ ++ while (size) { ++ pkvm_pgtable_lookup(guest_pgt, gpa, &hpa, &prot, NULL); ++ if (hpa == INVALID_ADDR) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ ret = __pkvm_guest_share_host_page(guest_pgt, gpa, hpa, prot); ++ if (ret) ++ break; ++ ++ size -= PAGE_SIZE; ++ gpa += PAGE_SIZE; ++ } ++ ++ host_ept_unlock(); ++ guest_pgstate_pgt_unlock(guest_pgt); ++ ++ ++ return ret; ++} ++ ++static int host_request_unshare(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.host.addr; ++ u64 size = tx->size; ++ ++ return __host_check_page_state_range(tx->initiator.host.pgt_override, addr, ++ size, PKVM_PAGE_SHARED_OWNED); ++} ++ ++static int guest_request_unshare(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.guest.addr; ++ u64 size = tx->size; ++ ++ return __guest_check_page_state_range(tx->initiator.guest.pgt, ++ addr, size, 
PKVM_PAGE_SHARED_OWNED); ++} ++ ++static int host_ack_unshare(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.host.addr; ++ u64 size = tx->size; ++ ++ return __host_check_page_state_range(tx->completer.host.pgt_override, addr, ++ size, PKVM_PAGE_SHARED_BORROWED); ++} ++ ++static int guest_ack_unshare(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.guest.addr; ++ u64 size = tx->size; ++ ++ return __guest_check_page_state_range(tx->completer.guest.pgt, addr, ++ size, PKVM_PAGE_SHARED_BORROWED); ++} ++ ++int check_unshare(const struct pkvm_mem_transition *tx) ++{ ++ int ret; ++ ++ switch (tx->initiator.id) { ++ case PKVM_ID_HOST: ++ ret = host_request_unshare(tx); ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_request_unshare(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret) ++ return ret; ++ ++ switch (tx->completer.id) { ++ case PKVM_ID_HOST: ++ ret = host_ack_unshare(tx); ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_ack_unshare(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++ ++static int host_initiate_unshare(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.host.addr; ++ u64 size = tx->size; ++ u64 prot = pkvm_mkstate(tx->initiator.prot, PKVM_PAGE_OWNED); ++ ++ return host_ept_create_idmap_locked(tx->initiator.host.pgt_override, addr, size, 0, prot); ++} ++ ++static int guest_initiate_unshare(const struct pkvm_mem_transition *tx) ++{ ++ struct pkvm_pgtable *pgt = tx->initiator.guest.pgt; ++ u64 addr = tx->initiator.guest.addr; ++ u64 phys = tx->initiator.guest.phys; ++ u64 size = tx->size; ++ u64 prot = pkvm_mkstate(tx->initiator.prot, PKVM_PAGE_OWNED); ++ ++ return pkvm_pgtable_map(pgt, addr, phys, size, 0, prot, NULL); ++} ++ ++static int host_complete_unshare(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.host.addr; ++ u64 size = tx->size; ++ u64 owner_id = initiator_owner_id(tx); ++ ++ return host_ept_set_owner_locked(tx->completer.host.pgt_override, addr, size, owner_id); ++} ++ ++static int guest_complete_unshare(const struct pkvm_mem_transition *tx) ++{ ++ struct pkvm_pgtable *pgt = tx->completer.guest.pgt; ++ u64 addr = tx->completer.guest.addr; ++ u64 phys = tx->completer.guest.phys; ++ u64 size = tx->size; ++ ++ return pkvm_pgtable_unmap_safe(pgt, addr, phys, size, NULL); ++} ++ ++static int __do_unshare(struct pkvm_mem_transition *tx) ++{ ++ int ret; ++ ++ switch (tx->initiator.id) { ++ case PKVM_ID_HOST: ++ ret = host_initiate_unshare(tx); ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_initiate_unshare(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret) ++ return ret; ++ ++ switch (tx->completer.id) { ++ case PKVM_ID_HOST: ++ ret = host_complete_unshare(tx); ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_complete_unshare(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++ ++/* ++ * do_unshare() - The page owner takes back the page access for another ++ * component. 
++ * ++ * Initiator: SHARED_OWNED => OWNED ++ * Completer: SHARED_BORROWED => NOPAGE ++ */ ++int do_unshare(struct pkvm_mem_transition *share) ++{ ++ int ret; ++ ++ ret = check_unshare(share); ++ if (ret) ++ return ret; ++ ++ return WARN_ON(__do_unshare(share)); ++} ++ ++int __pkvm_host_unshare_guest(u64 hpa, struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 size) ++{ ++ int ret; ++ struct pkvm_mem_transition share = { ++ .size = size, ++ .initiator = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .addr = hpa, ++ }, ++ .prot = HOST_EPT_DEF_MEM_PROT, ++ }, ++ .completer = { ++ .id = PKVM_ID_GUEST, ++ .guest = { ++ .pgt = guest_pgt, ++ .addr = gpa, ++ .phys = hpa, ++ }, ++ }, ++ }; ++ ++ host_ept_lock(); ++ ++ ret = do_unshare(&share); ++ ++ host_ept_unlock(); ++ ++ return ret; ++} ++ ++static int __pkvm_guest_unshare_host_page(struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 hpa, u64 guest_prot) ++{ ++ struct pkvm_mem_transition share = { ++ .size = PAGE_SIZE, ++ .initiator = { ++ .id = PKVM_ID_GUEST, ++ .guest = { ++ .pgt = guest_pgt, ++ .addr = gpa, ++ .phys = hpa, ++ }, ++ .prot = guest_prot, ++ }, ++ .completer = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .addr = hpa, ++ }, ++ }, ++ }; ++ ++ return do_unshare(&share); ++} ++ ++int __pkvm_guest_unshare_host(struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 size) ++{ ++ unsigned long hpa; ++ u64 prot; ++ int ret = 0; ++ ++ guest_pgstate_pgt_lock(guest_pgt); ++ host_ept_lock(); ++ ++ while (size) { ++ pkvm_pgtable_lookup(guest_pgt, gpa, &hpa, &prot, NULL); ++ if (hpa == INVALID_ADDR) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ ret = __pkvm_guest_unshare_host_page(guest_pgt, gpa, hpa, prot); ++ if (ret) ++ break; ++ ++ size -= PAGE_SIZE; ++ gpa += PAGE_SIZE; ++ } ++ ++ host_ept_unlock(); ++ guest_pgstate_pgt_unlock(guest_pgt); ++ ++ return ret; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/mem_protect.h b/arch/x86/kvm/vmx/pkvm/hyp/mem_protect.h +new file mode 100644 +index 000000000000..f71c55c46d3a +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/mem_protect.h +@@ -0,0 +1,205 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef __PKVM_MEM_PROTECT_H__ ++#define __PKVM_MEM_PROTECT_H__ ++ ++/* ++ * enum pkvm_pgtable_prot - The ignored bits in page-table. ++ * pkvm will use these ignored bits as software bits to ++ * identify the page status. ++ */ ++enum pkvm_pgtable_prot { ++ PKVM_PGTABLE_PROT_SW0 = BIT(56), ++ PKVM_PGTABLE_PROT_SW1 = BIT(57), ++}; ++ ++/* ++ * Using the ignored bits in page-table as SW bits. ++ * SW bits 0-1 are used to track the memory ownership state of each page: ++ * 00: The page has no mapping in page table (also invalid pte). And under ++ * this page state, host ept is using the pte ignored bits to record owner_id. ++ * 01: The page is owned exclusively by the page-table owner. ++ * 10: The page is owned by the page-table owner, but is shared ++ * with another entity. ++ * 11: The page is shared with, but not owned by the page-table owner. 
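++ *
++ * These encodings correspond to PKVM_NOPAGE, PKVM_PAGE_OWNED,
++ * PKVM_PAGE_SHARED_OWNED and PKVM_PAGE_SHARED_BORROWED in
++ * enum pkvm_page_state below.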
++ */
++enum pkvm_page_state {
++ PKVM_NOPAGE = 0ULL,
++ PKVM_PAGE_OWNED = PKVM_PGTABLE_PROT_SW0,
++ PKVM_PAGE_SHARED_OWNED = PKVM_PGTABLE_PROT_SW1,
++ PKVM_PAGE_SHARED_BORROWED = PKVM_PGTABLE_PROT_SW0 |
++ PKVM_PGTABLE_PROT_SW1,
++};
++
++#define PKVM_PAGE_STATE_PROT_MASK (PKVM_PGTABLE_PROT_SW0 | PKVM_PGTABLE_PROT_SW1)
++/* use 20 bits[12~31] - no conflict w/ low 12 bits pte prot */
++#define PKVM_INVALID_PTE_OWNER_MASK GENMASK(31, 12)
++
++static inline u64 pkvm_mkstate(u64 prot, enum pkvm_page_state state)
++{
++ return (prot & ~PKVM_PAGE_STATE_PROT_MASK) | state;
++}
++
++static inline enum pkvm_page_state pkvm_getstate(u64 pte)
++{
++ return pte & PKVM_PAGE_STATE_PROT_MASK;
++}
++
++typedef u32 pkvm_id;
++static const pkvm_id pkvm_hyp_id = 0;
++
++/*
++ * __pkvm_host_donate_hyp() - Donate pages from host to hyp; afterwards the
++ * host cannot access the donated pages.
++ *
++ * @hpa: Start hpa of the pages being donated, must be contiguous.
++ * @size: The size of memory to be donated.
++ *
++ * The range of pages [hpa, hpa + size) is donated from host to hyp: the pages
++ * are unmapped from the host ept and the page owner is set to hyp_id in the
++ * host ept pte. Nothing needs to be done for the hyp mmu, as the hyp mmu can
++ * access all memory by default, but modifying the host ept is necessary
++ * because a page used by pkvm is private and can't be accessed by the host.
++ */
++int __pkvm_host_donate_hyp(u64 hpa, u64 size);
++
++/*
++ * __pkvm_hyp_donate_host() - Donate pages from hyp to host, so the host can
++ * access these pages again.
++ *
++ * @hpa: Start hpa of the pages being donated, must be contiguous.
++ * @size: The size of memory to be donated.
++ *
++ * The range of pages [hpa, hpa + size) is donated from hyp to host: a mapping
++ * for these pages is created in the host ept, and nothing is done for the hyp
++ * mmu. This is paired with __pkvm_host_donate_hyp() and is how the host
++ * reclaims such pages.
++ */
++int __pkvm_hyp_donate_host(u64 hpa, u64 size);
++
++/*
++ * __pkvm_host_share_guest() - Share pages between host and guest. The host
++ * still owns the pages and the guest gets temporary access to them.
++ *
++ * @hpa: Start hpa of the pages being shared, must be contiguous.
++ * @guest_pgt: The guest ept pagetable.
++ * @gpa: Start gpa that will be used for mapping into the guest ept.
++ * @size: The size of pages to be shared.
++ * @prot: The prot that will be used for creating the mapping in the guest ept.
++ *
++ * For the range of pages [hpa, hpa + size), the page state in the host ept is
++ * changed from PAGE_OWNED to PAGE_SHARED_OWNED. A mapping from gpa to hpa is
++ * created in the guest ept, using @prot together with PAGE_SHARED_BORROWED.
++ */
++int __pkvm_host_share_guest(u64 hpa, struct pkvm_pgtable *guest_pgt,
++ u64 gpa, u64 size, u64 prot);
++
++/*
++ * __pkvm_host_unshare_guest() - Host unshares pages that were previously
++ * shared with the guest. The guest will no longer be able to access them.
++ *
++ * @hpa: Start hpa of the shared pages, must be contiguous.
++ * @guest_pgt: The guest ept pagetable.
++ * @gpa: Start gpa at which the shared pages are mapped in the guest ept.
++ * @size: The size of pages to be unshared.
++ *
++ * Unmap the range [gpa, gpa + size) from the guest ept pagetable, and change
++ * the page state from PAGE_SHARED_OWNED to PAGE_OWNED in the host ept.
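++ *
++ * Hypothetical usage sketch (hpa, gpa and guest_pgt are placeholder values,
++ * not taken from this patch), undoing a one-page __pkvm_host_share_guest():
++ *
++ *  if (__pkvm_host_unshare_guest(hpa, guest_pgt, gpa, PAGE_SIZE))
++ *      pkvm_err("unshare failed, page is still shared with the guest");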
++ */
++int __pkvm_host_unshare_guest(u64 hpa, struct pkvm_pgtable *guest_pgt,
++ u64 gpa, u64 size);
++
++/*
++ * __pkvm_host_donate_guest() - Host donates pages to the guest. Afterwards the
++ * host can't access these pages and the guest can.
++ *
++ * @hpa: Start hpa of the pages being donated, must be contiguous.
++ * @guest_pgt: The guest ept pagetable.
++ * @gpa: Start gpa at which the donated pages will be mapped in the guest ept.
++ * @size: The size of pages being donated.
++ * @prot: The prot that will be used for creating the mapping in the guest ept.
++ *
++ * The range of pages [hpa, hpa + size) is donated from host to guest: the
++ * pages are unmapped from the host ept and the page owner is set to guest_id
++ * in the host ept pte. The guest_id is equal to the vm's shadow_handle+1. At
++ * the same time, a mapping gpa -> hpa of @size is created in the guest ept
++ * with @prot.
++ */
++int __pkvm_host_donate_guest(u64 hpa, struct pkvm_pgtable *guest_pgt,
++ u64 gpa, u64 size, u64 prot);
++
++/*
++ * __pkvm_host_donate_guest_fastpath() - Similar to __pkvm_host_donate_guest() but
++ * uses the fastpath that sets an annotation in the host EPT to donate a page.
++ * The fastpath does not flush the TLB when it unmaps from the host EPT. It is
++ * meant for callers that donate a batch of pages and issue a single TLB flush
++ * afterwards, which improves performance. The caller must guarantee that
++ * deferring the TLB flush until after the donations does not open a window in
++ * which the host can steal data from the donated pages.
++ */
++int __pkvm_host_donate_guest_fastpath(u64 hpa, struct pkvm_pgtable *guest_pgt,
++ u64 gpa, u64 size, u64 prot);
++
++/*
++ * __pkvm_host_undonate_guest() - Host reclaims pages previously donated to the
++ * guest. Afterwards the guest can't access these pages and the host can.
++ *
++ * @hpa: Start hpa of the donated pages, must be contiguous.
++ * @guest_pgt: The guest ept pagetable.
++ * @gpa: Start gpa of the donated pages that will be unmapped from the guest ept.
++ * @size: The size of pages to be reclaimed.
++ *
++ * The range of pages [hpa, hpa + size) is given back from guest to host: the
++ * pages [gpa, gpa + size) are unmapped from the guest ept and, at the same
++ * time, an identity mapping for hpa is created in the host ept.
++ */
++int __pkvm_host_undonate_guest(u64 hpa, struct pkvm_pgtable *guest_pgt,
++ u64 gpa, u64 size);
++/*
++ * __pkvm_guest_share_host() - Guest shares pages with the host. The guest
++ * still owns the pages and the host gets temporary access to them.
++ *
++ * @guest_pgt: The guest ept pagetable.
++ * @gpa: Start gpa of the pages being shared, must be contiguous.
++ * @size: The size of pages to be shared, should be PAGE_ALIGNED.
++ *
++ * There is no hpa parameter because the caller does not know it; the hpa is
++ * found by looking it up in the guest ept.
++ *
++ * Currently the function shares one PAGE at a time: if the size is larger than
++ * PAGE_SIZE, the range is split into PAGE_SIZE chunks that are shared in a
++ * loop.
++ *
++ * For the range of pages [gpa, gpa + size), the page state in the guest ept is
++ * changed from PAGE_OWNED to PAGE_SHARED_OWNED. A mapping for the corresponding
++ * hpa is created in the host ept with page state PAGE_SHARED_BORROWED.
++ */
++int __pkvm_guest_share_host(struct pkvm_pgtable *guest_pgt,
++ u64 gpa, u64 size);
++
++/*
++ * __pkvm_guest_unshare_host() - Guest reclaims pages previously shared with the host.
++ * Then host can't access these pages and guest still ownes it. ++ * ++ * @guest_pgt: The guest ept pagetable. ++ * @gpa: Start gpa of being unshared pages, must be continuous. ++ * @size: The size of pages to be unshared, should be PAGE_ALIGNED. ++ * ++ * The parameter does not have hpa, as the caller does not know it. The hpa ++ * depends on looking up the guest ept to get it. ++ * ++ * Now the function will unshare one PAGE at a time. If the size is larger than ++ * PAGE_SIZE, it will split it into multiple PAGE_SIZE pages and unshare them ++ * using a loop. ++ * ++ * A range of pages [gpa, gpa + size) in guest ept that its page state will be ++ * modified from PAGE_SHARED_OWNED to PAGE_OWNED. The mapping for these ++ * pages in host ept will be unmapped and the owner_id will be set to guest_id. ++ */ ++int __pkvm_guest_unshare_host(struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 size); ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/memory.c b/arch/x86/kvm/vmx/pkvm/hyp/memory.c +new file mode 100644 +index 000000000000..94e458cf8d1d +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/memory.c +@@ -0,0 +1,363 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++ ++#include ++#include "memory.h" ++#include "pgtable.h" ++#include "pkvm_hyp.h" ++#include "cpu.h" ++ ++unsigned long __page_base_offset; ++unsigned long __symbol_base_offset; ++unsigned long __x86_clflush_size; ++static u8 max_physaddr_bits; ++ ++unsigned int pkvm_memblock_nr; ++struct memblock_region pkvm_memory[PKVM_MEMBLOCK_REGIONS]; ++ ++void *pkvm_iophys_to_virt(unsigned long phys) ++{ ++ unsigned long iova = PKVM_IOVA_OFFSET + phys; ++ ++ if (iova >= __page_base_offset) ++ return (void *)INVALID_ADDR; ++ ++ return (void *)iova; ++} ++ ++void *pkvm_phys_to_virt(unsigned long phys) ++{ ++ return (void *)__page_base_offset + phys; ++} ++ ++unsigned long pkvm_virt_to_phys(void *virt) ++{ ++ /* this api only take care direct & io mapping */ ++ if ((unsigned long)virt < PKVM_IOVA_OFFSET) ++ return INVALID_ADDR; ++ ++ return ((unsigned long)virt >= __page_base_offset) ? ++ (unsigned long)virt - __page_base_offset : ++ (unsigned long)virt - PKVM_IOVA_OFFSET; ++} ++ ++unsigned long pkvm_virt_to_symbol_phys(void *virt) ++{ ++ return (unsigned long)virt - __symbol_base_offset; ++} ++ ++void *host_gpa2hva(unsigned long gpa) ++{ ++ /* host gpa = hpa */ ++ return pkvm_phys_to_virt(gpa); ++} ++ ++unsigned long host_gpa2hpa(unsigned long gpa) ++{ ++ /* Host VM is using identity mapping so GPA == HPA */ ++ return gpa; ++} ++ ++void *host_mmio2hva(unsigned long gpa) ++{ ++ return pkvm_iophys_to_virt(gpa); ++} ++ ++extern struct pkvm_pgtable_ops mmu_ops; ++static struct pkvm_mm_ops mm_ops = { ++ .phys_to_virt = host_gpa2hva, ++}; ++ ++static int check_translation(struct kvm_vcpu *vcpu, gva_t gva, gpa_t gpa, ++ u64 prot, u32 access, struct x86_exception *exception) ++{ ++ u16 errcode = 0; ++ bool page_rw_flags_on = true; ++ bool user_mode_addr = true; ++ const int user_mode_access = access & PFERR_USER_MASK; ++ const int write_access = access & PFERR_WRITE_MASK; ++ bool cr4_smap = vmcs_readl(GUEST_CR4) & X86_CR4_SMAP; ++ bool cr0_wp = vmcs_readl(GUEST_CR0) & X86_CR0_WP; ++ ++ /* ++ * As pkvm hypervisor will not do instruction emulation, here we do not ++ * expect guest memory access for instruction fetch. 
++ */ ++ WARN_ON(access & PFERR_FETCH_MASK); ++ ++ /* pte is not present */ ++ if (gpa == INVALID_ADDR) { ++ goto check_fault; ++ } else { ++ errcode |= PFERR_PRESENT_MASK; ++ ++ /*TODO: check reserved bits and PK */ ++ ++ /* check for R/W */ ++ if ((prot & _PAGE_RW) == 0) { ++ if (write_access && (user_mode_access || cr0_wp)) ++ /* ++ * case 1: Supermode and wp is 1 ++ * case 2: Usermode ++ */ ++ goto check_fault; ++ page_rw_flags_on = false; ++ } ++ ++ /* check for U/S */ ++ if ((prot & _PAGE_USER) == 0) { ++ user_mode_addr = false; ++ if (user_mode_access) ++ goto check_fault; ++ } ++ ++ /* ++ * When SMAP is on, we only need to apply check when address is ++ * user-mode address. ++ * ++ * Also SMAP only impacts the supervisor-mode access. ++ */ ++ /* if SMAP is enabled and supervisor-mode access */ ++ if (cr4_smap && (!user_mode_access) && user_mode_addr) { ++ bool acflag = vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_AC; ++ ++ /* read from user mode address, eflags.ac = 0 */ ++ if ((!write_access) && (!acflag)) { ++ goto check_fault; ++ } else if (write_access) { ++ /* write to user mode address */ ++ ++ /* cr0.wp = 0, eflags.ac = 0 */ ++ if ((!cr0_wp) && (!acflag)) ++ goto check_fault; ++ ++ /* ++ * cr0.wp = 1, eflags.ac = 1, r/w flag is 0 ++ * on any paging structure entry ++ */ ++ if (cr0_wp && acflag && (!page_rw_flags_on)) ++ goto check_fault; ++ ++ /* cr0.wp = 1, eflags.ac = 0 */ ++ if (cr0_wp && (!acflag)) ++ goto check_fault; ++ } else { ++ /* do nothing */ ++ } ++ } ++ } ++ ++ return 0; ++ ++check_fault: ++ errcode |= write_access | user_mode_access; ++ exception->error_code = errcode; ++ exception->vector = PF_VECTOR; ++ exception->error_code_valid = true; ++ exception->address = gva; ++ exception->nested_page_fault = false; ++ exception->async_page_fault = false; ++ return -EFAULT; ++ ++} ++ ++int gva2gpa(struct kvm_vcpu *vcpu, gva_t gva, gpa_t *gpa, ++ u32 access, struct x86_exception *exception) ++{ ++ struct pkvm_pgtable guest_mmu; ++ gpa_t _gpa; ++ u64 prot; ++ int pg_level; ++ ++ /* caller should ensure exception is not NULL */ ++ WARN_ON(exception == NULL); ++ ++ memset(exception, 0, sizeof(*exception)); ++ ++ /*TODO: support other paging mode beside long mode */ ++ guest_mmu.root_pa = vcpu->arch.cr3 & PAGE_MASK; ++ pkvm_pgtable_init(&guest_mmu, &mm_ops, &mmu_ops, &pkvm_hyp->mmu_cap, false); ++ pkvm_pgtable_lookup(&guest_mmu, (unsigned long)gva, ++ (unsigned long *)&_gpa, &prot, &pg_level); ++ *gpa = _gpa; ++ ++ return check_translation(vcpu, gva, _gpa, prot, access, exception); ++} ++ ++static inline int __copy_gpa(struct kvm_vcpu *vcpu, void *addr, gpa_t gpa, ++ unsigned int size, unsigned int pg_size, ++ bool from_guest) ++{ ++ unsigned int len, offset_in_pg; ++ void *hva; ++ ++ offset_in_pg = (unsigned int)gpa & (pg_size - 1); ++ len = (size > (pg_size - offset_in_pg)) ? (pg_size - offset_in_pg) : size; ++ ++ hva = host_gpa2hva(gpa); ++ if (from_guest) ++ memcpy(addr, hva, len); ++ else ++ memcpy(hva, addr, len); ++ ++ return len; ++} ++ ++/* only support host VM now */ ++static int copy_gva(struct kvm_vcpu *vcpu, gva_t gva, void *addr, ++ unsigned int bytes, struct x86_exception *exception, bool from_guest) ++{ ++ u32 access = VMX_AR_DPL(vmcs_read32(GUEST_SS_AR_BYTES)) == 3 ? 
PFERR_USER_MASK : 0; ++ gpa_t gpa; ++ unsigned int len; ++ int ret = 0; ++ ++ if (!from_guest) ++ access |= PFERR_WRITE_MASK; ++ ++ while ((bytes > 0) && (ret == 0)) { ++ ret = gva2gpa(vcpu, gva, &gpa, access, exception); ++ if (ret >= 0) { ++ len = __copy_gpa(vcpu, addr, gpa, bytes, PAGE_SIZE, from_guest); ++ if (len == 0) ++ return -EINVAL; ++ gva += len; ++ addr += len; ++ bytes -= len; ++ } ++ } ++ ++ return ret; ++} ++ ++int read_gva(struct kvm_vcpu *vcpu, gva_t gva, void *addr, ++ unsigned int bytes, struct x86_exception *exception) ++{ ++ return copy_gva(vcpu, gva, addr, bytes, exception, true); ++} ++ ++int write_gva(struct kvm_vcpu *vcpu, gva_t gva, void *addr, ++ unsigned int bytes, struct x86_exception *exception) ++{ ++ return copy_gva(vcpu, gva, addr, bytes, exception, false); ++} ++ ++/* only support host VM now */ ++static int copy_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, void *addr, ++ unsigned int bytes, bool from_guest) ++{ ++ unsigned int len; ++ ++ while (bytes > 0) { ++ len = __copy_gpa(vcpu, addr, gpa, bytes, PAGE_SIZE, from_guest); ++ if (len == 0) ++ return -EINVAL; ++ gpa += len; ++ addr += len; ++ bytes -= len; ++ } ++ ++ return 0; ++} ++ ++int read_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, void *addr, unsigned int bytes) ++{ ++ return copy_gpa(vcpu, gpa, addr, bytes, true); ++} ++ ++int write_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, void *addr, unsigned int bytes) ++{ ++ return copy_gpa(vcpu, gpa, addr, bytes, false); ++} ++ ++bool find_mem_range(unsigned long addr, struct mem_range *range) ++{ ++ int cur, left = 0, right = pkvm_memblock_nr; ++ struct memblock_region *reg; ++ unsigned long end; ++ ++ range->start = 0; ++ range->end = ULONG_MAX; ++ ++ /* The list of memblock regions is sorted, binary search it */ ++ while (left < right) { ++ cur = (left + right) >> 1; ++ reg = &pkvm_memory[cur]; ++ end = reg->base + reg->size; ++ if (addr < reg->base) { ++ right = cur; ++ range->end = reg->base; ++ } else if (addr >= end) { ++ left = cur + 1; ++ range->start = end; ++ } else { ++ range->start = reg->base; ++ range->end = end; ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++bool mem_range_included(struct mem_range *child, struct mem_range *parent) ++{ ++ return parent->start <= child->start && child->end <= parent->end; ++} ++ ++static void pkvm_clflush_cache_range_opt(void *vaddr, unsigned int size) ++{ ++ const unsigned long clflush_size = __x86_clflush_size; ++ void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); ++ void *vend = vaddr + size; ++ ++ if (p >= vend) ++ return; ++ ++ for (; p < vend; p += clflush_size) ++ clflushopt(p); ++} ++ ++/** ++ * pkvm_clflush_cache_range - flush a cache range with clflush ++ * which is implemented by referring to clflush_cache_range() in kernel. ++ * ++ * @vaddr: virtual start address ++ * @size: number of bytes to flush ++ */ ++void pkvm_clflush_cache_range(void *vaddr, unsigned int size) ++{ ++ /* ++ * clflush is an unordered instruction which needs fencing ++ * with MFENCE or SFENCE to avoid ordering issue. Put a mb() ++ * before the clflush. ++ */ ++ mb(); ++ pkvm_clflush_cache_range_opt(vaddr, size); ++ /* And also put another one after. 
*/ ++ mb(); ++} ++ ++u64 get_max_physaddr_bits(void) ++{ ++ u32 eax, ebx, ecx, edx; ++ ++ if (max_physaddr_bits) ++ return max_physaddr_bits; ++ ++ eax = 0x80000000; ++ ecx = 0; ++ native_cpuid(&eax, &ebx, &ecx, &edx); ++ if (eax >= 0x80000008) { ++ eax = 0x80000008; ++ native_cpuid(&eax, &ebx, &ecx, &edx); ++ max_physaddr_bits = (u8)eax & 0xff; ++ } ++ ++ return max_physaddr_bits; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/memory.h b/arch/x86/kvm/vmx/pkvm/hyp/memory.h +new file mode 100644 +index 000000000000..ba6608ec6800 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/memory.h +@@ -0,0 +1,51 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_MEMORY_H_ ++#define _PKVM_MEMORY_H_ ++ ++#include ++ ++#define INVALID_ADDR (~(unsigned long)0) ++ ++/* ++ * simply define IOVA offset from bit 43 to avoid ++ * canonical addressing check for the linear address ++ * as max linear address bits usually >= 47 ++ */ ++#define PKVM_IOVA_OFFSET 0x0000080000000000 ++ ++/* MMU entry property bits for UC. Can be used to map MMIO. */ ++#define PKVM_PAGE_IO_NOCACHE ((u64)(__PAGE_KERNEL | _PAGE_PWT | _PAGE_PCD)) ++ ++unsigned long pkvm_virt_to_symbol_phys(void *virt); ++#define __pkvm_pa_symbol(x) pkvm_virt_to_symbol_phys((void *)x) ++ ++void *pkvm_iophys_to_virt(unsigned long phys); ++ ++#include ++void *host_gpa2hva(unsigned long gpa); ++unsigned long host_gpa2hpa(unsigned long gpa); ++void *host_mmio2hva(unsigned long gpa); ++int gva2gpa(struct kvm_vcpu *vcpu, gva_t gva, gpa_t *gpa, ++ u32 access, struct x86_exception *exception); ++int read_gva(struct kvm_vcpu *vcpu, gva_t gva, void *addr, ++ unsigned int bytes, struct x86_exception *exception); ++int write_gva(struct kvm_vcpu *vcpu, gva_t gva, void *addr, ++ unsigned int bytes, struct x86_exception *exception); ++int read_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, void *addr, unsigned int bytes); ++int write_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, void *addr, unsigned int bytes); ++ ++struct mem_range { ++ unsigned long start; ++ unsigned long end; ++}; ++ ++bool find_mem_range(unsigned long addr, struct mem_range *range); ++bool mem_range_included(struct mem_range *child, struct mem_range *parent); ++ ++void pkvm_clflush_cache_range(void *vaddr, unsigned int size); ++ ++u64 get_max_physaddr_bits(void); ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/mmu.c b/arch/x86/kvm/vmx/pkvm/hyp/mmu.c +new file mode 100644 +index 000000000000..5cf5c784e501 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/mmu.c +@@ -0,0 +1,258 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include "pkvm_hyp.h" ++#include "early_alloc.h" ++#include "pgtable.h" ++#include "mmu.h" ++#include "debug.h" ++ ++static struct pkvm_pool mmu_pool; ++static struct pkvm_pgtable hyp_mmu; ++static pkvm_spinlock_t _hyp_mmu_lock = __PKVM_SPINLOCK_UNLOCKED; ++ ++static void *mmu_zalloc_page(void) ++{ ++ return pkvm_alloc_pages(&mmu_pool, 0); ++} ++ ++static void mmu_get_page(void *vaddr) ++{ ++ pkvm_get_page(&mmu_pool, vaddr); ++} ++ ++static void mmu_put_page(void *vaddr) ++{ ++ pkvm_put_page(&mmu_pool, vaddr); ++} ++ ++static void flush_tlb_noop(struct pkvm_pgtable *pgt, ++ unsigned long addr, unsigned long size) ++{ ++} ++ ++static struct pkvm_mm_ops mmu_mm_ops = { ++ .phys_to_virt = pkvm_phys_to_virt, ++ .virt_to_phys = pkvm_virt_to_phys, ++ .zalloc_page = mmu_zalloc_page, ++ .get_page = mmu_get_page, ++ .put_page = 
mmu_put_page, ++ .page_count = pkvm_page_count, ++ .flush_tlb = flush_tlb_noop, ++}; ++ ++static bool mmu_entry_present(void *ptep) ++{ ++ return pte_present(*(pte_t *)ptep); ++} ++ ++static bool mmu_entry_huge(void *ptep) ++{ ++ return pte_huge(*(pte_t *)ptep); ++} ++ ++static void mmu_entry_mkhuge(void *ptep) ++{ ++ pte_t *ptep_ptr = (pte_t *)ptep; ++ ++ *ptep_ptr = pte_mkhuge(*ptep_ptr); ++} ++ ++static unsigned long mmu_entry_to_phys(void *ptep) ++{ ++ return native_pte_val(*(pte_t *)ptep) & PTE_PFN_MASK; ++} ++ ++static u64 mmu_entry_to_prot(void *ptep) ++{ ++ return (u64)pte_flags(pte_clear_flags(*(pte_t *)ptep, _PAGE_PSE)); ++} ++ ++static int mmu_entry_to_index(unsigned long vaddr, int level) ++{ ++ return PT_LEVEL_INDEX(vaddr, level); ++} ++ ++static bool mmu_entry_is_leaf(void *ptep, int level) ++{ ++ if (level == PG_LEVEL_4K || ++ !mmu_entry_present(ptep) || ++ mmu_entry_huge(ptep)) ++ return true; ++ ++ return false; ++} ++ ++static int mmu_level_entry_size(int level) ++{ ++ return PAGE_SIZE / PTRS_PER_PTE; ++} ++ ++static int mmu_level_to_entries(int level) ++{ ++ return PTRS_PER_PTE; ++} ++ ++static unsigned long mmu_level_to_size(int level) ++{ ++ return page_level_size(level); ++} ++ ++static void mmu_set_entry(void *ptep, u64 pte) ++{ ++ native_set_pte((pte_t *)ptep, native_make_pte(pte)); ++} ++ ++static u64 mmu_level_page_mask(int level) ++{ ++ return (~((1UL << PT64_LEVEL_SHIFT(level)) - 1)); ++} ++ ++struct pkvm_pgtable_ops mmu_ops = { ++ .pgt_entry_present = mmu_entry_present, ++ .pgt_entry_mapped = mmu_entry_present, ++ .pgt_entry_huge = mmu_entry_huge, ++ .pgt_entry_mkhuge = mmu_entry_mkhuge, ++ .pgt_entry_to_phys = mmu_entry_to_phys, ++ .pgt_entry_to_prot = mmu_entry_to_prot, ++ .pgt_entry_to_index = mmu_entry_to_index, ++ .pgt_level_page_mask = mmu_level_page_mask, ++ .pgt_entry_is_leaf = mmu_entry_is_leaf, ++ .pgt_level_entry_size = mmu_level_entry_size, ++ .pgt_level_to_entries = mmu_level_to_entries, ++ .pgt_level_to_size = mmu_level_to_size, ++ .pgt_set_entry = mmu_set_entry, ++ .default_prot = MMU_PROT_DEF, ++}; ++ ++static int finalize_host_mappings_walker(struct pkvm_pgtable *mmu, ++ unsigned long vaddr, ++ unsigned long vaddr_end, ++ int level, ++ void *ptep, ++ unsigned long flags, ++ struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_mm_ops *mm_ops = arg; ++ struct pkvm_pgtable_ops *pgt_ops = mmu->pgt_ops; ++ ++ if (!pgt_ops->pgt_entry_present(ptep)) ++ return 0; ++ ++ /* ++ * Fix-up the refcount for the page-table pages as the early allocator ++ * was unable to access the pkvm_vmemmap and so the buddy allocator has ++ * initialized the refcount to '1'. ++ */ ++ mm_ops->get_page(ptep); ++ ++ return 0; ++} ++ ++static int fix_pgtable_refcnt(void) ++{ ++ unsigned long size; ++ struct pkvm_pgtable_ops *pgt_ops; ++ struct pkvm_pgtable_walker walker = { ++ .cb = finalize_host_mappings_walker, ++ .flags = PKVM_PGTABLE_WALK_LEAF | PKVM_PGTABLE_WALK_TABLE_POST, ++ .arg = hyp_mmu.mm_ops, ++ }; ++ ++ pgt_ops = hyp_mmu.pgt_ops; ++ /* ++ * Calculate the max address space, then walk the [0, size) address ++ * range to fixup refcount of every used page. 
++ */ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++ /* ++ * only fix vmmemap range for debug mode, now for 64T memory, ++ * could be extended if physical memory is bigger than 64T ++ */ ++ size = (SZ_64T / PAGE_SIZE) * sizeof(struct pkvm_page); ++#else ++ size = pgt_ops->pgt_level_to_size(hyp_mmu.level + 1); ++#endif ++ ++ return pgtable_walk(&hyp_mmu, 0, size, true, &walker); ++} ++ ++int pkvm_mmu_map(unsigned long vaddr_start, unsigned long phys_start, ++ unsigned long size, int pgsz_mask, u64 prot) ++{ ++ int ret; ++ ++ pkvm_spin_lock(&_hyp_mmu_lock); ++ ret = pkvm_pgtable_map(&hyp_mmu, vaddr_start, phys_start, ++ size, pgsz_mask, prot, NULL); ++ pkvm_spin_unlock(&_hyp_mmu_lock); ++ return ret; ++} ++ ++int pkvm_mmu_unmap(unsigned long vaddr_start, unsigned long size) ++{ ++ int ret; ++ ++ pkvm_spin_lock(&_hyp_mmu_lock); ++ ret = pkvm_pgtable_unmap(&hyp_mmu, vaddr_start, size, NULL); ++ pkvm_spin_unlock(&_hyp_mmu_lock); ++ return ret; ++} ++ ++/* early mmu init before vmemmap ready, use early allocator first */ ++int pkvm_early_mmu_init(struct pkvm_pgtable_cap *cap, ++ void *mmu_pool_base, unsigned long mmu_pool_pages) ++{ ++ pkvm_early_alloc_init(mmu_pool_base, mmu_pool_pages << PAGE_SHIFT); ++ pkvm_hyp->mmu = &hyp_mmu; ++ return pkvm_pgtable_init(&hyp_mmu, &pkvm_early_alloc_mm_ops, &mmu_ops, cap, true); ++} ++ ++/* later mmu init after vmemmap ready, switch to buddy allocator */ ++int pkvm_later_mmu_init(void *mmu_pool_base, unsigned long mmu_pool_pages) ++{ ++ unsigned long reserved_pages, pfn; ++ int ret; ++ ++ /* Enable buddy allocator */ ++ pfn = __pkvm_pa(mmu_pool_base) >> PAGE_SHIFT; ++ reserved_pages = pkvm_early_alloc_nr_used_pages(); ++ ret = pkvm_pool_init(&mmu_pool, pfn, mmu_pool_pages, reserved_pages); ++ if (ret) { ++ pkvm_err("fail to init mmu_pool"); ++ return ret; ++ } ++ ++ /* The ops should alloc memory from mmu_pool now */ ++ hyp_mmu.mm_ops = &mmu_mm_ops; ++ ++ /* ++ * as we used early alloc mm_ops to create early pgtable mapping for mmu, ++ * the refcount was not maintained at that time, we need fix it by re-walk ++ * the pgtable ++ */ ++ return fix_pgtable_refcnt(); ++} ++ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++void pkvm_mmu_clone_host(int level, unsigned long start_vaddr) ++{ ++ int i = mmu_entry_to_index(start_vaddr, level); ++ u64 *ptep = __va(hyp_mmu.root_pa); ++ u64 *host_cr3 = __va(__read_cr3() & PAGE_MASK); ++ ++ for (; i < PTRS_PER_PTE; i++) ++ ptep[i] = host_cr3[i]; ++ ++} ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/mmu.h b/arch/x86/kvm/vmx/pkvm/hyp/mmu.h +new file mode 100644 +index 000000000000..ea2df00e1a5b +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/mmu.h +@@ -0,0 +1,28 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_MMU_H_ ++#define _PKVM_MMU_H_ ++ ++#define MMU_PROT_DEF 0 ++ ++int pkvm_mmu_map(unsigned long vaddr_start, unsigned long phys_start, ++ unsigned long size, int pgsz_mask, u64 prot); ++ ++int pkvm_mmu_unmap(unsigned long vaddr_start, unsigned long size); ++ ++int pkvm_early_mmu_init(struct pkvm_pgtable_cap *cap, ++ void *mmu_pool_base, unsigned long mmu_pool_pages); ++ ++int pkvm_later_mmu_init(void *mmu_pool_base, unsigned long mmu_pool_pages); ++ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++void pkvm_mmu_clone_host(int level, unsigned long start_vaddr); ++#else ++static inline void pkvm_mmu_clone_host(int level, unsigned long start_vaddr) {} ++#endif ++ ++extern struct pkvm_pgtable_ops mmu_ops; ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/nested.c b/arch/x86/kvm/vmx/pkvm/hyp/nested.c 
+new file mode 100644 +index 000000000000..ab4b4e40baf2 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/nested.c +@@ -0,0 +1,1485 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++#include ++#include "pkvm_hyp.h" ++#include "nested.h" ++#include "cpu.h" ++#include "vmx.h" ++#include "ept.h" ++#include "debug.h" ++#include "mem_protect.h" ++ ++/* ++ * Not support shadow vmcs & vmfunc; ++ * Not support descriptor-table exiting ++ * as it requires guest memory access ++ * to decode and emulate instructions ++ * which is not supported for protected VM. ++ */ ++#define NESTED_UNSUPPORTED_2NDEXEC \ ++ (SECONDARY_EXEC_SHADOW_VMCS | \ ++ SECONDARY_EXEC_ENABLE_VMFUNC | \ ++ SECONDARY_EXEC_DESC) ++ ++static const unsigned int vmx_msrs[] = { ++ LIST_OF_VMX_MSRS ++}; ++ ++bool is_vmx_msr(unsigned long msr) ++{ ++ bool found = false; ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(vmx_msrs); i++) { ++ if (msr == vmx_msrs[i]) { ++ found = true; ++ break; ++ } ++ } ++ ++ return found; ++} ++ ++int read_vmx_msr(struct kvm_vcpu *vcpu, unsigned long msr, u64 *val) ++{ ++ u32 low, high; ++ int err = 0; ++ ++ pkvm_rdmsr(msr, low, high); ++ ++ switch (msr) { ++ case MSR_IA32_VMX_PROCBASED_CTLS2: ++ high &= ~NESTED_UNSUPPORTED_2NDEXEC; ++ break; ++ case MSR_IA32_VMX_MISC: ++ /* not support PT, SMM, Shadowing */ ++ low &= ~(MSR_IA32_VMX_MISC_INTEL_PT | BIT(14) | BIT(28) ++ | MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS); ++ break; ++ case MSR_IA32_VMX_VMFUNC: ++ /* not support vmfunc */ ++ low = high = 0; ++ break; ++ case MSR_IA32_VMX_EPT_VPID_CAP: ++ low &= ~VMX_EPT_AD_BIT; ++ break; ++ default: ++ err = -EACCES; ++ break; ++ } ++ ++ *val = (u64)high << 32 | (u64)low; ++ ++ return err; ++} ++ ++/** ++ * According to SDM Appendix B Field Encoding in VMCS, some fields only ++ * exist on processors that support the 1-setting of the corresponding ++ * fields in the control regs. 
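++ *
++ * For example (mirroring the switch below): MSR_BITMAP only exists when
++ * CPU_BASED_USE_MSR_BITMAPS can be 1 in the primary processor-based controls,
++ * and EPT_POINTER only when SECONDARY_EXEC_ENABLE_EPT can be 1 in the
++ * secondary controls.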
++ */ ++static bool has_vmcs_field(u16 encoding) ++{ ++ struct nested_vmx_msrs *msrs = &pkvm_hyp->vmcs_config.nested; ++ ++ switch (encoding) { ++ case MSR_BITMAP: ++ return msrs->procbased_ctls_high & CPU_BASED_USE_MSR_BITMAPS; ++ case VIRTUAL_APIC_PAGE_ADDR: ++ case VIRTUAL_APIC_PAGE_ADDR_HIGH: ++ case TPR_THRESHOLD: ++ return msrs->procbased_ctls_high & CPU_BASED_TPR_SHADOW; ++ case SECONDARY_VM_EXEC_CONTROL: ++ return msrs->procbased_ctls_high & ++ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; ++ case VIRTUAL_PROCESSOR_ID: ++ return msrs->secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID; ++ case XSS_EXIT_BITMAP: ++ return msrs->secondary_ctls_high & SECONDARY_EXEC_XSAVES; ++ case PML_ADDRESS: ++ return msrs->secondary_ctls_high & SECONDARY_EXEC_ENABLE_PML; ++ case VM_FUNCTION_CONTROL: ++ return msrs->secondary_ctls_high & SECONDARY_EXEC_ENABLE_VMFUNC; ++ case EPT_POINTER: ++ return msrs->secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT; ++ case EOI_EXIT_BITMAP0: ++ case EOI_EXIT_BITMAP1: ++ case EOI_EXIT_BITMAP2: ++ case EOI_EXIT_BITMAP3: ++ return msrs->secondary_ctls_high & ++ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; ++ case VMREAD_BITMAP: ++ case VMWRITE_BITMAP: ++ return msrs->secondary_ctls_high & SECONDARY_EXEC_SHADOW_VMCS; ++ case ENCLS_EXITING_BITMAP: ++ return msrs->secondary_ctls_high & ++ SECONDARY_EXEC_ENCLS_EXITING; ++ case GUEST_INTR_STATUS: ++ return msrs->secondary_ctls_high & ++ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; ++ case GUEST_PML_INDEX: ++ return msrs->secondary_ctls_high & SECONDARY_EXEC_ENABLE_PML; ++ case APIC_ACCESS_ADDR: ++ case APIC_ACCESS_ADDR_HIGH: ++ return msrs->secondary_ctls_high & ++ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; ++ case TSC_MULTIPLIER: ++ case TSC_MULTIPLIER_HIGH: ++ return msrs->secondary_ctls_high & ++ SECONDARY_EXEC_TSC_SCALING; ++ case GUEST_PHYSICAL_ADDRESS: ++ case GUEST_PHYSICAL_ADDRESS_HIGH: ++ return msrs->secondary_ctls_high & ++ SECONDARY_EXEC_ENABLE_EPT; ++ case GUEST_PDPTR0: ++ case GUEST_PDPTR0_HIGH: ++ case GUEST_PDPTR1: ++ case GUEST_PDPTR1_HIGH: ++ case GUEST_PDPTR2: ++ case GUEST_PDPTR2_HIGH: ++ case GUEST_PDPTR3: ++ case GUEST_PDPTR3_HIGH: ++ return msrs->secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT; ++ case PLE_GAP: ++ case PLE_WINDOW: ++ return msrs->secondary_ctls_high & ++ SECONDARY_EXEC_PAUSE_LOOP_EXITING; ++ case VMX_PREEMPTION_TIMER_VALUE: ++ return msrs->pinbased_ctls_high & ++ PIN_BASED_VMX_PREEMPTION_TIMER; ++ case POSTED_INTR_DESC_ADDR: ++ return msrs->pinbased_ctls_high & PIN_BASED_POSTED_INTR; ++ case POSTED_INTR_NV: ++ return msrs->pinbased_ctls_high & PIN_BASED_POSTED_INTR; ++ case GUEST_IA32_PAT: ++ case GUEST_IA32_PAT_HIGH: ++ return (msrs->entry_ctls_high & VM_ENTRY_LOAD_IA32_PAT) || ++ (msrs->exit_ctls_high & VM_EXIT_SAVE_IA32_PAT); ++ case GUEST_IA32_EFER: ++ case GUEST_IA32_EFER_HIGH: ++ return (msrs->entry_ctls_high & VM_ENTRY_LOAD_IA32_EFER) || ++ (msrs->exit_ctls_high & VM_EXIT_SAVE_IA32_EFER); ++ case GUEST_IA32_PERF_GLOBAL_CTRL: ++ case GUEST_IA32_PERF_GLOBAL_CTRL_HIGH: ++ return msrs->entry_ctls_high & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; ++ case GUEST_BNDCFGS: ++ case GUEST_BNDCFGS_HIGH: ++ return (msrs->entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || ++ (msrs->exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS); ++ case GUEST_IA32_RTIT_CTL: ++ case GUEST_IA32_RTIT_CTL_HIGH: ++ return (msrs->entry_ctls_high & VM_ENTRY_LOAD_IA32_RTIT_CTL) || ++ (msrs->exit_ctls_high & VM_EXIT_CLEAR_IA32_RTIT_CTL); ++ case HOST_IA32_PAT: ++ case HOST_IA32_PAT_HIGH: ++ return msrs->exit_ctls_high & VM_EXIT_LOAD_IA32_PAT; ++ case 
HOST_IA32_EFER: ++ case HOST_IA32_EFER_HIGH: ++ return msrs->exit_ctls_high & VM_EXIT_LOAD_IA32_EFER; ++ case HOST_IA32_PERF_GLOBAL_CTRL: ++ case HOST_IA32_PERF_GLOBAL_CTRL_HIGH: ++ return msrs->exit_ctls_high & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; ++ case EPTP_LIST_ADDRESS: ++ return msrs->vmfunc_controls & VMX_VMFUNC_EPTP_SWITCHING; ++ default: ++ return true; ++ } ++} ++ ++enum VMXResult { ++ VMsucceed, ++ VMfailValid, ++ VMfailInvalid, ++}; ++ ++struct shadow_vmcs_field { ++ u16 encoding; ++ u16 offset; ++}; ++ ++static u8 vmx_vmread_bitmap[PAGE_SIZE] __aligned(PAGE_SIZE); ++static u8 vmx_vmwrite_bitmap[PAGE_SIZE] __aligned(PAGE_SIZE); ++ ++static struct shadow_vmcs_field shadow_read_only_fields[] = { ++#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, ++#include "pkvm_nested_vmcs_fields.h" ++}; ++static int max_shadow_read_only_fields = ++ ARRAY_SIZE(shadow_read_only_fields); ++static struct shadow_vmcs_field shadow_read_write_fields[] = { ++#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, ++#include "pkvm_nested_vmcs_fields.h" ++}; ++static int max_shadow_read_write_fields = ++ ARRAY_SIZE(shadow_read_write_fields); ++static struct shadow_vmcs_field emulated_fields[] = { ++#define EMULATED_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, ++#include "pkvm_nested_vmcs_fields.h" ++}; ++static int max_emulated_fields = ++ ARRAY_SIZE(emulated_fields); ++ ++static void init_vmcs_shadow_fields(void) ++{ ++ int i, j; ++ ++ memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); ++ memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); ++ ++ for (i = j = 0; i < max_shadow_read_only_fields; i++) { ++ struct shadow_vmcs_field entry = shadow_read_only_fields[i]; ++ u16 field = entry.encoding; ++ ++ if (!has_vmcs_field(field)) ++ continue; ++ ++ if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && ++ (i + 1 == max_shadow_read_only_fields || ++ shadow_read_only_fields[i + 1].encoding != field + 1)) { ++ pkvm_err("Missing field from shadow_read_only_field %x\n", ++ field + 1); ++ } ++ ++ clear_bit(field, (unsigned long *)vmx_vmread_bitmap); ++ if (field & 1) ++ continue; ++ shadow_read_only_fields[j++] = entry; ++ } ++ max_shadow_read_only_fields = j; ++ ++ for (i = j = 0; i < max_shadow_read_write_fields; i++) { ++ struct shadow_vmcs_field entry = shadow_read_write_fields[i]; ++ u16 field = entry.encoding; ++ ++ if (!has_vmcs_field(field)) ++ continue; ++ ++ if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && ++ (i + 1 == max_shadow_read_write_fields || ++ shadow_read_write_fields[i + 1].encoding != field + 1)) { ++ pkvm_err("Missing field from shadow_read_write_field %x\n", ++ field + 1); ++ } ++ ++ clear_bit(field, (unsigned long *)vmx_vmwrite_bitmap); ++ clear_bit(field, (unsigned long *)vmx_vmread_bitmap); ++ if (field & 1) ++ continue; ++ shadow_read_write_fields[j++] = entry; ++ } ++ max_shadow_read_write_fields = j; ++} ++ ++static void init_emulated_vmcs_fields(void) ++{ ++ int i, j; ++ ++ for (i = j = 0; i < max_emulated_fields; i++) { ++ struct shadow_vmcs_field entry = emulated_fields[i]; ++ u16 field = entry.encoding; ++ ++ if (!has_vmcs_field(field)) ++ continue; ++ ++ emulated_fields[j++] = entry; ++ } ++ max_emulated_fields = j; ++} ++ ++static bool is_host_fields(unsigned long field) ++{ ++ return (((field) >> 10U) & 0x3U) == 3U; ++} ++ ++static bool is_emulated_fields(unsigned long field_encoding) ++{ ++ int i; ++ ++ for (i = 0; i < max_emulated_fields; i++) { ++ if ((unsigned long)emulated_fields[i].encoding == field_encoding) ++ return true; ++ } ++ ++ return 
false; ++} ++ ++static void nested_vmx_result(enum VMXResult result, int error_number) ++{ ++ u64 rflags = vmcs_readl(GUEST_RFLAGS); ++ ++ rflags &= ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | ++ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF); ++ ++ if (result == VMfailValid) { ++ rflags |= X86_EFLAGS_ZF; ++ vmcs_write32(VM_INSTRUCTION_ERROR, error_number); ++ } else if (result == VMfailInvalid) { ++ rflags |= X86_EFLAGS_CF; ++ } else { ++ /* VMsucceed, do nothing */ ++ } ++ ++ if (result != VMsucceed) { ++ pkvm_err("VMX failed: %d/%d", result, error_number); ++ } ++ ++ vmcs_writel(GUEST_RFLAGS, rflags); ++} ++ ++static int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, ++ u32 vmx_instruction_info, gva_t *ret) ++{ ++ gva_t off; ++ struct kvm_segment s; ++ ++ /* ++ * According to Vol. 3B, "Information for VM Exits Due to Instruction ++ * Execution", on an exit, vmx_instruction_info holds most of the ++ * addressing components of the operand. Only the displacement part ++ * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). ++ * For how an actual address is calculated from all these components, ++ * refer to Vol. 1, "Operand Addressing". ++ */ ++ int scaling = vmx_instruction_info & 3; ++ int addr_size = (vmx_instruction_info >> 7) & 7; ++ bool is_reg = vmx_instruction_info & (1u << 10); ++ int seg_reg = (vmx_instruction_info >> 15) & 7; ++ int index_reg = (vmx_instruction_info >> 18) & 0xf; ++ bool index_is_valid = !(vmx_instruction_info & (1u << 22)); ++ int base_reg = (vmx_instruction_info >> 23) & 0xf; ++ bool base_is_valid = !(vmx_instruction_info & (1u << 27)); ++ ++ if (is_reg) { ++ /* TODO: inject #UD */ ++ return 1; ++ } ++ ++ /* Addr = segment_base + offset */ ++ /* offset = base + [index * scale] + displacement */ ++ off = exit_qualification; /* holds the displacement */ ++ if (addr_size == 1) ++ off = (gva_t)sign_extend64(off, 31); ++ else if (addr_size == 0) ++ off = (gva_t)sign_extend64(off, 15); ++ if (base_is_valid) ++ off += vcpu->arch.regs[base_reg]; ++ if (index_is_valid) ++ off += vcpu->arch.regs[index_reg] << scaling; ++ ++ if (seg_reg == VCPU_SREG_FS) { ++ s.base = vmcs_readl(GUEST_FS_BASE); ++ } ++ if (seg_reg == VCPU_SREG_GS) { ++ s.base = vmcs_readl(GUEST_GS_BASE); ++ } ++ ++ /* TODO: support more cpu mode beside long mode */ ++ /* ++ * The effective address, i.e. @off, of a memory operand is truncated ++ * based on the address size of the instruction. Note that this is ++ * the *effective address*, i.e. the address prior to accounting for ++ * the segment's base. ++ */ ++ if (addr_size == 1) /* 32 bit */ ++ off &= 0xffffffff; ++ else if (addr_size == 0) /* 16 bit */ ++ off &= 0xffff; ++ ++ /* ++ * The virtual/linear address is never truncated in 64-bit ++ * mode, e.g. a 32-bit address size can yield a 64-bit virtual ++ * address when using FS/GS with a non-zero base. 
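++ *
++ * Hypothetical worked example: with addr_size == 1 (32-bit), an effective
++ * address of 0x100001000 is truncated to 0x00001000 above, and a non-zero
++ * FS base (say 0xffff888000000000) is then added without truncation, giving
++ * *ret = 0xffff888000001000.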
++ */ ++ if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) ++ *ret = s.base + off; ++ else ++ *ret = off; ++ ++ /* TODO: check addr is canonical, otherwise inject #GP/#SS */ ++ ++ return 0; ++} ++ ++static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, ++ int *ret) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ gva_t gva; ++ struct x86_exception e; ++ int r; ++ ++ if (get_vmx_mem_address(vcpu, vmx->exit_qualification, ++ vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) { ++ *ret = 1; ++ return -EINVAL; ++ } ++ ++ r = read_gva(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); ++ if (r < 0) { ++ /*TODO: handle memory failure exception */ ++ *ret = 1; ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int validate_vmcs_revision_id(struct kvm_vcpu *vcpu, gpa_t vmpointer) ++{ ++ struct vmcs_config *vmcs_config = &pkvm_hyp->vmcs_config; ++ u32 rev_id; ++ ++ read_gpa(vcpu, vmpointer, &rev_id, sizeof(rev_id)); ++ ++ return (rev_id == vmcs_config->revision_id); ++} ++ ++static bool check_vmx_permission(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ bool permit = true; ++ ++ /*TODO: check more env (cr, cpl) and inject #UD/#GP */ ++ if (!vmx->nested.vmxon) ++ permit = false; ++ ++ return permit; ++} ++ ++static void clear_shadow_indicator(struct vmcs *vmcs) ++{ ++ vmcs->hdr.shadow_vmcs = 0; ++} ++ ++static void set_shadow_indicator(struct vmcs *vmcs) ++{ ++ vmcs->hdr.shadow_vmcs = 1; ++} ++ ++/* current vmcs is vmcs02 */ ++static void copy_shadow_fields_vmcs02_to_vmcs12(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) ++{ ++ const struct shadow_vmcs_field *fields[] = { ++ shadow_read_write_fields, ++ shadow_read_only_fields ++ }; ++ const int max_fields[] = { ++ max_shadow_read_write_fields, ++ max_shadow_read_only_fields ++ }; ++ struct shadow_vmcs_field field; ++ unsigned long val; ++ int i, q; ++ ++ for (q = 0; q < ARRAY_SIZE(fields); q++) { ++ for (i = 0; i < max_fields[q]; i++) { ++ field = fields[q][i]; ++ val = __vmcs_readl(field.encoding); ++ if (is_host_fields((field.encoding))) { ++ pkvm_err("%s: field 0x%x is host field, please remove from shadowing!", ++ __func__, field.encoding); ++ continue; ++ } ++ vmcs12_write_any(vmcs12, field.encoding, field.offset, val); ++ } ++ } ++} ++ ++/* current vmcs is vmcs02 */ ++static void copy_shadow_fields_vmcs12_to_vmcs02(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) ++{ ++ const struct shadow_vmcs_field *fields[] = { ++ shadow_read_write_fields, ++ shadow_read_only_fields ++ }; ++ const int max_fields[] = { ++ max_shadow_read_write_fields, ++ max_shadow_read_only_fields ++ }; ++ struct shadow_vmcs_field field; ++ unsigned long val; ++ int i, q; ++ ++ for (q = 0; q < ARRAY_SIZE(fields); q++) { ++ for (i = 0; i < max_fields[q]; i++) { ++ field = fields[q][i]; ++ val = vmcs12_read_any(vmcs12, field.encoding, ++ field.offset); ++ if (is_host_fields((field.encoding))) { ++ pkvm_err("%s: field 0x%x is host field, please remove from shadowing!", ++ __func__, field.encoding); ++ continue; ++ } ++ __vmcs_writel(field.encoding, val); ++ } ++ } ++} ++ ++/* current vmcs is vmcs01*/ ++static void save_vmcs01_fields_for_emulation(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ ++ vcpu->arch.efer = vmcs_read64(GUEST_IA32_EFER); ++ vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); ++ vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); ++ vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); ++} ++ ++/* current vmcs is vmcs02*/ ++static u64 emulate_field_for_vmcs02(struct vcpu_vmx *vmx, u16 field, u64 virt_val) 
++{ ++ u64 val = virt_val; ++ struct kvm_vcpu *vcpu = &vmx->vcpu; ++ ++ switch (field) { ++ case VM_ENTRY_CONTROLS: ++ /* L1 host wishes to use its own MSRs for L2 guest? ++ * vmcs02 shall use such guest states in vmcs01 as its guest states ++ */ ++ if ((val & VM_ENTRY_LOAD_IA32_EFER) != VM_ENTRY_LOAD_IA32_EFER) { ++ val |= VM_ENTRY_LOAD_IA32_EFER; ++ vmcs_write64(GUEST_IA32_EFER, vcpu->arch.efer); ++ } ++ if ((val & VM_ENTRY_LOAD_IA32_PAT) != VM_ENTRY_LOAD_IA32_PAT) { ++ val |= VM_ENTRY_LOAD_IA32_PAT; ++ vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat); ++ } ++ if ((val & VM_ENTRY_LOAD_DEBUG_CONTROLS) != VM_ENTRY_LOAD_DEBUG_CONTROLS) { ++ val |= VM_ENTRY_LOAD_DEBUG_CONTROLS; ++ vmcs_writel(GUEST_DR7, vcpu->arch.dr7); ++ vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); ++ } ++ break; ++ case VM_EXIT_CONTROLS: ++ /* L1 host wishes to keep use MSRs from L2 guest after its VMExit? ++ * vmcs02 shall enable vmexit save for such guest states ++ * then vmcs01 shall take these guest states as its before L1 VMEntry ++ */ ++ if ((val & VM_EXIT_LOAD_IA32_EFER) != VM_EXIT_LOAD_IA32_EFER) ++ val |= VM_EXIT_SAVE_IA32_EFER; ++ if ((val & VM_EXIT_LOAD_IA32_PAT) != VM_EXIT_LOAD_IA32_PAT) ++ val |= VM_EXIT_SAVE_IA32_PAT; ++ /* host always in 64bit mode */ ++ val |= VM_EXIT_HOST_ADDR_SPACE_SIZE; ++ break; ++ case SECONDARY_VM_EXEC_CONTROL: ++ val &= ~NESTED_UNSUPPORTED_2NDEXEC; ++ /* Enable the #VE, but only protected VM will use it. */ ++ val |= SECONDARY_EXEC_EPT_VIOLATION_VE; ++ break; ++ } ++ return val; ++} ++ ++/* current vmcs is vmcs02*/ ++static void sync_vmcs12_dirty_fields_to_vmcs02(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) ++{ ++ struct shadow_vmcs_field field; ++ unsigned long val, phys_val; ++ int i; ++ ++ if (vmx->nested.dirty_vmcs12) { ++ for (i = 0; i < max_emulated_fields; i++) { ++ field = emulated_fields[i]; ++ if (field.encoding == EPT_POINTER) ++ /* ++ * EPTP is configured as shadow EPTP when the first ++ * time the vmcs02 is loaded. As shadow EPTP is not ++ * changed at the runtime, also cannot use the virtual ++ * EPT from KVM high, no need to sync to vmcs02 again. ++ */ ++ continue; ++ val = vmcs12_read_any(vmcs12, field.encoding, field.offset); ++ phys_val = emulate_field_for_vmcs02(vmx, field.encoding, val); ++ __vmcs_writel(field.encoding, phys_val); ++ } ++ vmx->nested.dirty_vmcs12 = false; ++ } ++} ++ ++/* current vmcs is vmcs01, set vmcs01 guest state with vmcs02 host state */ ++static void prepare_vmcs01_guest_state(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) ++{ ++ vmcs_writel(GUEST_CR0, vmcs12->host_cr0); ++ vmcs_writel(GUEST_CR3, vmcs12->host_cr3); ++ vmcs_writel(GUEST_CR4, vmcs12->host_cr4); ++ ++ vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); ++ vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); ++ vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); ++ ++ /* Both cases want vmcs01 to take EFER/PAT from L2 ++ * 1. L1 host wishes to load its own MSRs on L2 guest VMExit ++ * such vmcs12's host states shall be set as vmcs01's guest states ++ * 2. L1 host wishes to keep use MSRs from L2 guest after its VMExit ++ * such vmcs02's guest state shall be set as vmcs01's guest states ++ * the vmcs02's guest state were recorded in vmcs12 host ++ * ++ * For case 1, IA32_PERF_GLOBAL_CTRL is separately checked. 
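++ *
++ * Concretely, for case 1: if L1 set VM_EXIT_LOAD_IA32_EFER, then
++ * vmcs12->host_ia32_efer is the EFER value L1 expects after its VM-exit, and
++ * it is written to vmcs01's GUEST_IA32_EFER below so that L1 resumes with it.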
++ */ ++ vmcs_write64(GUEST_IA32_EFER, vmcs12->host_ia32_efer); ++ vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); ++ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) ++ vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl); ++ ++ vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector); ++ vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector); ++ vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector); ++ vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector); ++ vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector); ++ vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector); ++ vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector); ++ ++ vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base); ++ vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); ++ vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); ++ vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); ++ vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); ++ ++ vmcs_writel(GUEST_RIP, vmcs12->host_rip); ++ vmcs_writel(GUEST_RSP, vmcs12->host_rsp); ++ vmcs_writel(GUEST_RFLAGS, 0x2); ++} ++ ++static void nested_release_vmcs12(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ struct pkvm_host_vcpu *pkvm_hvcpu = to_pkvm_hvcpu(vcpu); ++ struct shadow_vcpu_state *cur_shadow_vcpu = pkvm_hvcpu->current_shadow_vcpu; ++ struct vmcs *vmcs02; ++ struct vmcs12 *vmcs12; ++ ++ if (vmx->nested.current_vmptr == INVALID_GPA) ++ return; ++ ++ /* cur_shadow_vcpu must be valid here */ ++ vmcs02 = (struct vmcs *)cur_shadow_vcpu->vmcs02; ++ vmcs12 = (struct vmcs12 *)cur_shadow_vcpu->cached_vmcs12; ++ vmcs_load_track(vmx, vmcs02); ++ copy_shadow_fields_vmcs02_to_vmcs12(vmx, vmcs12); ++ ++ vmcs_clear_track(vmx, vmcs02); ++ clear_shadow_indicator(vmcs02); ++ ++ /*disable shadowing*/ ++ vmcs_load_track(vmx, vmx->loaded_vmcs->vmcs); ++ secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); ++ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); ++ ++ write_gpa(vcpu, vmx->nested.current_vmptr, vmcs12, VMCS12_SIZE); ++ vmx->nested.dirty_vmcs12 = false; ++ vmx->nested.current_vmptr = INVALID_GPA; ++ pkvm_hvcpu->current_shadow_vcpu = NULL; ++ ++ WRITE_ONCE(cur_shadow_vcpu->vcpu, NULL); ++ /* ++ * Flush the current used shadow EPT to make sure ++ * nested_flush_shadow_ept() won't miss any flushing due to vmclear. ++ * See comments in nested_flush_shadow_ept(). ++ */ ++ pkvm_flush_shadow_ept(&cur_shadow_vcpu->vm->sept_desc); ++ kvm_clear_request(PKVM_REQ_TLB_FLUSH_SHADOW_EPT, vcpu); ++ ++ put_shadow_vcpu(cur_shadow_vcpu->shadow_vcpu_handle); ++} ++ ++static void nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ struct pkvm_host_vcpu *pkvm_hvcpu = to_pkvm_hvcpu(vcpu); ++ struct shadow_vcpu_state *cur_shadow_vcpu = pkvm_hvcpu->current_shadow_vcpu; ++ struct vmcs *vmcs02 = (struct vmcs *)cur_shadow_vcpu->vmcs02; ++ struct vmcs12 *vmcs12 = (struct vmcs12 *)cur_shadow_vcpu->cached_vmcs12; ++ ++ if (vmx->nested.current_vmptr == INVALID_GPA) { ++ nested_vmx_result(VMfailInvalid, 0); ++ } else if (vmcs12->launch_state == launch) { ++ /* VMLAUNCH_NONCLEAR_VMCS or VMRESUME_NONLAUNCHED_VMCS */ ++ nested_vmx_result(VMfailValid, ++ launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); ++ } else { ++ /* save vmcs01 guest state for possible emulation */ ++ save_vmcs01_fields_for_emulation(vcpu); ++ ++ /* switch to vmcs02 */ ++ vmcs_clear_track(vmx, vmcs02); ++ clear_shadow_indicator(vmcs02); ++ vmcs_load_track(vmx, vmcs02); ++ ++ sync_vmcs12_dirty_fields_to_vmcs02(vmx, vmcs12); ++ ++ /* mark guest mode */ ++ vcpu->arch.hflags |= HF_GUEST_MASK; ++ } ++} ++ ++static void setup_guest_ept(struct shadow_vcpu_state *shadow_vcpu, u64 guest_eptp) ++{ ++ struct vmcs12 *vmcs12 = (struct vmcs12 *)shadow_vcpu->cached_vmcs12; ++ struct pkvm_shadow_vm *vm = shadow_vcpu->vm; ++ bool invalidate = false; ++ ++ if (!is_valid_eptp(guest_eptp)) ++ pkvm_guest_ept_deinit(shadow_vcpu); ++ else if (vmcs12->ept_pointer != guest_eptp) { ++ pkvm_guest_ept_deinit(shadow_vcpu); ++ pkvm_guest_ept_init(shadow_vcpu, guest_eptp); ++ } ++ ++ pkvm_spin_lock(&vm->lock); ++ if (vm->sept_desc.last_guest_eptp != guest_eptp) { ++ vm->sept_desc.last_guest_eptp = guest_eptp; ++ invalidate = true; ++ } ++ pkvm_spin_unlock(&vm->lock); ++ ++ if (invalidate) ++ pkvm_invalidate_shadow_ept(&vm->sept_desc); ++} ++ ++int handle_vmxon(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ gpa_t vmptr; ++ int r; ++ ++ /*TODO: check env error(cr, efer, rflags, cpl) */ ++ if (vmx->nested.vmxon) { ++ nested_vmx_result(VMfailValid, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); ++ } else { ++ if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) { ++ nested_vmx_result(VMfailInvalid, 0); ++ return r; ++ } else if (!validate_vmcs_revision_id(vcpu, vmptr)) { ++ nested_vmx_result(VMfailInvalid, 0); ++ } else { ++ vmx->nested.current_vmptr = INVALID_GPA; ++ vmx->nested.dirty_vmcs12 = false; ++ vmx->nested.vmxon_ptr = vmptr; ++ vmx->nested.vmxon = true; ++ ++ nested_vmx_result(VMsucceed, 0); ++ } ++ } ++ ++ return 0; ++} ++ ++int handle_vmxoff(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ ++ if (check_vmx_permission(vcpu)) { ++ vmx->nested.vmxon = false; ++ vmx->nested.vmxon_ptr = INVALID_GPA; ++ ++ nested_vmx_result(VMsucceed, 0); ++ } ++ ++ return 0; ++} ++ ++int handle_vmptrld(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ struct pkvm_host_vcpu *pkvm_hvcpu = to_pkvm_hvcpu(vcpu); ++ struct shadow_vcpu_state *shadow_vcpu; ++ struct vmcs *vmcs02; ++ struct vmcs12 *vmcs12; ++ gpa_t vmptr; ++ int r; ++ ++ if (check_vmx_permission(vcpu)) { ++ if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) { ++ nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_INVALID_ADDRESS); ++ return r; ++ } else if (vmptr == vmx->nested.vmxon_ptr) { ++ nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_VMXON_POINTER); ++ } else if (!validate_vmcs_revision_id(vcpu, vmptr)) { ++ nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); ++ } else { ++ if (vmx->nested.current_vmptr != vmptr) { ++ s64 handle; ++ ++ nested_release_vmcs12(vcpu); ++ ++ handle = find_shadow_vcpu_handle_by_vmcs(vmptr); ++ if ((handle > 0) && (shadow_vcpu = get_shadow_vcpu(handle))) { ++ vmcs02 = (struct vmcs *)shadow_vcpu->vmcs02; ++ vmcs12 = (struct vmcs12 *) shadow_vcpu->cached_vmcs12; ++ ++ read_gpa(vcpu, vmptr, vmcs12, VMCS12_SIZE); ++ vmx->nested.dirty_vmcs12 = true; ++ ++ /* ++ * Save vmcs01 guest state for possible emulation when ++ * calling sync_vmcs12_dirty_fields_to_vmcs02. 
++ */ ++ save_vmcs01_fields_for_emulation(vcpu); ++ ++ WRITE_ONCE(shadow_vcpu->vcpu, vcpu); ++ if (!shadow_vcpu->vmcs02_inited) { ++ memset(vmcs02, 0, pkvm_hyp->vmcs_config.size); ++ vmcs02->hdr.revision_id = pkvm_hyp->vmcs_config.revision_id; ++ vmcs_load_track(vmx, vmcs02); ++ init_contant_host_state_area(pkvm_hvcpu->pcpu, vcpu->cpu); ++ vmcs_writel(HOST_RIP, (unsigned long)__pkvm_vmx_vmexit); ++ /* ++ * EPTP is mantained by pKVM and configured with ++ * shadow EPTP from its corresponding shadow VM. ++ * As shadow EPTP is not changed at runtime, set ++ * it to EPTP when the first time this vmcs02 is ++ * loading. ++ */ ++ vmcs_write64(EPT_POINTER, ++ shadow_vcpu->vm->sept_desc.shadow_eptp); ++ /* ++ * Flush the shadow eptp in case there are stale ++ * entries which are not flushed when destroying ++ * this shadow EPTP at last time. ++ */ ++ pkvm_flush_shadow_ept(&shadow_vcpu->vm->sept_desc); ++ ++ /* ++ * Write the #VE information physical address. ++ */ ++ if (shadow_vcpu_is_protected(shadow_vcpu)) { ++ memset(&shadow_vcpu->ve_info, 0, sizeof(shadow_vcpu->ve_info)); ++ vmcs_write64(VE_INFO_ADDR, __pkvm_pa(&shadow_vcpu->ve_info)); ++ } ++ ++ shadow_vcpu->last_cpu = vcpu->cpu; ++ shadow_vcpu->vmcs02_inited = true; ++ } else { ++ vmcs_load_track(vmx, vmcs02); ++ if (shadow_vcpu->last_cpu != vcpu->cpu) { ++ init_contant_host_state_area(pkvm_hvcpu->pcpu, vcpu->cpu); ++ shadow_vcpu->last_cpu = vcpu->cpu; ++ } ++ } ++ ++ pkvm_hvcpu->current_shadow_vcpu = shadow_vcpu; ++ ++ copy_shadow_fields_vmcs12_to_vmcs02(vmx, vmcs12); ++ sync_vmcs12_dirty_fields_to_vmcs02(vmx, vmcs12); ++ vmcs_clear_track(vmx, vmcs02); ++ set_shadow_indicator(vmcs02); ++ ++ /* enable shadowing */ ++ vmcs_load_track(vmx, vmx->loaded_vmcs->vmcs); ++ vmcs_write64(VMREAD_BITMAP, __pkvm_pa_symbol(vmx_vmread_bitmap)); ++ vmcs_write64(VMWRITE_BITMAP, __pkvm_pa_symbol(vmx_vmwrite_bitmap)); ++ secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); ++ vmcs_write64(VMCS_LINK_POINTER, __pkvm_pa(vmcs02)); ++ ++ vmx->nested.current_vmptr = vmptr; ++ ++ nested_vmx_result(VMsucceed, 0); ++ } else { ++ nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_INVALID_ADDRESS); ++ } ++ } else { ++ nested_vmx_result(VMsucceed, 0); ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++int handle_vmclear(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ gpa_t vmptr; ++ u32 zero = 0; ++ int r; ++ ++ if (check_vmx_permission(vcpu)) { ++ if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) { ++ nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_INVALID_ADDRESS); ++ return r; ++ } else if (vmptr == vmx->nested.vmxon_ptr) { ++ nested_vmx_result(VMfailValid, VMXERR_VMCLEAR_VMXON_POINTER); ++ } else { ++ if (vmx->nested.current_vmptr == vmptr) ++ nested_release_vmcs12(vcpu); ++ ++ write_gpa(vcpu, vmptr + offsetof(struct vmcs12, launch_state), ++ &zero, sizeof(zero)); ++ ++ nested_vmx_result(VMsucceed, 0); ++ } ++ } ++ ++ return 0; ++} ++ ++int handle_vmwrite(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ struct pkvm_host_vcpu *pkvm_hvcpu = to_pkvm_hvcpu(vcpu); ++ struct shadow_vcpu_state *cur_shadow_vcpu = pkvm_hvcpu->current_shadow_vcpu; ++ struct vmcs12 *vmcs12 = (struct vmcs12 *)cur_shadow_vcpu->cached_vmcs12; ++ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); ++ struct x86_exception e; ++ unsigned long field; ++ short offset; ++ gva_t gva; ++ int r, reg; ++ u64 value = 0; ++ ++ if (check_vmx_permission(vcpu)) { ++ if (vmx->nested.current_vmptr == INVALID_GPA) { ++ nested_vmx_result(VMfailInvalid, 0); ++ } else { ++ if (instr_info 
& BIT(10)) { ++ reg = ((instr_info) >> 3) & 0xf; ++ value = vcpu->arch.regs[reg]; ++ } else { ++ if (get_vmx_mem_address(vcpu, vmx->exit_qualification, ++ instr_info, &gva)) ++ return 1; ++ ++ r = read_gva(vcpu, gva, &value, 8, &e); ++ if (r < 0) { ++ /*TODO: handle memory failure exception */ ++ return r; ++ } ++ } ++ ++ reg = ((instr_info) >> 28) & 0xf; ++ field = vcpu->arch.regs[reg]; ++ ++ offset = get_vmcs12_field_offset(field); ++ if (offset < 0) { ++ nested_vmx_result(VMfailInvalid, VMXERR_UNSUPPORTED_VMCS_COMPONENT); ++ return 0; ++ } ++ ++ /*TODO: check vcpu supports "VMWRITE to any supported field in the VMCS"*/ ++ if (vmcs_field_readonly(field)) { ++ nested_vmx_result(VMfailInvalid, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); ++ return 0; ++ } ++ ++ /* ++ * Some Intel CPUs intentionally drop the reserved bits of the AR byte ++ * fields on VMWRITE. Emulate this behavior to ensure consistent KVM ++ * behavior regardless of the underlying hardware, e.g. if an AR_BYTE ++ * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD ++ * from L1 will return a different value than VMREAD from L2 (L1 sees ++ * the stripped down value, L2 sees the full value as stored by KVM). ++ */ ++ if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) ++ value &= 0x1f0ff; ++ ++ if (field == EPT_POINTER) ++ setup_guest_ept(cur_shadow_vcpu, value); ++ ++ vmcs12_write_any(vmcs12, field, offset, value); ++ ++ if (is_emulated_fields(field)) { ++ vmx->nested.dirty_vmcs12 = true; ++ nested_vmx_result(VMsucceed, 0); ++ } else if (is_host_fields(field)){ ++ nested_vmx_result(VMsucceed, 0); ++ } else { ++ pkvm_err("%s: not include emulated fields 0x%lx, please add!\n", ++ __func__, field); ++ nested_vmx_result(VMfailInvalid, VMXERR_UNSUPPORTED_VMCS_COMPONENT); ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++int handle_vmread(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ struct pkvm_host_vcpu *pkvm_hvcpu = to_pkvm_hvcpu(vcpu); ++ struct shadow_vcpu_state *cur_shadow_vcpu = pkvm_hvcpu->current_shadow_vcpu; ++ struct vmcs12 *vmcs12 = (struct vmcs12 *)cur_shadow_vcpu->cached_vmcs12; ++ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); ++ struct x86_exception e; ++ unsigned long field; ++ short offset; ++ gva_t gva = 0; ++ int r, reg; ++ u64 value; ++ ++ if (check_vmx_permission(vcpu)) { ++ if (vmx->nested.current_vmptr == INVALID_GPA) { ++ nested_vmx_result(VMfailInvalid, 0); ++ } else { ++ /* Decode instruction info and find the field to read */ ++ reg = ((instr_info) >> 28) & 0xf; ++ field = vcpu->arch.regs[reg]; ++ ++ offset = get_vmcs12_field_offset(field); ++ if (offset < 0) { ++ nested_vmx_result(VMfailInvalid, VMXERR_UNSUPPORTED_VMCS_COMPONENT); ++ } else { ++ value = vmcs12_read_any(vmcs12, field, offset); ++ if (instr_info & BIT(10)) { ++ reg = ((instr_info) >> 3) & 0xf; ++ vcpu->arch.regs[reg] = value; ++ } else { ++ if (get_vmx_mem_address(vcpu, vmx->exit_qualification, ++ instr_info, &gva)) ++ return 1; ++ ++ r = write_gva(vcpu, gva, &value, 8, &e); ++ if (r < 0) { ++ /*TODO: handle memory failure exception */ ++ return r; ++ } ++ } ++ nested_vmx_result(VMsucceed, 0); ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++int handle_vmresume(struct kvm_vcpu *vcpu) ++{ ++ if (check_vmx_permission(vcpu)) ++ nested_vmx_run(vcpu, false); ++ ++ return 0; ++} ++ ++int handle_vmlaunch(struct kvm_vcpu *vcpu) ++{ ++ if (check_vmx_permission(vcpu)) ++ nested_vmx_run(vcpu, true); ++ ++ return 0; ++} ++ ++int handle_invept(struct kvm_vcpu *vcpu) ++{ ++ struct vmx_capability *vmx_cap = 
&pkvm_hyp->vmx_cap; ++ struct shadow_vcpu_state *shadow_vcpu; ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ u32 vmx_instruction_info, types; ++ unsigned long type; ++ int gpr_index; ++ ++ if (!vmx_has_invept()) ++ /* TODO: inject #UD */ ++ return -EINVAL; ++ ++ if (!check_vmx_permission(vcpu)) ++ return 0; ++ ++ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); ++ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); ++ type = vcpu->arch.regs[gpr_index]; ++ types = (vmx_cap->ept >> VMX_EPT_EXTENT_SHIFT) & 6; ++ ++ if (type >= 32 || !(types & (1 << type))) { ++ nested_vmx_result(VMfailValid, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); ++ return 0; ++ } ++ ++ /* ++ * Shadow EPT TLB is flushed when doing vmclear for a shadow vcpu, so if ++ * this CPU doesn't have a shadow vcpu loaded, there is no shadow ++ * EPT TLB entries left on this CPU, and no need to execute invept. ++ */ ++ shadow_vcpu = to_pkvm_hvcpu(vcpu)->current_shadow_vcpu; ++ if (!shadow_vcpu) ++ goto out; ++ ++ switch (type) { ++ case VMX_EPT_EXTENT_CONTEXT: { ++ struct vmcs12 *vmcs12; ++ struct x86_exception e; ++ gva_t gva; ++ struct { ++ u64 eptp, gpa; ++ } operand; ++ ++ if (get_vmx_mem_address(vcpu, vmx->exit_qualification, ++ vmx_instruction_info, &gva)) ++ /* TODO: handle the decode failure */ ++ return -EINVAL; ++ ++ if (read_gva(vcpu, gva, &operand, sizeof(operand), &e) < 0) ++ /*TODO: handle memory failure exception */ ++ return -EINVAL; ++ ++ /* ++ * For single context invept with a guest eptp, do the invept ++ * if the guest eptp matches the shadow eptp of this ++ * loaded shadow vcpu. ++ */ ++ vmcs12 = (struct vmcs12 *)shadow_vcpu->cached_vmcs12; ++ if (vmcs12->ept_pointer == operand.eptp) ++ pkvm_flush_shadow_ept(&shadow_vcpu->vm->sept_desc); ++ break; ++ } ++ case VMX_EPT_EXTENT_GLOBAL: ++ /* ++ * For global context invept, directly do invept with the ++ * shadow eptp of the current shadow vcpu, as there is no ++ * other shadow ept's TLB entries left on this cpu. 
++ */ ++ pkvm_flush_shadow_ept(&shadow_vcpu->vm->sept_desc); ++ break; ++ default: ++ break; ++ } ++ ++out: ++ nested_vmx_result(VMsucceed, 0); ++ return 0; ++} ++ ++void vpid_sync_context(int vpid) ++{ ++ if (vmx_has_invvpid_single()) ++ vpid_sync_vcpu_single(vpid); ++ else if (vpid != 0) ++ vpid_sync_vcpu_global(); ++} ++ ++void vpid_sync_vcpu_addr(int vpid, gva_t addr) ++{ ++ if (vpid == 0) ++ return; ++ ++ if (vmx_has_invvpid_individual_addr()) ++ __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); ++ else ++ vpid_sync_context(vpid); ++} ++ ++#define VMX_VPID_EXTENT_SUPPORTED_MASK \ ++ (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ ++ VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ ++ VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ ++ VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) ++ ++int handle_invvpid(struct kvm_vcpu *vcpu) ++{ ++ struct vmx_capability *vmx_cap = &pkvm_hyp->vmx_cap; ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ u32 vmx_instruction_info, types; ++ struct x86_exception e; ++ unsigned long type; ++ gva_t gva; ++ int gpr_index; ++ ++ struct { ++ u64 vpid : 16; ++ u64 rsvd : 48; ++ u64 gla; ++ } operand; ++ ++ if (!vmx_has_invvpid()) ++ /* TODO: inject #UD */ ++ return -EINVAL; ++ ++ if (!check_vmx_permission(vcpu)) ++ return 0; ++ ++ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); ++ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); ++ type = vcpu->arch.regs[gpr_index]; ++ types = (vmx_cap->vpid & VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; ++ ++ if (type > VMX_VPID_EXTENT_SINGLE_NON_GLOBAL || !(types & (1 << type))) { ++ nested_vmx_result(VMfailValid, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); ++ return 0; ++ } ++ ++ if (get_vmx_mem_address(vcpu, vmx->exit_qualification, ++ vmx_instruction_info, &gva)) ++ /* TODO: handle the decode failure */ ++ return -EINVAL; ++ ++ if (read_gva(vcpu, gva, &operand, sizeof(operand), &e) < 0) ++ /*TODO: handle memory failure exception */ ++ return -EINVAL; ++ ++ if (operand.rsvd != 0) { ++ nested_vmx_result(VMfailValid, ++ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); ++ return 0; ++ } ++ ++ switch (type) { ++ case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: ++ if (!operand.vpid || ++ !__is_canonical_address(operand.gla, ++ pkvm_virt_addr_bits())) { ++ nested_vmx_result(VMfailValid, ++ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); ++ return 0; ++ } ++ ++ vpid_sync_vcpu_addr(operand.vpid, operand.gla); ++ break; ++ case VMX_VPID_EXTENT_SINGLE_CONTEXT: ++ case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: ++ if (!operand.vpid) { ++ nested_vmx_result(VMfailValid, ++ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); ++ return 0; ++ } ++ ++ vpid_sync_context(operand.vpid); ++ break; ++ case VMX_VPID_EXTENT_ALL_CONTEXT: ++ vpid_sync_context(operand.vpid); ++ break; ++ default: ++ break; ++ } ++ ++ nested_vmx_result(VMsucceed, 0); ++ return 0; ++} ++ ++static bool nested_handle_ept_violation(struct shadow_vcpu_state *shadow_vcpu, ++ u64 l2_gpa, u64 exit_quali) ++{ ++ enum sept_handle_ret ret = pkvm_handle_shadow_ept_violation(shadow_vcpu, ++ l2_gpa, exit_quali); ++ bool handled = false; ++ ++ switch (ret) { ++ case PKVM_INJECT_EPT_MISC: { ++ struct vcpu_vmx *vmx = to_vmx(shadow_vcpu->vcpu); ++ ++ vmx->exit_reason.full = EXIT_REASON_EPT_MISCONFIG; ++ /* ++ * Inject EPT_MISCONFIG vmexit reason if can directly modify ++ * the read-only fields. Otherwise still deliver EPT_VIOLATION ++ * for simplification. 
++ */ ++ if (vmx_has_vmwrite_any_field()) ++ vmcs_write32(VM_EXIT_REASON, EXIT_REASON_EPT_MISCONFIG); ++ break; ++ } ++ case PKVM_HANDLED: ++ handled = true; ++ break; ++ default: ++ break; ++ } ++ ++ if (handled && (vmcs_read32(IDT_VECTORING_INFO_FIELD) & ++ VECTORING_INFO_VALID_MASK)) ++ /* pending interrupt, back to kvm-high to inject */ ++ handled = false; ++ ++ return handled; ++} ++ ++static void pkvm_get_ve_info(struct kvm_vcpu *vcpu) ++{ ++ struct shadow_vcpu_state *shadow_vcpu = to_pkvm_hvcpu(vcpu)->current_shadow_vcpu; ++ struct pkvm_ve_info *ve; ++ ++ ve = &shadow_vcpu->ve_info; ++ ++ kvm_rcx_write(vcpu, ve->exit_reason); ++ kvm_rdx_write(vcpu, ve->exit_qual); ++ kvm_r8_write(vcpu, ve->gla); ++ kvm_r9_write(vcpu, ve->gpa); ++ ++ /* ++ * When virtualization exception happens, the valid filed in #VE ++ * information will be set to 0xffffffff. We need to clear it to 0 when ++ * protected VM handles this #VE, so another #VE can continue to happen. ++ */ ++ ve->valid = 0; ++} ++ ++static bool nested_handle_vmcall(struct kvm_vcpu *vcpu) ++{ ++ u64 nr, a0, a1, a2, a3; ++ struct shadow_vcpu_state *shadow_vcpu = to_pkvm_hvcpu(vcpu)->current_shadow_vcpu; ++ struct pkvm_pgtable *pgstate_pgt = &shadow_vcpu->vm->pgstate_pgt; ++ bool handled = false; ++ int ret = 0; ++ ++ /* All normal guest's vmcall should be handled by KVM. */ ++ if (!shadow_vcpu_is_protected(shadow_vcpu)) ++ return false; ++ ++ nr = vcpu->arch.regs[VCPU_REGS_RAX]; ++ a0 = vcpu->arch.regs[VCPU_REGS_RBX]; ++ a1 = vcpu->arch.regs[VCPU_REGS_RCX]; ++ a2 = vcpu->arch.regs[VCPU_REGS_RDX]; ++ a3 = vcpu->arch.regs[VCPU_REGS_RSI]; ++ ++ switch (nr) { ++ case PKVM_GHC_SHARE_MEM: ++ ret = __pkvm_guest_share_host(pgstate_pgt, a0, a1); ++ handled = true; ++ break; ++ case PKVM_GHC_UNSHARE_MEM: ++ ret = __pkvm_guest_unshare_host(pgstate_pgt, a0, a1); ++ handled = true; ++ break; ++ case PKVM_GHC_GET_VE_INFO: ++ pkvm_get_ve_info(vcpu); ++ handled = true; ++ break; ++ default: ++ break; ++ } ++ ++ if (handled) ++ vcpu->arch.regs[VCPU_REGS_RAX] = ret; ++ ++ return handled; ++} ++ ++static bool nested_handle_cpuid(struct kvm_vcpu *vcpu) ++{ ++ struct shadow_vcpu_state *shadow_vcpu = to_pkvm_hvcpu(vcpu)->current_shadow_vcpu; ++ u32 leaf; ++ ++ if (!shadow_vcpu_is_protected(shadow_vcpu)) ++ return false; ++ ++ leaf = vcpu->arch.regs[VCPU_REGS_RAX]; ++ ++ /* ++ * Reuse the KVM_CPUID_SIGNATURE, which has been used by KVM. By ++ * intercept the process of detecting hypervisor, the protected vm will ++ * detect PKVM hypervisor instead of KVM. 
++ */ ++ if (leaf == KVM_CPUID_SIGNATURE) { ++ const u32 *sigptr = (const u32 *)"PKVMPKVMPKVM"; ++ vcpu->arch.regs[VCPU_REGS_RBX] = sigptr[0]; ++ vcpu->arch.regs[VCPU_REGS_RCX] = sigptr[1]; ++ vcpu->arch.regs[VCPU_REGS_RDX] = sigptr[2]; ++ return true; ++ } ++ ++ return false; ++} ++ ++int nested_vmexit(struct kvm_vcpu *vcpu, bool *skip_instruction) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ struct pkvm_host_vcpu *pkvm_hvcpu = to_pkvm_hvcpu(vcpu); ++ struct shadow_vcpu_state *cur_shadow_vcpu = pkvm_hvcpu->current_shadow_vcpu; ++ struct vmcs *vmcs02 = (struct vmcs *)cur_shadow_vcpu->vmcs02; ++ struct vmcs12 *vmcs12 = (struct vmcs12 *)cur_shadow_vcpu->cached_vmcs12; ++ ++ switch (vmx->exit_reason.full) { ++ case EXIT_REASON_EPT_VIOLATION: ++ /* EPT violation can be handled by pkvm, no need back to kvm-high */ ++ if (nested_handle_ept_violation(cur_shadow_vcpu, ++ vmcs_read64(GUEST_PHYSICAL_ADDRESS), ++ vmx->exit_qualification)) ++ return 0; ++ break; ++ case EXIT_REASON_VMCALL: ++ if (nested_handle_vmcall(vcpu)) { ++ *skip_instruction = true; ++ return 0; ++ } ++ break; ++ case EXIT_REASON_INIT_SIGNAL: ++ /* ++ * INIT vmexit reason is unsupported by KVM in primary VM and ++ * it is reused by pkvm to kick vcpu out of non-root. ++ * When this vmexit reason happens, no need back to primary VM. ++ */ ++ return 0; ++ case EXIT_REASON_CPUID: ++ if (nested_handle_cpuid(vcpu)) { ++ *skip_instruction = true; ++ return 0; ++ } ++ break; ++ default: ++ break; ++ } ++ ++ /* clear guest mode */ ++ vcpu->arch.hflags &= ~HF_GUEST_MASK; ++ ++ /* L1 host wishes to keep use MSRs from L2 guest after its VMExit? ++ * save vmcs02 guest state for later vmcs01 guest state preparation ++ */ ++ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) != VM_EXIT_LOAD_IA32_EFER) ++ vmcs12->host_ia32_efer = vmcs_read64(GUEST_IA32_EFER); ++ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) != VM_EXIT_LOAD_IA32_PAT) ++ vmcs12->host_ia32_pat = vmcs_read64(GUEST_IA32_PAT); ++ ++ if (!vmcs12->launch_state) ++ vmcs12->launch_state = 1; ++ ++ /* switch to vmcs01 */ ++ vmcs_clear_track(vmx, vmcs02); ++ set_shadow_indicator(vmcs02); ++ vmcs_load_track(vmx, vmx->loaded_vmcs->vmcs); ++ ++ prepare_vmcs01_guest_state(vmx, vmcs12); ++ ++ return 0; ++} ++ ++void nested_flush_shadow_ept(struct kvm_vcpu *vcpu) ++{ ++ struct pkvm_host_vcpu *pkvm_hvcpu = to_pkvm_hvcpu(vcpu); ++ struct shadow_vcpu_state *cur_shadow_vcpu = pkvm_hvcpu->current_shadow_vcpu; ++ ++ /* ++ * If the shadow vcpu is released from this CPU, no need to ++ * worry about its TLB as it is already flushed during release. ++ */ ++ if (!cur_shadow_vcpu) ++ return; ++ ++ /* ++ * And probably the shadow EPT is not the one wanting to be flushed ++ * if another shadow vcpu is loaded after kick, and cannot tell ++ * this case without additional hints. So always do the shadow ++ * ept flushing. 
++ */ ++ pkvm_flush_shadow_ept(&cur_shadow_vcpu->vm->sept_desc); ++} ++ ++void nested_invalidate_shadow_ept(int shadow_vm_handle, u64 start_gpa, u64 size) ++{ ++ struct pkvm_shadow_vm *vm = get_shadow_vm(shadow_vm_handle); ++ ++ if (!vm) ++ return; ++ ++ if (!start_gpa && !size) ++ /* ++ * With start_gpa = 0 & size = 0, do invalidation ++ * for the entire shadow EPT ++ */ ++ pkvm_invalidate_shadow_ept(&vm->sept_desc); ++ else ++ pkvm_invalidate_shadow_ept_with_range(&vm->sept_desc, ++ start_gpa, size); ++ ++ put_shadow_vm(shadow_vm_handle); ++} ++ ++void pkvm_init_nest(void) ++{ ++ init_vmcs_shadow_fields(); ++ init_emulated_vmcs_fields(); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/nested.h b/arch/x86/kvm/vmx/pkvm/hyp/nested.h +new file mode 100644 +index 000000000000..c539026862c2 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/nested.h +@@ -0,0 +1,32 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef __PKVM_NESTED_H ++#define __PKVM_NESTED_H ++ ++int handle_vmxon(struct kvm_vcpu *vcpu); ++int handle_vmxoff(struct kvm_vcpu *vcpu); ++int handle_vmptrld(struct kvm_vcpu *vcpu); ++int handle_vmclear(struct kvm_vcpu *vcpu); ++int handle_vmwrite(struct kvm_vcpu *vcpu); ++int handle_vmread(struct kvm_vcpu *vcpu); ++int handle_vmresume(struct kvm_vcpu *vcpu); ++int handle_vmlaunch(struct kvm_vcpu *vcpu); ++int handle_invept(struct kvm_vcpu *vcpu); ++int handle_invvpid(struct kvm_vcpu *vcpu); ++int nested_vmexit(struct kvm_vcpu *vcpu, bool *skip_instruction); ++void nested_flush_shadow_ept(struct kvm_vcpu *vcpu); ++void nested_invalidate_shadow_ept(int shadow_handle, u64 start_gpa, u64 size); ++void pkvm_init_nest(void); ++ ++#define LIST_OF_VMX_MSRS \ ++ MSR_IA32_VMX_MISC, \ ++ MSR_IA32_VMX_PROCBASED_CTLS2, \ ++ MSR_IA32_VMX_EPT_VPID_CAP, \ ++ MSR_IA32_VMX_VMFUNC ++ ++bool is_vmx_msr(unsigned long msr); ++int read_vmx_msr(struct kvm_vcpu *vcpu, unsigned long msr, u64 *val); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/pci.c b/arch/x86/kvm/vmx/pkvm/hyp/pci.c +new file mode 100644 +index 000000000000..222f009e669c +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/pci.c +@@ -0,0 +1,350 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* Copyright(c) 2023 Intel Corporation. 
*/ ++#include ++#include ++ ++#include "pkvm_spinlock.h" ++#include "io.h" ++#include "io_emulate.h" ++#include "mmu.h" ++#include "ptdev.h" ++#include "pci.h" ++ ++static union pci_cfg_addr_reg host_vpci_cfg_addr; ++static pkvm_spinlock_t pci_cfg_lock = { __ARCH_PKVM_SPINLOCK_UNLOCKED }; ++static pkvm_spinlock_t host_vpci_cfg_lock = { __ARCH_PKVM_SPINLOCK_UNLOCKED }; ++ ++static int pci_cfg_space_read(union pci_cfg_addr_reg *cfg_addr, ++ u32 offset, int size, unsigned long *value) ++{ ++ pkvm_spin_lock(&pci_cfg_lock); ++ ++ pkvm_pio_write(PCI_CFG_ADDR, 4, cfg_addr->value); ++ pkvm_pio_read(PCI_CFG_DATA + offset, size, value); ++ ++ pkvm_spin_unlock(&pci_cfg_lock); ++ ++ return 0; ++} ++ ++static int pci_cfg_space_write(union pci_cfg_addr_reg *cfg_addr, ++ u32 offset, int size, unsigned long value) ++{ ++ pkvm_spin_lock(&pci_cfg_lock); ++ ++ pkvm_pio_write(PCI_CFG_ADDR, 4, cfg_addr->value); ++ pkvm_pio_write(PCI_CFG_DATA + offset, size, value); ++ ++ pkvm_spin_unlock(&pci_cfg_lock); ++ ++ return 0; ++} ++ ++static int pci_mmcfg_read(u64 address, int size, unsigned long *value) ++{ ++ pkvm_mmio_read(address, size, value); ++ return 0; ++} ++ ++static int pci_mmcfg_write(u64 address, int size, unsigned long value) ++{ ++ pkvm_mmio_write(address, size, value); ++ return 0; ++} ++ ++unsigned long pkvm_pci_cfg_space_read(u32 bdf, u32 offset, int size) ++{ ++ union pci_cfg_addr_reg reg; ++ unsigned long value = 0; ++ ++ reg.enable = 1; ++ reg.bdf = bdf; ++ reg.reg = offset & (~0x3); ++ ++ pci_cfg_space_read(®, offset & 0x3, size, &value); ++ ++ return value; ++} ++ ++void pkvm_pci_cfg_space_write(u32 bdf, u32 offset, int size, unsigned long value) ++{ ++ union pci_cfg_addr_reg reg; ++ ++ reg.enable = 1; ++ reg.bdf = bdf; ++ reg.reg = offset & (~0x3); ++ ++ pci_cfg_space_write(®, offset & 0x3, size, value); ++} ++ ++static bool host_vpci_cfg_data_allow_write(struct pkvm_ptdev *ptdev, u64 offset, int size, u32 value) ++{ ++ int index; ++ ++ if (!ptdev_attached_to_vm(ptdev)) ++ return true; ++ ++ if (offset >= 0x10 && offset < 0x28) { ++ index = (offset-0x10) >> 2; ++ /* Allow only aligned BAR write with the cached value*/ ++ return (offset & 0x3) == 0 && size == 4 && value == ptdev->bars[index]; ++ } ++ ++ return true; ++} ++ ++static int host_vpci_cfg_addr_read(struct kvm_vcpu *vcpu, struct pkvm_pio_req *req) ++{ ++ u32 value = host_vpci_cfg_addr.value; ++ int ret = 0; ++ ++ pkvm_spin_lock(&host_vpci_cfg_lock); ++ ++ switch (req->size) { ++ case 1: ++ *(u8 *)req->value = (u8)value; ++ break; ++ case 2: ++ *(u16 *)req->value = (u16)value; ++ break; ++ case 4: ++ *(u32 *)req->value = value; ++ break; ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ pkvm_spin_unlock(&host_vpci_cfg_lock); ++ ++ return ret; ++} ++ ++static int host_vpci_cfg_addr_write(struct kvm_vcpu *vcpu, struct pkvm_pio_req *req) ++{ ++ u32 *value = &host_vpci_cfg_addr.value; ++ int ret = 0; ++ ++ pkvm_spin_lock(&host_vpci_cfg_lock); ++ ++ switch (req->size) { ++ case 1: ++ *(u8 *)value = (u8)*req->value; ++ break; ++ case 2: ++ *(u16 *)value = (u16)*req->value; ++ break; ++ case 4: ++ *value = (u32)*req->value; ++ break; ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ pkvm_spin_unlock(&host_vpci_cfg_lock); ++ ++ return ret; ++} ++ ++static int host_vpci_cfg_data_audit_write(struct pkvm_pio_req *req) ++{ ++ struct pkvm_ptdev *ptdev; ++ u64 offset = host_vpci_cfg_addr.reg; ++ u32 bdf = host_vpci_cfg_addr.bdf; ++ int ret; ++ ++ ptdev = pkvm_get_ptdev(bdf, 0); ++ ++ if (ptdev) { ++ pkvm_spin_lock(&ptdev->lock); ++ if 
(!host_vpci_cfg_data_allow_write(ptdev, offset + req->port - PCI_CFG_DATA, ++ req->size, *req->value)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ } ++ ++ ret = pci_cfg_space_write(&host_vpci_cfg_addr, req->port - PCI_CFG_DATA, req->size, *req->value); ++ ++out: ++ if (ptdev) { ++ pkvm_spin_unlock(&ptdev->lock); ++ pkvm_put_ptdev(ptdev); ++ } ++ ++ return ret; ++} ++ ++static int host_vpci_cfg_data_read(struct kvm_vcpu *vcpu, struct pkvm_pio_req *req) ++{ ++ int ret; ++ ++ pkvm_spin_lock(&host_vpci_cfg_lock); ++ ++ if (host_vpci_cfg_addr.enable) ++ ret = pci_cfg_space_read(&host_vpci_cfg_addr, req->port - PCI_CFG_DATA, req->size, req->value); ++ else ++ ret = -EINVAL; ++ ++ pkvm_spin_unlock(&host_vpci_cfg_lock); ++ ++ return ret; ++} ++ ++static int host_vpci_cfg_data_write(struct kvm_vcpu *vcpu, struct pkvm_pio_req *req) ++{ ++ int ret; ++ ++ pkvm_spin_lock(&host_vpci_cfg_lock); ++ ++ if (host_vpci_cfg_addr.enable) ++ ret = host_vpci_cfg_data_audit_write(req); ++ else ++ ret = -EINVAL; ++ ++ pkvm_spin_unlock(&host_vpci_cfg_lock); ++ ++ return ret; ++} ++ ++static int host_vpci_mmcfg_get_bdf_offset(u64 address, u32 *bdf, u64 *offset) ++{ ++ int i; ++ struct pkvm_pci_info *pci_info; ++ struct pci_mmcfg_region *region; ++ ++ pci_info = &pkvm_hyp->host_vm.pci_info; ++ for (i = 0; i < pci_info->mmcfg_table_size; i++) { ++ region = &pci_info->mmcfg_table[i]; ++ if (address >= region->res.start && address <= region->res.end) { ++ *bdf = (address - region->address) >> 12; ++ *offset = address & 0xfff; ++ return 0; ++ } ++ } ++ ++ return -EINVAL; ++} ++ ++int host_vpci_mmcfg_read(struct kvm_vcpu *vcpu, struct pkvm_mmio_req *req) ++{ ++ u64 address = (u64)host_mmio2hva(req->address); ++ ++ return pci_mmcfg_read(address, req->size, req->value); ++} ++ ++int host_vpci_mmcfg_write(struct kvm_vcpu *vcpu, struct pkvm_mmio_req *req) ++{ ++ struct pkvm_ptdev *ptdev; ++ u64 offset, address = (u64)host_mmio2hva(req->address); ++ u32 bdf; ++ int ret; ++ ++ if (host_vpci_mmcfg_get_bdf_offset(req->address, &bdf, &offset)) ++ return -EINVAL; ++ ++ ptdev = pkvm_get_ptdev(bdf, 0); ++ ++ if (ptdev) { ++ pkvm_spin_lock(&ptdev->lock); ++ if (!host_vpci_cfg_data_allow_write(ptdev, offset, req->size, *req->value)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ } ++ ++ ret = pci_mmcfg_write(address, req->size, *req->value); ++ ++out: ++ if (ptdev) { ++ pkvm_spin_unlock(&ptdev->lock); ++ pkvm_put_ptdev(ptdev); ++ } ++ ++ return ret; ++} ++ ++int init_pci(struct pkvm_hyp *pkvm) ++{ ++ int ret; ++ ++ ret = register_host_pio_handler(&pkvm->host_vm, ++ PCI_CFG_ADDR, IO_SIZE_4, host_vpci_cfg_addr_read, host_vpci_cfg_addr_write); ++ if (ret) ++ goto out; ++ ++ /* ++ * Kernel access the PCI config space data port in an unaligned way. So here we ++ * treat the data port as four consecutive ports and register four handlers for it. ++ * All registered ports and access width below are valid. 
++ */ ++ ret = register_host_pio_handler(&pkvm->host_vm, ++ PCI_CFG_DATA, IO_SIZE_FULL, host_vpci_cfg_data_read, host_vpci_cfg_data_write); ++ if (ret) ++ goto out; ++ ++ ret = register_host_pio_handler(&pkvm->host_vm, ++ PCI_CFG_DATA + 1, IO_SIZE_1, host_vpci_cfg_data_read, host_vpci_cfg_data_write); ++ if (ret) ++ goto out; ++ ++ ret = register_host_pio_handler(&pkvm->host_vm, ++ PCI_CFG_DATA + 2, IO_SIZE_1 | IO_SIZE_2, host_vpci_cfg_data_read, host_vpci_cfg_data_write); ++ if (ret) ++ goto out; ++ ++ ret = register_host_pio_handler(&pkvm->host_vm, ++ PCI_CFG_DATA + 3, IO_SIZE_1, host_vpci_cfg_data_read, host_vpci_cfg_data_write); ++ if (ret) ++ goto out; ++ ++ return 0; ++ ++out: ++ pkvm_err("pkvm: init pci failed"); ++ return ret; ++} ++ ++static int pkvm_mmu_map_mmcfg_region(struct pkvm_pci_info *pci_info) ++{ ++ struct pci_mmcfg_region *region; ++ int i, ret; ++ u64 start, end; ++ ++ for (i = 0; i < pci_info->mmcfg_table_size; i++) { ++ region = &pci_info->mmcfg_table[i]; ++ start = region->res.start; ++ end = region->res.end; ++ ret = pkvm_mmu_map((u64)host_mmio2hva(start), start, ++ end - start + 1, 0, (u64)pgprot_val(PAGE_KERNEL_IO)); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int init_finalize_pci(struct pkvm_pci_info *pci_info) ++{ ++ struct pci_mmcfg_region *region; ++ unsigned long start, end; ++ int ret, i; ++ ++ ret = pkvm_mmu_map_mmcfg_region(pci_info); ++ if (ret) ++ return ret; ++ ++ for (i = 0; i < pci_info->mmcfg_table_size; i++) { ++ region = &pci_info->mmcfg_table[i]; ++ start = region->res.start; ++ end = region->res.end; ++ ++ ret = register_host_mmio_handler(start, end, ++ host_vpci_mmcfg_read, host_vpci_mmcfg_write); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/pci.h b/arch/x86/kvm/vmx/pkvm/hyp/pci.h +new file mode 100644 +index 000000000000..22d57eff24df +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/pci.h +@@ -0,0 +1,24 @@ ++/* SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2023 Intel Corporation ++ */ ++#ifndef _PKVM_PCI_H_ ++#define _PKVM_PCI_H_ ++ ++#define PCI_CFG_ADDR 0xcf8 ++#define PCI_CFG_DATA 0xcfc ++ ++union pci_cfg_addr_reg { ++ u32 value; ++ struct { ++ u32 reg : 8; ++ u32 bdf : 16; ++ u32 resv : 7; ++ u32 enable : 1; ++ }; ++}; ++ ++unsigned long pkvm_pci_cfg_space_read(u32 bdf, u32 offset, int size); ++void pkvm_pci_cfg_space_write(u32 bdf, u32 offset, int size, unsigned long value); ++ ++int init_finalize_pci(struct pkvm_pci_info *pci); ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/pgtable.c b/arch/x86/kvm/vmx/pkvm/hyp/pgtable.c +new file mode 100644 +index 000000000000..463b053d7894 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/pgtable.c +@@ -0,0 +1,801 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++ ++#include "pgtable.h" ++#include "memory.h" ++#include "mem_protect.h" ++#include "debug.h" ++#include "bug.h" ++ ++struct pgt_walk_data { ++ struct pkvm_pgtable *pgt; ++ struct pgt_flush_data flush_data; ++ unsigned long vaddr; ++ unsigned long vaddr_end; ++ struct pkvm_pgtable_walker *walker; ++}; ++ ++struct pkvm_pgtable_lookup_data { ++ unsigned long vaddr; ++ unsigned long phys; ++ u64 prot; ++ int level; ++}; ++ ++static bool pkvm_phys_is_valid(u64 phys) ++{ ++ return phys != INVALID_ADDR; ++} ++ ++static bool leaf_mapping_valid(struct pkvm_pgtable_ops *pgt_ops, ++ unsigned long vaddr, ++ unsigned long vaddr_end, ++ int pgsz_mask, ++ int level) ++{ ++ unsigned long page_size = pgt_ops->pgt_level_to_size(level); 
++ ++ if (!((1 << level) & pgsz_mask)) ++ return false; ++ ++ if (!IS_ALIGNED(vaddr, page_size)) ++ return false; ++ ++ if (page_size > (vaddr_end - vaddr)) ++ return false; ++ ++ return true; ++} ++ ++static bool leaf_mapping_allowed(struct pkvm_pgtable_ops *pgt_ops, ++ unsigned long vaddr, ++ unsigned long vaddr_end, ++ unsigned long phys, ++ int pgsz_mask, ++ int level) ++{ ++ unsigned long page_size = pgt_ops->pgt_level_to_size(level); ++ ++ if (pkvm_phys_is_valid(phys) && !IS_ALIGNED(phys, page_size)) ++ return false; ++ ++ return leaf_mapping_valid(pgt_ops, vaddr, vaddr_end, pgsz_mask, level); ++} ++ ++static void *pgtable_alloc_page(struct pkvm_mm_ops *mm_ops) ++{ ++ void *page = NULL; ++ ++ if (mm_ops->zalloc_page) ++ page = mm_ops->zalloc_page(); ++ ++ if (page && mm_ops->flush_cache) ++ mm_ops->flush_cache(page, PAGE_SIZE); ++ ++ return page; ++} ++ ++static void pgtable_set_entry(struct pkvm_pgtable_ops *pgt_ops, ++ struct pkvm_mm_ops *mm_ops, ++ void *ptep, u64 pte) ++{ ++ pgt_ops->pgt_set_entry(ptep, pte); ++ ++ if (mm_ops->flush_cache) ++ mm_ops->flush_cache(ptep, sizeof(u64)); ++} ++ ++static void pgtable_split(struct pkvm_pgtable_ops *pgt_ops, ++ struct pkvm_mm_ops *mm_ops, ++ unsigned long vaddr, unsigned long phys, ++ unsigned long size, void *ptep, ++ int level, u64 prot) ++{ ++ unsigned long phys_end = phys + size; ++ int level_size = pgt_ops->pgt_level_to_size(level); ++ int entry_size = PAGE_SIZE / pgt_ops->pgt_level_to_entries(level); ++ int i = 0; ++ ++ if (level > PG_LEVEL_4K) ++ pgt_ops->pgt_entry_mkhuge(&prot); ++ ++ for (i = 0; phys < phys_end; phys += level_size, i++) { ++ pgtable_set_entry(pgt_ops, mm_ops,(ptep + i * entry_size), phys | prot); ++ mm_ops->get_page(ptep); ++ } ++} ++ ++int pgtable_map_leaf(struct pkvm_pgtable *pgt, ++ unsigned long vaddr, ++ int level, void *ptep, ++ struct pgt_flush_data *flush_data, ++ struct pkvm_pgtable_map_data *data) ++{ ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ struct pkvm_mm_ops *mm_ops = pgt->mm_ops; ++ u64 old = *(u64 *)ptep, new; ++ ++ if (pkvm_phys_is_valid(data->phys)) { ++ new = data->phys | data->prot; ++ if (level != PG_LEVEL_4K) ++ pgt_ops->pgt_entry_mkhuge(&new); ++ } else { ++ new = data->annotation; ++ } ++ ++ if (pgt_ops->pgt_entry_mapped(ptep)) { ++ /* if just modify the page state, do set_pte directly */ ++ if (!((old ^ new) & ~PKVM_PAGE_STATE_PROT_MASK)) ++ goto set_pte; ++ ++ if (pgt_ops->pgt_entry_present(ptep)) { ++ pgtable_set_entry(pgt_ops, mm_ops, ptep, 0); ++ flush_data->flushtlb |= true; ++ } ++ mm_ops->put_page(ptep); ++ } ++ ++ if (pgt_ops->pgt_entry_mapped(&new)) ++ mm_ops->get_page(ptep); ++ ++set_pte: ++ pgtable_set_entry(pgt_ops, mm_ops, ptep, new); ++ if (pkvm_phys_is_valid(data->phys)) ++ data->phys += page_level_size(level); ++ ++ return 0; ++} ++ ++static int pgtable_map_try_leaf(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long vaddr_end, int level, void *ptep, ++ struct pgt_flush_data *flush_data, ++ struct pkvm_pgtable_map_data *data) ++{ ++ if (!leaf_mapping_allowed(pgt->pgt_ops, vaddr, vaddr_end, ++ data->phys, data->pgsz_mask, level)) { ++ /* The 4K page shall be able to map, otherwise return err */ ++ return (level == PG_LEVEL_4K ? 
-EINVAL: -E2BIG); ++ } ++ ++ if (data->map_leaf_override) ++ return data->map_leaf_override(pgt, vaddr, level, ptep, flush_data, data); ++ else ++ return pgtable_map_leaf(pgt, vaddr, level, ptep, flush_data, data); ++} ++ ++static int pgtable_map_walk_leaf(struct pkvm_pgtable *pgt, ++ unsigned long vaddr, unsigned long vaddr_end, ++ int level, void *ptep, unsigned long flags, ++ struct pgt_flush_data *flush_data, ++ struct pkvm_pgtable_map_data *data) ++{ ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ struct pkvm_mm_ops *mm_ops = pgt->mm_ops; ++ unsigned long size = page_level_size(level); ++ void *page; ++ int ret; ++ ++ /* First try to create leaf page mapping on current level */ ++ ret = pgtable_map_try_leaf(pgt, vaddr, vaddr_end, level, ptep, flush_data, data); ++ if (ret != -E2BIG) ++ return ret; ++ ++ /* ++ * Be here is because the mapping needs to be done on smaller(or level-1) ++ * page size. We need to allocate a table page for the smaller(level-1) ++ * page mapping. And for current level, if the huge page mapping is already ++ * present, we need further split it. ++ */ ++ page = pgtable_alloc_page(mm_ops); ++ if (!page) ++ return -ENOMEM; ++ ++ if (pgt_ops->pgt_entry_huge(ptep)) { ++ u64 prot = pgt_ops->pgt_entry_to_prot(ptep); ++ ++ prot = pkvm_mkstate(prot, pkvm_getstate(*(u64 *)ptep)); ++ ++ /* ++ * Split the large mapping and reuse the ++ * large mapping's prot. The translation ++ * doesn't have a change, so no need to ++ * flush tlb. ++ */ ++ mm_ops->put_page(ptep); ++ pgtable_split(pgt_ops, mm_ops, ALIGN_DOWN(vaddr, size), ++ pgt_ops->pgt_entry_to_phys(ptep), ++ size, page, level - 1, prot); ++ } ++ ++ mm_ops->get_page(ptep); ++ pgtable_set_entry(pgt_ops, mm_ops, ptep, pgt->table_prot | mm_ops->virt_to_phys(page)); ++ ++ return 0; ++} ++ ++/* ++ *TODO: support merging small entries to a large one. ++ */ ++static int pgtable_map_cb(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long vaddr_end, int level, void *ptep, ++ unsigned long flags, struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_pgtable_map_data *data = arg; ++ ++ switch(flags) { ++ case PKVM_PGTABLE_WALK_LEAF: ++ return pgtable_map_walk_leaf(pgt, vaddr, vaddr_end, level, ++ ptep, flags, flush_data, data); ++ case PKVM_PGTABLE_WALK_TABLE_PRE: ++ case PKVM_PGTABLE_WALK_TABLE_POST: ++ break; ++ } ++ ++ return -EINVAL; ++} ++ ++/* ++ * put_page_to_free_list(): the page added to the freelist should not be used ++ * by any one as this page will be used as a node linked to the freelist. ++ */ ++static inline void put_page_to_freelist(void *page, struct list_head *head) ++{ ++ struct list_head *node = page; ++ ++ list_add_tail(node, head); ++} ++ ++/* ++ * get_page_to_free_list(): the page got from the freelist is valid to be used ++ * again. 
++ */ ++static inline void *get_page_from_freelist(struct list_head *head) ++{ ++ struct list_head *node = head->next; ++ ++ list_del(node); ++ memset(node, 0, sizeof(struct list_head)); ++ ++ return (void *)node; ++} ++ ++static int pgtable_unmap_leaf(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ int level, void *ptep, struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_pgtable_unmap_data *data = arg; ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ struct pkvm_mm_ops *mm_ops = pgt->mm_ops; ++ unsigned long size = page_level_size(level); ++ ++ if (data->phys != INVALID_ADDR) { ++ unsigned long phys = pgt_ops->pgt_entry_to_phys(ptep); ++ ++ PKVM_ASSERT(phys == data->phys); ++ } ++ ++ if (pgt_ops->pgt_entry_present(ptep)) ++ flush_data->flushtlb |= true; ++ ++ pgtable_set_entry(pgt_ops, mm_ops, ptep, pgt_ops->default_prot); ++ mm_ops->put_page(ptep); ++ ++ if (data->phys != INVALID_ADDR) { ++ data->phys = ALIGN_DOWN(data->phys, size); ++ data->phys += size; ++ } ++ ++ return 0; ++} ++ ++static void pgtable_free_child(struct pkvm_pgtable *pgt, void *ptep, ++ struct pgt_flush_data *flush_data) ++{ ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ struct pkvm_mm_ops *mm_ops = pgt->mm_ops; ++ void *child_ptep; ++ ++ /* ++ * Check the child pte page refcount. Put the child pte page if ++ * no one else is using it. ++ */ ++ child_ptep = mm_ops->phys_to_virt(pgt_ops->pgt_entry_to_phys(ptep)); ++ if (mm_ops->page_count(child_ptep) == 1) { ++ pgtable_set_entry(pgt_ops, mm_ops, ptep, pgt_ops->default_prot); ++ mm_ops->put_page(ptep); ++ put_page_to_freelist(child_ptep, &flush_data->free_list); ++ } ++} ++ ++static int pgtable_unmap_cb(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long vaddr_end, int level, void *ptep, ++ unsigned long flags, struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_pgtable_unmap_data *data = arg; ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ struct pkvm_mm_ops *mm_ops = pgt->mm_ops; ++ unsigned long size = page_level_size(level); ++ ++ if (!pgt_ops->pgt_entry_mapped(ptep)) ++ /* Nothing to do if the entry is not mapped */ ++ return 0; ++ ++ /* ++ * Unmap the page if the target address range belongs a ++ * - 4K PTE entry ++ * - huge page and don't need to split it ++ * - a full huge page ++ */ ++ if (level == PG_LEVEL_4K || (pgt_ops->pgt_entry_huge(ptep) && ++ (!data->split_huge_page || leaf_mapping_valid(pgt_ops, vaddr, ++ vaddr_end, 1 << level, level)))) { ++ ++ if (data->unmap_leaf_override) { ++ vaddr = ALIGN_DOWN(vaddr, pgt_ops->pgt_level_to_size(level)); ++ return data->unmap_leaf_override(pgt, vaddr, level, ptep, ++ flush_data, data); ++ } else ++ return pgtable_unmap_leaf(pgt, vaddr, level, ptep, ++ flush_data, data); ++ } ++ ++ if (pgt_ops->pgt_entry_huge(ptep)) { ++ /* ++ * if it is huge pte, split and goto next level. ++ */ ++ u64 prot = pgt_ops->pgt_entry_to_prot(ptep); ++ void *page = pgtable_alloc_page(mm_ops); ++ ++ if (!page) ++ return -ENOMEM; ++ ++ prot = pkvm_mkstate(prot, pkvm_getstate(*(u64 *)ptep)); ++ /* ++ * Split the large mapping and reuse the ++ * large mapping's prot. The translation ++ * doesn't have a change, so no need to ++ * flush tlb. 
++ */ ++ pgtable_split(pgt_ops, mm_ops, ALIGN_DOWN(vaddr, size), ++ pgt_ops->pgt_entry_to_phys(ptep), ++ size, page, level - 1, prot); ++ pgtable_set_entry(pgt_ops, mm_ops, ptep, ++ pgt->table_prot | mm_ops->virt_to_phys(page)); ++ return 0; ++ } ++ ++ /* if not huge entry then means it is table entry */ ++ pgtable_free_child(pgt, ptep, flush_data); ++ return 0; ++} ++ ++static int pgtable_lookup_cb(struct pkvm_pgtable *pgt, ++ unsigned long aligned_vaddr, ++ unsigned long aligned_vaddr_end, ++ int level, ++ void *ptep, ++ unsigned long flags, ++ struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_pgtable_lookup_data *data = arg; ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ u64 pte = atomic64_read((atomic64_t *)ptep); ++ ++ data->phys = INVALID_ADDR; ++ data->prot = 0; ++ data->level = level; ++ ++ /* ++ * This cb shall only be called for leaf. If now it is not a leaf ++ * that means the pte is changed by others, and we shall re-walk the pgtable ++ */ ++ if (unlikely(!pgt_ops->pgt_entry_is_leaf(&pte, level))) ++ return -EAGAIN; ++ ++ if (pgt_ops->pgt_entry_present(&pte)) { ++ unsigned long offset = ++ data->vaddr & ~pgt_ops->pgt_level_page_mask(level); ++ ++ data->phys = pgt_ops->pgt_entry_to_phys(&pte) + offset; ++ data->prot = pgt_ops->pgt_entry_to_prot(&pte); ++ } ++ ++ return PGTABLE_WALK_DONE; ++} ++ ++static int pgtable_free_leaf(struct pkvm_pgtable *pgt, ++ struct pgt_flush_data *flush_data, ++ void *ptep) ++{ ++ if (pgt->pgt_ops->pgt_entry_mapped(ptep)) { ++ if (pgt->pgt_ops->pgt_entry_present(ptep)) ++ flush_data->flushtlb |= true; ++ pgt->mm_ops->put_page(ptep); ++ } ++ ++ return 0; ++} ++ ++static int pgtable_free_cb(struct pkvm_pgtable *pgt, ++ unsigned long vaddr, ++ unsigned long vaddr_end, ++ int level, ++ void *ptep, ++ unsigned long flags, ++ struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_pgtable_free_data *data = arg; ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ ++ if (pgt_ops->pgt_entry_is_leaf(ptep, level)) { ++ if (data->free_leaf_override) ++ return data->free_leaf_override(pgt, vaddr, level, ptep, ++ flush_data, data); ++ else ++ return pgtable_free_leaf(pgt, flush_data, ptep); ++ } ++ ++ /* Free the child page */ ++ pgtable_free_child(pgt, ptep, flush_data); ++ return 0; ++} ++ ++static int _pgtable_walk(struct pgt_walk_data *data, void *ptep, int level); ++static int pgtable_visit(struct pgt_walk_data *data, void *ptep, int level) ++{ ++ struct pkvm_pgtable_ops *pgt_ops = data->pgt->pgt_ops; ++ struct pkvm_mm_ops *mm_ops = data->pgt->mm_ops; ++ struct pkvm_pgtable_walker *walker = data->walker; ++ unsigned long flags = walker->flags; ++ bool leaf = pgt_ops->pgt_entry_is_leaf(ptep, level); ++ void *child_ptep; ++ int ret = 0; ++ ++ if (!leaf && (flags & PKVM_PGTABLE_WALK_TABLE_PRE)) ++ ret = walker->cb(data->pgt, data->vaddr, data->vaddr_end, ++ level, ptep, PKVM_PGTABLE_WALK_TABLE_PRE, ++ &data->flush_data, walker->arg); ++ ++ if (leaf && (flags & PKVM_PGTABLE_WALK_LEAF)) { ++ ret = walker->cb(data->pgt, data->vaddr, data->vaddr_end, ++ level, ptep, PKVM_PGTABLE_WALK_LEAF, ++ &data->flush_data, walker->arg); ++ leaf = pgt_ops->pgt_entry_is_leaf(ptep, level); ++ } ++ ++ if (ret) ++ return ret; ++ ++ if (leaf) { ++ unsigned long size = pgt_ops->pgt_level_to_size(level); ++ data->vaddr = ALIGN_DOWN(data->vaddr, size); ++ data->vaddr += size; ++ return ret; ++ } ++ ++ child_ptep = mm_ops->phys_to_virt(pgt_ops->pgt_entry_to_phys(ptep)); ++ ret = _pgtable_walk(data, child_ptep, level - 1); ++ if (ret) 
++ return ret; ++ ++ if (flags & PKVM_PGTABLE_WALK_TABLE_POST) ++ ret = walker->cb(data->pgt, data->vaddr, data->vaddr_end, ++ level, ptep, PKVM_PGTABLE_WALK_TABLE_POST, ++ &data->flush_data, walker->arg); ++ ++ return ret; ++} ++ ++static int _pgtable_walk(struct pgt_walk_data *data, void *ptep, int level) ++{ ++ struct pkvm_pgtable_ops *pgt_ops = data->pgt->pgt_ops; ++ int entries = pgt_ops->pgt_level_to_entries(level); ++ int entry_size = pgt_ops->pgt_level_entry_size(level); ++ int idx = pgt_ops->pgt_entry_to_index(data->vaddr, level); ++ int ret; ++ ++ for (; idx < entries; idx++) { ++ if (data->vaddr >= data->vaddr_end) ++ break; ++ ++ ret = pgtable_visit(data, (ptep + idx * entry_size), level); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int pgtable_walk(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long size, bool page_aligned, ++ struct pkvm_pgtable_walker *walker) ++{ ++ unsigned long aligned_vaddr = ++ page_aligned ? ALIGN_DOWN(vaddr, PAGE_SIZE) : vaddr; ++ unsigned long aligned_size = ++ page_aligned ? ALIGN(size, PAGE_SIZE) : size; ++ struct pgt_walk_data data = { ++ .pgt = pgt, ++ .flush_data = { ++ .flushtlb = false, ++ .free_list = LIST_HEAD_INIT(data.flush_data.free_list), ++ }, ++ .vaddr = aligned_vaddr, ++ .vaddr_end = aligned_vaddr + aligned_size, ++ .walker = walker, ++ }; ++ struct pkvm_mm_ops *mm_ops = pgt->mm_ops; ++ int ret; ++ ++ if (!size || data.vaddr == data.vaddr_end) ++ return 0; ++ ++ ret = _pgtable_walk(&data, mm_ops->phys_to_virt(pgt->root_pa), pgt->level); ++ ++ if (data.flush_data.flushtlb || !list_empty(&data.flush_data.free_list)) ++ pgt->mm_ops->flush_tlb(pgt, aligned_vaddr, aligned_size); ++ ++ while (!list_empty(&data.flush_data.free_list)) { ++ void *page = get_page_from_freelist(&data.flush_data.free_list); ++ ++ pgt->mm_ops->put_page(page); ++ } ++ ++ return ret; ++} ++ ++int pkvm_pgtable_init(struct pkvm_pgtable *pgt, ++ struct pkvm_mm_ops *mm_ops, ++ struct pkvm_pgtable_ops *pgt_ops, ++ struct pkvm_pgtable_cap *cap, ++ bool alloc_root) ++{ ++ void *root; ++ ++ if (!mm_ops || !pgt_ops || !cap) ++ return -EINVAL; ++ ++ if (alloc_root) { ++ root = pgtable_alloc_page(mm_ops); ++ if (!root) ++ return -ENOMEM; ++ pgt->root_pa = __pkvm_pa(root); ++ } ++ ++ pgt->mm_ops = mm_ops; ++ pgt->pgt_ops = pgt_ops; ++ pgt->level = cap->level; ++ pgt->allowed_pgsz = cap->allowed_pgsz; ++ pgt->table_prot = cap->table_prot; ++ ++ return 0; ++} ++ ++static int __pkvm_pgtable_map(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long phys, unsigned long size, ++ int pgsz_mask, u64 prot, pgtable_leaf_ov_fn_t map_leaf, ++ u64 annotation) ++{ ++ struct pkvm_pgtable_map_data data = { ++ .phys = phys, ++ .annotation = annotation, ++ .prot = prot, ++ .pgsz_mask = pgsz_mask ? 
pgt->allowed_pgsz & pgsz_mask : ++ pgt->allowed_pgsz, ++ .map_leaf_override = map_leaf, ++ }; ++ struct pkvm_pgtable_walker walker = { ++ .cb = pgtable_map_cb, ++ .arg = &data, ++ .flags = PKVM_PGTABLE_WALK_LEAF, ++ }; ++ ++ return pgtable_walk(pgt, vaddr_start, size, true, &walker); ++} ++ ++int pkvm_pgtable_map(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long phys_start, unsigned long size, ++ int pgsz_mask, u64 prot, pgtable_leaf_ov_fn_t map_leaf) ++{ ++ return __pkvm_pgtable_map(pgt, vaddr_start, ALIGN_DOWN(phys_start, PAGE_SIZE), ++ size, pgsz_mask, prot, map_leaf, 0); ++} ++ ++int pkvm_pgtable_unmap(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long size, pgtable_leaf_ov_fn_t unmap_leaf) ++{ ++ struct pkvm_pgtable_unmap_data data = { ++ .phys = INVALID_ADDR, ++ .split_huge_page = true, ++ .unmap_leaf_override = unmap_leaf, ++ }; ++ struct pkvm_pgtable_walker walker = { ++ .cb = pgtable_unmap_cb, ++ .arg = &data, ++ .flags = PKVM_PGTABLE_WALK_LEAF | PKVM_PGTABLE_WALK_TABLE_POST, ++ }; ++ ++ return pgtable_walk(pgt, vaddr_start, size, true, &walker); ++} ++ ++int pkvm_pgtable_unmap_safe(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long phys_start, unsigned long size, ++ pgtable_leaf_ov_fn_t unmap_leaf) ++{ ++ struct pkvm_pgtable_unmap_data data = { ++ .phys = ALIGN_DOWN(phys_start, PAGE_SIZE), ++ .split_huge_page = true, ++ .unmap_leaf_override = unmap_leaf, ++ }; ++ struct pkvm_pgtable_walker walker = { ++ .cb = pgtable_unmap_cb, ++ .arg = &data, ++ .flags = PKVM_PGTABLE_WALK_LEAF | PKVM_PGTABLE_WALK_TABLE_POST, ++ }; ++ ++ return pgtable_walk(pgt, vaddr_start, size, true, &walker); ++} ++ ++int pkvm_pgtable_unmap_nosplit(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long size, pgtable_leaf_ov_fn_t unmap_leaf) ++{ ++ struct pkvm_pgtable_unmap_data data = { ++ .phys = INVALID_ADDR, ++ .split_huge_page = false, ++ .unmap_leaf_override = unmap_leaf, ++ }; ++ struct pkvm_pgtable_walker walker = { ++ .cb = pgtable_unmap_cb, ++ .arg = &data, ++ .flags = PKVM_PGTABLE_WALK_LEAF | PKVM_PGTABLE_WALK_TABLE_POST, ++ }; ++ ++ return pgtable_walk(pgt, vaddr_start, size, true, &walker); ++} ++ ++void pkvm_pgtable_lookup(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long *pphys, u64 *pprot, int *plevel) ++{ ++ struct pkvm_pgtable_lookup_data data = { ++ .vaddr = vaddr, ++ }; ++ struct pkvm_pgtable_walker walker = { ++ .cb = pgtable_lookup_cb, ++ .arg = &data, ++ .flags = PKVM_PGTABLE_WALK_LEAF, ++ }; ++ int ret, retry_cnt = 0; ++ ++retry: ++ ret = pgtable_walk(pgt, vaddr, PAGE_SIZE, true, &walker); ++ if ((ret == -EAGAIN) && (retry_cnt++ < 5)) ++ goto retry; ++ ++ if (pphys) ++ *pphys = data.phys; ++ if (pprot) ++ *pprot = data.prot; ++ if (plevel) ++ *plevel = data.level; ++} ++ ++void pkvm_pgtable_destroy(struct pkvm_pgtable *pgt, pgtable_leaf_ov_fn_t free_leaf) ++{ ++ unsigned long size; ++ void *virt_root; ++ struct pkvm_pgtable_ops *pgt_ops; ++ struct pkvm_pgtable_free_data data = { ++ .free_leaf_override = free_leaf, ++ }; ++ struct pkvm_pgtable_walker walker = { ++ .cb = pgtable_free_cb, ++ .arg = &data, ++ .flags = PKVM_PGTABLE_WALK_LEAF | PKVM_PGTABLE_WALK_TABLE_POST, ++ }; ++ ++ pgt_ops = pgt->pgt_ops; ++ size = pgt_ops->pgt_level_to_size(pgt->level + 1); ++ ++ pgtable_walk(pgt, 0, size, true, &walker); ++ virt_root = pgt->mm_ops->phys_to_virt(pgt->root_pa); ++ pgt->mm_ops->put_page(virt_root); ++} ++ ++/* ++ * pkvm_pgtable_annotate() - Unmap and annotate pages to track ownership. 
++ * @annotation: The value stored in the invalid pte. ++ * @annotation[2:0] must be 0. ++ */ ++int pkvm_pgtable_annotate(struct pkvm_pgtable *pgt, unsigned long addr, ++ unsigned long size, u64 annotation) ++{ ++ if (pgt->pgt_ops->pgt_entry_present(&annotation)) ++ return -EINVAL; ++ ++ return __pkvm_pgtable_map(pgt, addr, INVALID_ADDR, ++ size, 1 << PG_LEVEL_4K, 0, ++ NULL, annotation); ++} ++ ++static int pgtable_sync_map_cb(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long vaddr_end, int level, void *ptep, ++ unsigned long flags, struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ struct pkvm_pgtable_sync_data *data = arg; ++ unsigned long phys; ++ unsigned long size; ++ u64 prot; ++ ++ phys = pgt_ops->pgt_entry_to_phys(ptep); ++ size = pgt_ops->pgt_level_to_size(level); ++ ++ if (!pgt->pgt_ops->pgt_entry_present(ptep)) ++ return pkvm_pgtable_unmap(data->dest_pgt, vaddr, size, NULL); ++ ++ if (data->prot_override) ++ prot = *data->prot_override; ++ else ++ prot = pgt_ops->pgt_entry_to_prot(ptep); ++ ++ return pkvm_pgtable_map(data->dest_pgt, vaddr, phys, ++ size, 0, prot, data->map_leaf_override); ++} ++ ++/* ++ * pkvm_pgtable_sync_map_range() - map the given address range in the destination ++ * pgtable according to the source pgtable, with the same phys address and desired ++ * property bits. ++ * ++ * @src: source pgtable. ++ * @dest: destination pgtable. ++ * @vaddr: virtual start address of the range. ++ * @size: size of the range in bytes. ++ * @prot: desired property bits. Can be NULL if use the same property ++ * bits as the source pgtable ++ * @map_leaf: function to map the leaf entry for destination pgtable. ++ */ ++int pkvm_pgtable_sync_map_range(struct pkvm_pgtable *src, struct pkvm_pgtable *dest, ++ unsigned long vaddr, unsigned long size, ++ u64 *prot, pgtable_leaf_ov_fn_t map_leaf) ++{ ++ struct pkvm_pgtable_sync_data data = { ++ .dest_pgt = dest, ++ .prot_override = prot, ++ .map_leaf_override = map_leaf, ++ }; ++ struct pkvm_pgtable_walker walker = { ++ .cb = pgtable_sync_map_cb, ++ .flags = PKVM_PGTABLE_WALK_LEAF, ++ .arg = &data, ++ }; ++ ++ return pgtable_walk(src, vaddr, size, true, &walker); ++} ++ ++/* ++ * pkvm_pgtable_sync_map() - map the destination pgtable according to the source ++ * pgtable, with the same phys address and desired property bits. ++ * ++ * @src: source pgtable. ++ * @dest: destination pgtable. ++ * @prot: desired property bits. Can be NULL if use the same property ++ * bits as the source pgtable ++ * @map_leaf: function to map the leaf entry for destination pgtable. 
++ */ ++int pkvm_pgtable_sync_map(struct pkvm_pgtable *src, struct pkvm_pgtable *dest, ++ u64 *prot, pgtable_leaf_ov_fn_t map_leaf) ++{ ++ unsigned long size = src->pgt_ops->pgt_level_to_size(src->level + 1); ++ ++ return pkvm_pgtable_sync_map_range(src, dest, 0, size, prot, map_leaf); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/pgtable.h b/arch/x86/kvm/vmx/pkvm/hyp/pgtable.h +new file mode 100644 +index 000000000000..85a2f74c5fe4 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/pgtable.h +@@ -0,0 +1,155 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_PGTABLE_H_ ++#define _PKVM_PGTABLE_H_ ++ ++#include ++#include ++ ++#define SUPPRESS_VE BIT(63) ++ ++struct pkvm_mm_ops { ++ void *(*phys_to_virt)(unsigned long phys); ++ unsigned long (*virt_to_phys)(void *vaddr); ++ void *(*zalloc_page)(void); ++ int (*page_count)(void *vaddr); ++ void (*get_page)(void *vaddr); ++ void (*put_page)(void *vaddr); ++ void (*flush_tlb)(struct pkvm_pgtable *pgt, ++ unsigned long vaddr, unsigned long size); ++ void (*flush_cache)(void *vaddr, unsigned int size); ++}; ++ ++struct pkvm_pgtable_ops { ++ bool (*pgt_entry_present)(void *pte); ++ bool (*pgt_entry_mapped)(void *pte); ++ bool (*pgt_entry_huge)(void *pte); ++ void (*pgt_entry_mkhuge)(void *ptep); ++ unsigned long (*pgt_entry_to_phys)(void *pte); ++ u64 (*pgt_entry_to_prot)(void *pte); ++ int (*pgt_entry_to_index)(unsigned long vaddr, int level); ++ u64 (*pgt_level_page_mask)(int level); ++ bool (*pgt_entry_is_leaf)(void *ptep, int level); ++ int (*pgt_level_entry_size)(int level); ++ int (*pgt_level_to_entries)(int level); ++ unsigned long (*pgt_level_to_size)(int level); ++ void (*pgt_set_entry)(void *ptep, u64 val); ++ u64 default_prot; ++}; ++ ++struct pkvm_pgtable { ++ unsigned long root_pa; ++ int level; ++ int allowed_pgsz; ++ u64 table_prot; ++ struct pkvm_mm_ops *mm_ops; ++ struct pkvm_pgtable_ops *pgt_ops; ++}; ++ ++struct pgt_flush_data { ++ bool flushtlb; ++ struct list_head free_list; ++}; ++ ++typedef int (*pgtable_visit_fn_t)(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long vaddr_end, int level, void *ptep, ++ unsigned long flags, struct pgt_flush_data *flush_data, ++ void *const arg); ++ ++typedef int (*pgtable_leaf_ov_fn_t)(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ int level, void *ptep, struct pgt_flush_data *flush_data, ++ void *data); ++ ++struct pkvm_pgtable_map_data { ++ unsigned long phys; ++ u64 annotation; ++ u64 prot; ++ int pgsz_mask; ++ ++ /* ++ * extra override helper ops: ++ * - map_leaf_override(): override the final page entry map function ++ * for pkvm_pgtable_map() ++ */ ++ pgtable_leaf_ov_fn_t map_leaf_override; ++}; ++ ++struct pkvm_pgtable_unmap_data { ++ unsigned long phys; ++ ++ /* ++ * extra override helper ops: ++ * - unmap_leaf_override(): override the final page entry map function ++ * for pkvm_pgtable_unmap() ++ */ ++ pgtable_leaf_ov_fn_t unmap_leaf_override; ++ ++ bool split_huge_page; ++}; ++ ++struct pkvm_pgtable_free_data { ++ /* ++ * extra override helper ops: ++ * - free_leaf_override(): override the final page entry free function ++ * for pkvm_pgtable_destroy() ++ */ ++ pgtable_leaf_ov_fn_t free_leaf_override; ++}; ++ ++struct pkvm_pgtable_sync_data { ++ struct pkvm_pgtable *dest_pgt; ++ u64 *prot_override; ++ ++ pgtable_leaf_ov_fn_t map_leaf_override; ++}; ++ ++#define PGTABLE_WALK_DONE 1 ++ ++struct pkvm_pgtable_walker { ++ const pgtable_visit_fn_t cb; ++ void *const arg; ++ unsigned long flags; ++#define 
PKVM_PGTABLE_WALK_TABLE_PRE BIT(0) ++#define PKVM_PGTABLE_WALK_LEAF BIT(1) ++#define PKVM_PGTABLE_WALK_TABLE_POST BIT(2) ++}; ++ ++int pgtable_walk(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long size, bool page_aligned, ++ struct pkvm_pgtable_walker *walker); ++int pkvm_pgtable_init(struct pkvm_pgtable *pgt, ++ struct pkvm_mm_ops *mm_ops, ++ struct pkvm_pgtable_ops *pgt_ops, ++ struct pkvm_pgtable_cap *cap, ++ bool alloc_root); ++int pkvm_pgtable_map(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long phys_start, unsigned long size, ++ int pgsz_mask, u64 entry_prot, pgtable_leaf_ov_fn_t map_leaf); ++int pgtable_map_leaf(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ int level, void *ptep, struct pgt_flush_data *flush_data, ++ struct pkvm_pgtable_map_data *data); ++int pkvm_pgtable_unmap(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long size, pgtable_leaf_ov_fn_t unmap_leaf); ++int pkvm_pgtable_unmap_safe(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long phys_start, unsigned long size, ++ pgtable_leaf_ov_fn_t unmap_leaf); ++int pkvm_pgtable_unmap_nosplit(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long size, pgtable_leaf_ov_fn_t unmap_leaf); ++void pkvm_pgtable_lookup(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long *pphys, u64 *pprot, int *plevel); ++void pkvm_pgtable_destroy(struct pkvm_pgtable *pgt, pgtable_leaf_ov_fn_t free_leaf); ++int pkvm_pgtable_annotate(struct pkvm_pgtable *pgt, unsigned long addr, ++ unsigned long size, u64 annotation); ++int pkvm_pgtable_sync_map(struct pkvm_pgtable *src, struct pkvm_pgtable *dest, ++ u64 *prot, pgtable_leaf_ov_fn_t map_leaf); ++int pkvm_pgtable_sync_map_range(struct pkvm_pgtable *src, struct pkvm_pgtable *dest, ++ unsigned long vaddr, unsigned long size, ++ u64 *prot, pgtable_leaf_ov_fn_t map_leaf); ++ ++static inline void pkvm_pgtable_set_mm_ops(struct pkvm_pgtable *pgt, struct pkvm_mm_ops *mm_ops) ++{ ++ pgt->mm_ops = mm_ops; ++} ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/pkvm.c b/arch/x86/kvm/vmx/pkvm/hyp/pkvm.c +new file mode 100644 +index 000000000000..3bf26c75ae98 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/pkvm.c +@@ -0,0 +1,470 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++ ++#include "pkvm_hyp.h" ++#include "ept.h" ++#include "mem_protect.h" ++#include "lapic.h" ++#include "ptdev.h" ++ ++struct pkvm_hyp *pkvm_hyp; ++ ++#define MAX_SHADOW_VMS (PKVM_MAX_NORMAL_VM_NUM + PKVM_MAX_SECURE_VM_NUM) ++#define HANDLE_OFFSET 1 ++ ++#define to_shadow_vm_handle(vcpu_handle) ((s64)(vcpu_handle) >> SHADOW_VM_HANDLE_SHIFT) ++#define to_shadow_vcpu_idx(vcpu_handle) ((s64)(vcpu_handle) & SHADOW_VCPU_INDEX_MASK) ++ ++static DECLARE_BITMAP(shadow_vms_bitmap, MAX_SHADOW_VMS); ++static pkvm_spinlock_t shadow_vms_lock = { __ARCH_PKVM_SPINLOCK_UNLOCKED }; ++struct shadow_vm_ref { ++ atomic_t refcount; ++ struct pkvm_shadow_vm *vm; ++}; ++static struct shadow_vm_ref shadow_vms_ref[MAX_SHADOW_VMS]; ++ ++#define SHADOW_VCPU_ARRAY(vm) \ ++ ((struct shadow_vcpu_array *)((void *)(vm) + sizeof(struct pkvm_shadow_vm))) ++ ++#define SHADOW_VCPU_HASH_BITS 10 ++DEFINE_HASHTABLE(shadow_vcpu_table, SHADOW_VCPU_HASH_BITS); ++static pkvm_spinlock_t shadow_vcpu_table_lock = { __ARCH_PKVM_SPINLOCK_UNLOCKED }; ++ ++static int allocate_shadow_vm_handle(struct pkvm_shadow_vm *vm) ++{ ++ struct shadow_vm_ref *vm_ref; ++ int handle; ++ ++ /* ++ * The shadow_vm_handle is an int so it 
cannot exceed INT_MAX. ++ * Meanwhile shadow_vm_handle will also be used as owner_id in ++ * the page state machine so it also cannot exceed the max ++ * owner_id. ++ */ ++ BUILD_BUG_ON(MAX_SHADOW_VMS > ++ min(INT_MAX, ((1 << hweight_long(PKVM_INVALID_PTE_OWNER_MASK)) - 1))); ++ ++ pkvm_spin_lock(&shadow_vms_lock); ++ ++ handle = find_next_zero_bit(shadow_vms_bitmap, MAX_SHADOW_VMS, ++ HANDLE_OFFSET); ++ if ((u32)handle < MAX_SHADOW_VMS) { ++ __set_bit(handle, shadow_vms_bitmap); ++ vm->shadow_vm_handle = handle; ++ vm_ref = &shadow_vms_ref[handle]; ++ vm_ref->vm = vm; ++ atomic_set(&vm_ref->refcount, 1); ++ } else ++ handle = -ENOMEM; ++ ++ pkvm_spin_unlock(&shadow_vms_lock); ++ ++ return handle; ++} ++ ++static struct pkvm_shadow_vm *free_shadow_vm_handle(int handle) ++{ ++ struct shadow_vm_ref *vm_ref; ++ struct pkvm_shadow_vm *vm = NULL; ++ ++ pkvm_spin_lock(&shadow_vms_lock); ++ ++ if ((u32)handle >= MAX_SHADOW_VMS) ++ goto out; ++ ++ vm_ref = &shadow_vms_ref[handle]; ++ if ((atomic_cmpxchg(&vm_ref->refcount, 1, 0) != 1)) { ++ pkvm_err("%s: VM%d is busy, refcount %d\n", ++ __func__, handle, atomic_read(&vm_ref->refcount)); ++ goto out; ++ } ++ ++ vm = vm_ref->vm; ++ ++ vm_ref->vm = NULL; ++ __clear_bit(handle, shadow_vms_bitmap); ++out: ++ pkvm_spin_unlock(&shadow_vms_lock); ++ return vm; ++} ++ ++int __pkvm_init_shadow_vm(struct kvm_vcpu *hvcpu, unsigned long kvm_va, ++ unsigned long shadow_pa, size_t shadow_size) ++{ ++ unsigned long offset = offsetof(struct kvm, arch.vm_type); ++ unsigned long vm_type, bytes = sizeof(unsigned long); ++ struct pkvm_shadow_vm *vm; ++ struct x86_exception e; ++ int shadow_vm_handle; ++ ++ if (!PAGE_ALIGNED(shadow_pa) || ++ !PAGE_ALIGNED(shadow_size) || ++ (shadow_size != PAGE_ALIGN(sizeof(struct pkvm_shadow_vm) ++ + pkvm_shadow_vcpu_array_size()))) ++ return -EINVAL; ++ ++ if (read_gva(hvcpu, kvm_va + offset, &vm_type, bytes, &e) < 0) ++ return -EINVAL; ++ ++ if(__pkvm_host_donate_hyp(shadow_pa, shadow_size)) ++ return -EINVAL; ++ ++ vm = pkvm_phys_to_virt(shadow_pa); ++ ++ memset(vm, 0, shadow_size); ++ pkvm_spinlock_init(&vm->lock); ++ INIT_LIST_HEAD(&vm->ptdev_head); ++ ++ vm->host_kvm_va = kvm_va; ++ vm->shadow_size = shadow_size; ++ vm->vm_type = vm_type; ++ ++ if (pkvm_pgstate_pgt_init(vm)) ++ goto undonate; ++ ++ if (pkvm_shadow_ept_init(&vm->sept_desc)) ++ goto deinit_pgstate_pgt; ++ ++ shadow_vm_handle = allocate_shadow_vm_handle(vm); ++ if (shadow_vm_handle < 0) ++ goto deinit_shadow_ept; ++ ++ return shadow_vm_handle; ++ ++deinit_shadow_ept: ++ pkvm_shadow_ept_deinit(&vm->sept_desc); ++deinit_pgstate_pgt: ++ pkvm_pgstate_pgt_deinit(vm); ++undonate: ++ memset(vm, 0, shadow_size); ++ __pkvm_hyp_donate_host(shadow_pa, shadow_size); ++ return -EINVAL; ++} ++ ++unsigned long __pkvm_teardown_shadow_vm(int shadow_vm_handle) ++{ ++ struct pkvm_shadow_vm *vm = free_shadow_vm_handle(shadow_vm_handle); ++ struct pkvm_ptdev *ptdev, *tmp; ++ unsigned long shadow_size; ++ ++ if (!vm) ++ return 0; ++ ++ pkvm_shadow_ept_deinit(&vm->sept_desc); ++ ++ pkvm_pgstate_pgt_deinit(vm); ++ ++ list_for_each_entry_safe(ptdev, tmp, &vm->ptdev_head, vm_node) ++ pkvm_detach_ptdev(ptdev, vm); ++ ++ shadow_size = vm->shadow_size; ++ memset(vm, 0, shadow_size); ++ ++ WARN_ON(__pkvm_hyp_donate_host(pkvm_virt_to_phys(vm), shadow_size)); ++ ++ return pkvm_virt_to_phys(vm); ++} ++ ++struct pkvm_shadow_vm *get_shadow_vm(int shadow_vm_handle) ++{ ++ struct shadow_vm_ref *vm_ref; ++ ++ if ((u32)shadow_vm_handle >= MAX_SHADOW_VMS) ++ return NULL; ++ ++ vm_ref = 
&shadow_vms_ref[shadow_vm_handle]; ++ return atomic_inc_not_zero(&vm_ref->refcount) ? vm_ref->vm : NULL; ++} ++ ++void put_shadow_vm(int shadow_vm_handle) ++{ ++ struct shadow_vm_ref *vm_ref; ++ ++ if ((u32)shadow_vm_handle >= MAX_SHADOW_VMS) ++ return; ++ ++ vm_ref = &shadow_vms_ref[shadow_vm_handle]; ++ WARN_ON(atomic_dec_if_positive(&vm_ref->refcount) <= 0); ++} ++ ++void pkvm_shadow_vm_link_ptdev(struct pkvm_shadow_vm *vm, ++ struct list_head *node, bool coherency) ++{ ++ pkvm_spin_lock(&vm->lock); ++ list_add_tail(node, &vm->ptdev_head); ++ vm->noncoherent_ptdev += !coherency; ++ vm->need_prepopulation = true; ++ pkvm_shadow_sl_iommu_pgt_update_coherency(&vm->pgstate_pgt, ++ !vm->noncoherent_ptdev); ++ pkvm_spin_unlock(&vm->lock); ++} ++ ++void pkvm_shadow_vm_unlink_ptdev(struct pkvm_shadow_vm *vm, ++ struct list_head *node, bool coherency) ++{ ++ pkvm_spin_lock(&vm->lock); ++ list_del(node); ++ vm->noncoherent_ptdev -= !coherency; ++ pkvm_shadow_sl_iommu_pgt_update_coherency(&vm->pgstate_pgt, ++ !vm->noncoherent_ptdev); ++ pkvm_spin_unlock(&vm->lock); ++} ++ ++static void add_shadow_vcpu_vmcs12_map(struct shadow_vcpu_state *vcpu) ++{ ++ pkvm_spin_lock(&shadow_vcpu_table_lock); ++ hash_add(shadow_vcpu_table, &vcpu->hnode, vcpu->vmcs12_pa); ++ pkvm_spin_unlock(&shadow_vcpu_table_lock); ++} ++ ++static void remove_shadow_vcpu_vmcs12_map(struct shadow_vcpu_state *vcpu) ++{ ++ pkvm_spin_lock(&shadow_vcpu_table_lock); ++ hash_del(&vcpu->hnode); ++ pkvm_spin_unlock(&shadow_vcpu_table_lock); ++} ++ ++s64 find_shadow_vcpu_handle_by_vmcs(unsigned long vmcs12_pa) ++{ ++ struct shadow_vcpu_state *shadow_vcpu; ++ s64 handle = -1; ++ ++ pkvm_spin_lock(&shadow_vcpu_table_lock); ++ hash_for_each_possible(shadow_vcpu_table, shadow_vcpu, hnode, vmcs12_pa) { ++ if (shadow_vcpu->vmcs12_pa == vmcs12_pa) { ++ handle = shadow_vcpu->shadow_vcpu_handle; ++ break; ++ } ++ } ++ pkvm_spin_unlock(&shadow_vcpu_table_lock); ++ ++ return handle; ++} ++ ++struct shadow_vcpu_state *get_shadow_vcpu(s64 shadow_vcpu_handle) ++{ ++ int shadow_vm_handle = to_shadow_vm_handle(shadow_vcpu_handle); ++ u32 vcpu_idx = to_shadow_vcpu_idx(shadow_vcpu_handle); ++ struct shadow_vcpu_ref *vcpu_ref; ++ struct shadow_vcpu_state *vcpu; ++ struct pkvm_shadow_vm *vm; ++ ++ if (vcpu_idx >= KVM_MAX_VCPUS) ++ return NULL; ++ ++ vm = get_shadow_vm(shadow_vm_handle); ++ if (!vm) ++ return NULL; ++ ++ vcpu_ref = &SHADOW_VCPU_ARRAY(vm)->ref[vcpu_idx]; ++ vcpu = atomic_inc_not_zero(&vcpu_ref->refcount) ? vcpu_ref->vcpu : NULL; ++ ++ put_shadow_vm(shadow_vm_handle); ++ return vcpu; ++} ++ ++void put_shadow_vcpu(s64 shadow_vcpu_handle) ++{ ++ int shadow_vm_handle = to_shadow_vm_handle(shadow_vcpu_handle); ++ u32 vcpu_idx = to_shadow_vcpu_idx(shadow_vcpu_handle); ++ struct shadow_vcpu_ref *vcpu_ref; ++ struct pkvm_shadow_vm *vm; ++ ++ if (vcpu_idx >= KVM_MAX_VCPUS) ++ return; ++ ++ vm = get_shadow_vm(shadow_vm_handle); ++ if (!vm) ++ return; ++ ++ vcpu_ref = &SHADOW_VCPU_ARRAY(vm)->ref[vcpu_idx]; ++ WARN_ON(atomic_dec_if_positive(&vcpu_ref->refcount) <= 0); ++ ++ put_shadow_vm(shadow_vm_handle); ++} ++ ++static s64 attach_shadow_vcpu_to_vm(struct pkvm_shadow_vm *vm, ++ struct shadow_vcpu_state *shadow_vcpu) ++{ ++ struct shadow_vcpu_ref *vcpu_ref; ++ u32 vcpu_idx; ++ ++ /* ++ * Shadow_vcpu_handle is a s64 value combined with shadow_vm_handle ++ * and shadow_vcpu index from the array. So the array size cannot be ++ * larger than the shadow_vcpu index mask. 
++ */ ++ BUILD_BUG_ON(KVM_MAX_VCPUS > SHADOW_VCPU_INDEX_MASK); ++ ++ /* ++ * Saving the shadow_vm pointer in shadow_vcpu takes an additional ++ * reference, so that the pointer can be used at runtime without ++ * getting it again. The reference is put when this shadow_vcpu is ++ * detached. ++ */ ++ shadow_vcpu->vm = get_shadow_vm(vm->shadow_vm_handle); ++ if (!shadow_vcpu->vm) ++ return -EINVAL; ++ ++ add_shadow_vcpu_vmcs12_map(shadow_vcpu); ++ ++ pkvm_spin_lock(&vm->lock); ++ ++ if (vm->created_vcpus == KVM_MAX_VCPUS) { ++ pkvm_spin_unlock(&vm->lock); ++ return -EINVAL; ++ } ++ ++ vcpu_idx = vm->created_vcpus; ++ shadow_vcpu->shadow_vcpu_handle = ++ to_shadow_vcpu_handle(vm->shadow_vm_handle, vcpu_idx); ++ vcpu_ref = &SHADOW_VCPU_ARRAY(vm)->ref[vcpu_idx]; ++ vcpu_ref->vcpu = shadow_vcpu; ++ vm->created_vcpus++; ++ atomic_set(&vcpu_ref->refcount, 1); ++ ++ pkvm_spin_unlock(&vm->lock); ++ ++ return shadow_vcpu->shadow_vcpu_handle; ++} ++ ++static struct shadow_vcpu_state * ++detach_shadow_vcpu_from_vm(struct pkvm_shadow_vm *vm, s64 shadow_vcpu_handle) ++{ ++ u32 vcpu_idx = to_shadow_vcpu_idx(shadow_vcpu_handle); ++ struct shadow_vcpu_state *shadow_vcpu = NULL; ++ struct shadow_vcpu_ref *vcpu_ref; ++ ++ if (vcpu_idx >= KVM_MAX_VCPUS) ++ return NULL; ++ ++ pkvm_spin_lock(&vm->lock); ++ ++ vcpu_ref = &SHADOW_VCPU_ARRAY(vm)->ref[vcpu_idx]; ++ if ((atomic_cmpxchg(&vcpu_ref->refcount, 1, 0) != 1)) { ++ pkvm_err("%s: VM%d shadow_vcpu%d is busy, refcount %d\n", ++ __func__, vm->shadow_vm_handle, vcpu_idx, ++ atomic_read(&vcpu_ref->refcount)); ++ } else { ++ shadow_vcpu = vcpu_ref->vcpu; ++ vcpu_ref->vcpu = NULL; ++ } ++ ++ pkvm_spin_unlock(&vm->lock); ++ ++ if (shadow_vcpu) { ++ remove_shadow_vcpu_vmcs12_map(shadow_vcpu); ++ /* ++ * Paired with the get_shadow_vm when saving the shadow_vm pointer ++ * during attaching shadow_vcpu.
++ */ ++ put_shadow_vm(shadow_vcpu->vm->shadow_vm_handle); ++ } ++ ++ return shadow_vcpu; ++} ++ ++s64 __pkvm_init_shadow_vcpu(struct kvm_vcpu *hvcpu, int shadow_vm_handle, ++ unsigned long vcpu_va, unsigned long shadow_pa, ++ size_t shadow_size) ++{ ++ struct pkvm_shadow_vm *vm; ++ struct shadow_vcpu_state *shadow_vcpu; ++ struct x86_exception e; ++ unsigned long vmcs12_va; ++ s64 shadow_vcpu_handle; ++ int ret; ++ ++ if (!PAGE_ALIGNED(shadow_pa) || !PAGE_ALIGNED(shadow_size) || ++ (shadow_size != PAGE_ALIGN(sizeof(struct shadow_vcpu_state))) || ++ (pkvm_hyp->vmcs_config.size > PAGE_SIZE)) ++ return -EINVAL; ++ ++ if (__pkvm_host_donate_hyp(shadow_pa, shadow_size)) ++ return -EINVAL; ++ ++ shadow_vcpu = pkvm_phys_to_virt(shadow_pa); ++ memset(shadow_vcpu, 0, shadow_size); ++ shadow_vcpu->shadow_size = shadow_size; ++ ++ ret = read_gva(hvcpu, vcpu_va, &shadow_vcpu->vmx, sizeof(struct vcpu_vmx), &e); ++ if (ret < 0) ++ goto undonate; ++ ++ vmcs12_va = (unsigned long)shadow_vcpu->vmx.vmcs01.vmcs; ++ if (gva2gpa(hvcpu, vmcs12_va, (gpa_t *)&shadow_vcpu->vmcs12_pa, 0, &e)) ++ goto undonate; ++ ++ vm = get_shadow_vm(shadow_vm_handle); ++ if (!vm) ++ goto undonate; ++ ++ shadow_vcpu_handle = attach_shadow_vcpu_to_vm(vm, shadow_vcpu); ++ ++ put_shadow_vm(shadow_vm_handle); ++ ++ if (shadow_vcpu_handle < 0) ++ goto undonate; ++ ++ return shadow_vcpu_handle; ++undonate: ++ memset(shadow_vcpu, 0, shadow_size); ++ __pkvm_hyp_donate_host(shadow_pa, shadow_size); ++ return -EINVAL; ++} ++ ++unsigned long __pkvm_teardown_shadow_vcpu(s64 shadow_vcpu_handle) ++{ ++ int shadow_vm_handle = to_shadow_vm_handle(shadow_vcpu_handle); ++ struct shadow_vcpu_state *shadow_vcpu; ++ unsigned long shadow_size; ++ struct pkvm_shadow_vm *vm = get_shadow_vm(shadow_vm_handle); ++ ++ if (!vm) ++ return 0; ++ ++ shadow_vcpu = detach_shadow_vcpu_from_vm(vm, shadow_vcpu_handle); ++ ++ put_shadow_vm(shadow_vm_handle); ++ ++ if (!shadow_vcpu) ++ return 0; ++ ++ shadow_size = shadow_vcpu->shadow_size; ++ memset(shadow_vcpu, 0, shadow_size); ++ WARN_ON(__pkvm_hyp_donate_host(pkvm_virt_to_phys(shadow_vcpu), ++ shadow_size)); ++ ++ return pkvm_virt_to_phys(shadow_vcpu); ++} ++ ++void pkvm_kick_vcpu(struct kvm_vcpu *vcpu) ++{ ++ struct pkvm_host_vcpu *hvcpu = to_pkvm_hvcpu(vcpu); ++ struct pkvm_pcpu *pcpu = hvcpu->pcpu; ++ ++ if (kvm_vcpu_exiting_guest_mode(vcpu) != IN_GUEST_MODE) ++ return; ++ ++ pkvm_lapic_send_init(pcpu); ++} ++ ++int pkvm_add_ptdev(int shadow_vm_handle, u16 bdf, u32 pasid) ++{ ++ struct pkvm_shadow_vm *vm = get_shadow_vm(shadow_vm_handle); ++ int ret = 0; ++ ++ if (!vm) ++ return -EINVAL; ++ ++ if (vm->vm_type != KVM_X86_DEFAULT_VM) ++ ret = pkvm_attach_ptdev(bdf, pasid, vm); ++ ++ put_shadow_vm(shadow_vm_handle); ++ ++ return ret; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/pkvm.lds.S b/arch/x86/kvm/vmx/pkvm/hyp/pkvm.lds.S +new file mode 100644 +index 000000000000..af81ce58c72f +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/pkvm.lds.S +@@ -0,0 +1,10 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#include ++ ++SECTIONS { ++ PKVM_SECTION(.text) ++ PKVM_SECTION(.rodata) ++ PKVM_SECTION(.data) ++ PKVM_SECTION(.bss) ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/pkvm_hyp.h b/arch/x86/kvm/vmx/pkvm/hyp/pkvm_hyp.h +new file mode 100644 +index 000000000000..5948f1b39953 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/pkvm_hyp.h +@@ -0,0 +1,187 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef __PKVM_HYP_H ++#define __PKVM_HYP_H ++ ++#include "pkvm_spinlock.h" 
++#include "pgtable.h" ++ ++/* ++ * Descriptor for shadow EPT ++ */ ++struct shadow_ept_desc { ++ /* shadow EPTP value configured by pkvm */ ++ u64 shadow_eptp; ++ ++ /* Save the last guest EPTP value configured by kvm high */ ++ u64 last_guest_eptp; ++ ++ struct pkvm_pgtable sept; ++}; ++ ++/* ++ * Store the Virtualization Exception(#VE) information when a #VE occurs. This ++ * struture definition is based on ++ * sdm Volume 3, 25.5.7.2 Virtualizaiton-Exception Information. ++ */ ++struct pkvm_ve_info { ++ u32 exit_reason; ++ u32 valid; ++ u64 exit_qual; ++ u64 gla; ++ u64 gpa; ++ u16 eptp_index; ++}; ++ ++/* ++ * * A container for the vcpu state that hyp needs to maintain for protected VMs. ++ * */ ++struct shadow_vcpu_state { ++ /* ++ * A unique id to the shadow vcpu, which is combined by ++ * shadow_vm_handle and shadow_vcpu index in the array. ++ * As shadow_vm_handle is in the high end and it is an ++ * int, so define the shadow_vcpu_handle as a s64. ++ */ ++ s64 shadow_vcpu_handle; ++ ++ struct pkvm_shadow_vm *vm; ++ ++ /* The donated size of shadow_vcpu. */ ++ unsigned long shadow_size; ++ ++ struct hlist_node hnode; ++ unsigned long vmcs12_pa; ++ bool vmcs02_inited; ++ ++ struct vcpu_vmx vmx; ++ ++ /* represents for the virtual EPT configured by kvm-high */ ++ struct pkvm_pgtable vept; ++ ++ /* assume vmcs02 is one page */ ++ u8 vmcs02[PAGE_SIZE] __aligned(PAGE_SIZE); ++ u8 cached_vmcs12[VMCS12_SIZE] __aligned(PAGE_SIZE); ++ ++ struct pkvm_ve_info ve_info; ++ ++ /* The last cpu this vmcs02 runs with */ ++ int last_cpu; ++ ++ /* point to the kvm_vcpu associated with this shadow_vcpu */ ++ struct kvm_vcpu *vcpu; ++} __aligned(PAGE_SIZE); ++ ++#define SHADOW_VM_HANDLE_SHIFT 32 ++#define SHADOW_VCPU_INDEX_MASK ((1UL << SHADOW_VM_HANDLE_SHIFT) - 1) ++#define to_shadow_vcpu_handle(vm_handle, vcpu_idx) \ ++ (((s64)(vm_handle) << SHADOW_VM_HANDLE_SHIFT) | \ ++ ((vcpu_idx) & SHADOW_VCPU_INDEX_MASK)) ++ ++/* ++ * Shadow_vcpu_array will be appended to the end of the pkvm_shadow_vm area ++ * implicitly, so that the shadow_vcpu_state pointer cannot be got directly ++ * from the pkvm_shadow_vm, but needs to be done through the interface ++ * get/put_shadow_vcpu. This can prevent the shadow_vcpu_state pointer from being ++ * abused without getting/putting the refcount. ++ */ ++struct shadow_vcpu_array { ++ struct shadow_vcpu_ref { ++ atomic_t refcount; ++ struct shadow_vcpu_state *vcpu; ++ } ref[KVM_MAX_VCPUS]; ++} __aligned(PAGE_SIZE); ++ ++static inline size_t pkvm_shadow_vcpu_array_size(void) ++{ ++ return sizeof(struct shadow_vcpu_array); ++} ++ ++/* ++ * * Holds the relevant data for running a protected vm. ++ * */ ++struct pkvm_shadow_vm { ++ /* A unique id to the shadow structs in the hyp shadow area. */ ++ int shadow_vm_handle; ++ ++ /* Number of vcpus for the vm. */ ++ int created_vcpus; ++ ++ /* The host's kvm va. */ ++ unsigned long host_kvm_va; ++ ++ /* The donated size of shadow_vm. */ ++ unsigned long shadow_size; ++ ++ /* ++ * VM's shadow EPT. All vCPU shares one mapping. ++ * FIXME: a potential security issue if some vCPUs are ++ * in SMM but the others are not. ++ */ ++ struct shadow_ept_desc sept_desc; ++ ++ /* ++ * Page state page table manages the page states, and ++ * works as IOMMU second-level page table for protected ++ * VM with passthrough devices. For the protected VM ++ * without passthrough devices or normal VM, it manages ++ * the page states only. 
++ */ ++ struct pkvm_pgtable pgstate_pgt; ++ /* Indicate if pgstate_pgt needs to be prepopulated */ ++ bool need_prepopulation; ++ /* ++ * Indicate the count of the shadow VM passthrough devices ++ * which are attached to non-coherent IOMMU. ++ */ ++ unsigned long noncoherent_ptdev; ++ ++ /* link the passthrough devices of a protected VM */ ++ struct list_head ptdev_head; ++ ++ /* The vm_type to indicate if this is a protected VM */ ++ unsigned long vm_type; ++ ++ pkvm_spinlock_t lock; ++} __aligned(PAGE_SIZE); ++ ++#define sept_to_shadow_ept_desc(_sept) container_of(_sept, struct shadow_ept_desc, sept) ++ ++#define sept_desc_to_shadow_vm(desc) container_of(desc, struct pkvm_shadow_vm, sept_desc) ++ ++#define sept_to_shadow_vm(_sept) sept_desc_to_shadow_vm(sept_to_shadow_ept_desc(_sept)) ++ ++#define pgstate_pgt_to_shadow_vm(_pgt) container_of(_pgt, struct pkvm_shadow_vm, pgstate_pgt) ++ ++int __pkvm_init_shadow_vm(struct kvm_vcpu *hvcpu, unsigned long kvm_va, ++ unsigned long shadow_pa, size_t shadow_size); ++unsigned long __pkvm_teardown_shadow_vm(int shadow_vm_handle); ++struct pkvm_shadow_vm *get_shadow_vm(int shadow_vm_handle); ++void put_shadow_vm(int shadow_vm_handle); ++void pkvm_shadow_vm_link_ptdev(struct pkvm_shadow_vm *vm, ++ struct list_head *node, bool coherency); ++void pkvm_shadow_vm_unlink_ptdev(struct pkvm_shadow_vm *vm, ++ struct list_head *node, bool coherency); ++s64 __pkvm_init_shadow_vcpu(struct kvm_vcpu *hvcpu, int shadow_vm_handle, ++ unsigned long vcpu_va, unsigned long shadow_pa, ++ size_t shadow_size); ++unsigned long __pkvm_teardown_shadow_vcpu(s64 shadow_vcpu_handle); ++struct shadow_vcpu_state *get_shadow_vcpu(s64 shadow_vcpu_handle); ++void put_shadow_vcpu(s64 shadow_vcpu_handle); ++s64 find_shadow_vcpu_handle_by_vmcs(unsigned long vmcs12_pa); ++void pkvm_kick_vcpu(struct kvm_vcpu *vcpu); ++int pkvm_add_ptdev(int shadow_vm_handle, u16 bdf, u32 pasid); ++ ++#define PKVM_REQ_TLB_FLUSH_HOST_EPT KVM_ARCH_REQ(0) ++#define PKVM_REQ_TLB_FLUSH_SHADOW_EPT KVM_ARCH_REQ(1) ++ ++extern struct pkvm_hyp *pkvm_hyp; ++ ++static inline bool shadow_vcpu_is_protected(struct shadow_vcpu_state *shadow_vcpu) ++{ ++ return shadow_vcpu->vm->vm_type == KVM_X86_PROTECTED_VM; ++} ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/pkvm_nested_vmcs_fields.h b/arch/x86/kvm/vmx/pkvm/hyp/pkvm_nested_vmcs_fields.h +new file mode 100644 +index 000000000000..85512f010bdb +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/pkvm_nested_vmcs_fields.h +@@ -0,0 +1,191 @@ ++#if !defined(EMULATED_FIELD_RW) && !defined(SHADOW_FIELD_RW) && !defined(SHADOW_FIELD_RO) ++BUILD_BUG_ON(1) ++#endif ++ ++#ifndef EMULATED_FIELD_RW ++#define EMULATED_FIELD_RW(x, y) ++#endif ++#ifndef SHADOW_FIELD_RW ++#define SHADOW_FIELD_RW(x, y) ++#endif ++#ifndef SHADOW_FIELD_RO ++#define SHADOW_FIELD_RO(x, y) ++#endif ++ ++/* ++ * Emulated fields for vmcs02: ++ * ++ * These fields are recorded in cached_vmcs12, and should be emulated to ++ * real value in vmcs02 before vmcs01 active. ++ */ ++/* 16-bits */ ++EMULATED_FIELD_RW(VIRTUAL_PROCESSOR_ID, virtual_processor_id) ++ ++/* 32-bits */ ++EMULATED_FIELD_RW(VM_EXIT_CONTROLS, vm_exit_controls) ++EMULATED_FIELD_RW(VM_ENTRY_CONTROLS, vm_entry_controls) ++EMULATED_FIELD_RW(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control) ++ ++/* 64-bits, what about their HIGH 32 fields? 
*/ ++EMULATED_FIELD_RW(IO_BITMAP_A, io_bitmap_a) ++EMULATED_FIELD_RW(IO_BITMAP_B, io_bitmap_b) ++EMULATED_FIELD_RW(MSR_BITMAP, msr_bitmap) ++EMULATED_FIELD_RW(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr) ++EMULATED_FIELD_RW(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr) ++EMULATED_FIELD_RW(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr) ++EMULATED_FIELD_RW(XSS_EXIT_BITMAP, xss_exit_bitmap) ++EMULATED_FIELD_RW(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr) ++EMULATED_FIELD_RW(PML_ADDRESS, pml_address) ++EMULATED_FIELD_RW(VM_FUNCTION_CONTROL, vm_function_control) ++EMULATED_FIELD_RW(EPT_POINTER, ept_pointer) ++EMULATED_FIELD_RW(EOI_EXIT_BITMAP0, eoi_exit_bitmap0) ++EMULATED_FIELD_RW(EOI_EXIT_BITMAP1, eoi_exit_bitmap1) ++EMULATED_FIELD_RW(EOI_EXIT_BITMAP2, eoi_exit_bitmap2) ++EMULATED_FIELD_RW(EOI_EXIT_BITMAP3, eoi_exit_bitmap3) ++EMULATED_FIELD_RW(EPTP_LIST_ADDRESS, eptp_list_address) ++EMULATED_FIELD_RW(VMREAD_BITMAP, vmread_bitmap) ++EMULATED_FIELD_RW(VMWRITE_BITMAP, vmwrite_bitmap) ++EMULATED_FIELD_RW(ENCLS_EXITING_BITMAP, encls_exiting_bitmap) ++EMULATED_FIELD_RW(VMCS_LINK_POINTER, vmcs_link_pointer) ++ ++/* ++ * Shadow fields for vmcs02: ++ * ++ * These fields are HW shadowing in vmcs02, we try to shadow all non-host ++ * fields except emulated ones. ++ * Host state fields need to be recorded in cached_vmcs12 and restored to vmcs01's ++ * guest state when returning to L1 host, so please ensure __NO__ host fields below. ++ */ ++ ++/* 16-bits */ ++SHADOW_FIELD_RW(POSTED_INTR_NV, posted_intr_nv) ++SHADOW_FIELD_RW(GUEST_ES_SELECTOR, guest_es_selector) ++SHADOW_FIELD_RW(GUEST_CS_SELECTOR, guest_cs_selector) ++SHADOW_FIELD_RW(GUEST_SS_SELECTOR, guest_ss_selector) ++SHADOW_FIELD_RW(GUEST_DS_SELECTOR, guest_ds_selector) ++SHADOW_FIELD_RW(GUEST_FS_SELECTOR, guest_fs_selector) ++SHADOW_FIELD_RW(GUEST_GS_SELECTOR, guest_gs_selector) ++SHADOW_FIELD_RW(GUEST_LDTR_SELECTOR, guest_ldtr_selector) ++SHADOW_FIELD_RW(GUEST_TR_SELECTOR, guest_tr_selector) ++SHADOW_FIELD_RW(GUEST_TR_SELECTOR, guest_tr_selector) ++SHADOW_FIELD_RW(GUEST_INTR_STATUS, guest_intr_status) ++SHADOW_FIELD_RW(GUEST_PML_INDEX, guest_pml_index) ++ ++/* 32-bits */ ++SHADOW_FIELD_RW(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control) ++SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control) ++SHADOW_FIELD_RW(EXCEPTION_BITMAP, exception_bitmap) ++SHADOW_FIELD_RW(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask) ++SHADOW_FIELD_RW(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match) ++SHADOW_FIELD_RW(CR3_TARGET_COUNT, cr3_target_count) ++SHADOW_FIELD_RW(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count) ++SHADOW_FIELD_RW(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count) ++SHADOW_FIELD_RW(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count) ++SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field) ++SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code) ++SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len) ++SHADOW_FIELD_RW(TPR_THRESHOLD, tpr_threshold) ++SHADOW_FIELD_RW(GUEST_ES_LIMIT, guest_es_limit) ++SHADOW_FIELD_RW(GUEST_CS_LIMIT, guest_cs_limit) ++SHADOW_FIELD_RW(GUEST_SS_LIMIT, guest_ss_limit) ++SHADOW_FIELD_RW(GUEST_DS_LIMIT, guest_ds_limit) ++SHADOW_FIELD_RW(GUEST_FS_LIMIT, guest_fs_limit) ++SHADOW_FIELD_RW(GUEST_GS_LIMIT, guest_gs_limit) ++SHADOW_FIELD_RW(GUEST_LDTR_LIMIT, guest_ldtr_limit) ++SHADOW_FIELD_RW(GUEST_TR_LIMIT, guest_tr_limit) ++SHADOW_FIELD_RW(GUEST_GDTR_LIMIT, guest_gdtr_limit) ++SHADOW_FIELD_RW(GUEST_IDTR_LIMIT, guest_idtr_limit) 
++SHADOW_FIELD_RW(GUEST_ES_AR_BYTES, guest_es_ar_bytes) ++SHADOW_FIELD_RW(GUEST_CS_AR_BYTES, guest_cs_ar_bytes) ++SHADOW_FIELD_RW(GUEST_SS_AR_BYTES, guest_ss_ar_bytes) ++SHADOW_FIELD_RW(GUEST_DS_AR_BYTES, guest_ds_ar_bytes) ++SHADOW_FIELD_RW(GUEST_FS_AR_BYTES, guest_fs_ar_bytes) ++SHADOW_FIELD_RW(GUEST_GS_AR_BYTES, guest_gs_ar_bytes) ++SHADOW_FIELD_RW(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes) ++SHADOW_FIELD_RW(GUEST_TR_AR_BYTES, guest_tr_ar_bytes) ++SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info) ++SHADOW_FIELD_RW(GUEST_ACTIVITY_STATE, guest_activity_state) ++SHADOW_FIELD_RW(GUEST_SYSENTER_CS, guest_sysenter_cs) ++SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value) ++SHADOW_FIELD_RW(PLE_GAP, ple_gap) ++SHADOW_FIELD_RW(PLE_WINDOW, ple_window) ++ ++/* Natural width */ ++SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK, cr0_guest_host_mask) ++SHADOW_FIELD_RW(CR4_GUEST_HOST_MASK, cr4_guest_host_mask) ++SHADOW_FIELD_RW(CR0_READ_SHADOW, cr0_read_shadow) ++SHADOW_FIELD_RW(CR4_READ_SHADOW, cr4_read_shadow) ++SHADOW_FIELD_RW(GUEST_CR0, guest_cr0) ++SHADOW_FIELD_RW(GUEST_CR3, guest_cr3) ++SHADOW_FIELD_RW(GUEST_CR4, guest_cr4) ++SHADOW_FIELD_RW(GUEST_ES_BASE, guest_es_base) ++SHADOW_FIELD_RW(GUEST_CS_BASE, guest_cs_base) ++SHADOW_FIELD_RW(GUEST_SS_BASE, guest_ss_base) ++SHADOW_FIELD_RW(GUEST_DS_BASE, guest_ds_base) ++SHADOW_FIELD_RW(GUEST_FS_BASE, guest_fs_base) ++SHADOW_FIELD_RW(GUEST_GS_BASE, guest_gs_base) ++SHADOW_FIELD_RW(GUEST_LDTR_BASE, guest_ldtr_base) ++SHADOW_FIELD_RW(GUEST_TR_BASE, guest_tr_base) ++SHADOW_FIELD_RW(GUEST_GDTR_BASE, guest_gdtr_base) ++SHADOW_FIELD_RW(GUEST_IDTR_BASE, guest_idtr_base) ++SHADOW_FIELD_RW(GUEST_DR7, guest_dr7) ++SHADOW_FIELD_RW(GUEST_RSP, guest_rsp) ++SHADOW_FIELD_RW(GUEST_RIP, guest_rip) ++SHADOW_FIELD_RW(GUEST_RFLAGS, guest_rflags) ++SHADOW_FIELD_RW(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions) ++SHADOW_FIELD_RW(GUEST_SYSENTER_ESP, guest_sysenter_esp) ++SHADOW_FIELD_RW(GUEST_SYSENTER_EIP, guest_sysenter_eip) ++ ++/* 64-bit */ ++SHADOW_FIELD_RW(TSC_OFFSET, tsc_offset) ++SHADOW_FIELD_RW(TSC_OFFSET_HIGH, tsc_offset) ++SHADOW_FIELD_RW(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr) ++SHADOW_FIELD_RW(VIRTUAL_APIC_PAGE_ADDR_HIGH, virtual_apic_page_addr) ++SHADOW_FIELD_RW(APIC_ACCESS_ADDR, apic_access_addr) ++SHADOW_FIELD_RW(APIC_ACCESS_ADDR_HIGH, apic_access_addr) ++SHADOW_FIELD_RW(TSC_MULTIPLIER, tsc_multiplier) ++SHADOW_FIELD_RW(TSC_MULTIPLIER_HIGH, tsc_multiplier) ++SHADOW_FIELD_RW(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl) ++SHADOW_FIELD_RW(GUEST_IA32_DEBUGCTL_HIGH, guest_ia32_debugctl) ++SHADOW_FIELD_RW(GUEST_IA32_PAT, guest_ia32_pat) ++SHADOW_FIELD_RW(GUEST_IA32_PAT_HIGH, guest_ia32_pat) ++SHADOW_FIELD_RW(GUEST_IA32_EFER, guest_ia32_efer) ++SHADOW_FIELD_RW(GUEST_IA32_EFER_HIGH, guest_ia32_efer) ++SHADOW_FIELD_RW(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl) ++SHADOW_FIELD_RW(GUEST_IA32_PERF_GLOBAL_CTRL_HIGH, guest_ia32_perf_global_ctrl) ++SHADOW_FIELD_RW(GUEST_PDPTR0, guest_pdptr0) ++SHADOW_FIELD_RW(GUEST_PDPTR0_HIGH, guest_pdptr0) ++SHADOW_FIELD_RW(GUEST_PDPTR1, guest_pdptr1) ++SHADOW_FIELD_RW(GUEST_PDPTR1_HIGH, guest_pdptr1) ++SHADOW_FIELD_RW(GUEST_PDPTR2, guest_pdptr2) ++SHADOW_FIELD_RW(GUEST_PDPTR2_HIGH, guest_pdptr2) ++SHADOW_FIELD_RW(GUEST_PDPTR3, guest_pdptr3) ++SHADOW_FIELD_RW(GUEST_PDPTR3_HIGH, guest_pdptr3) ++SHADOW_FIELD_RW(GUEST_BNDCFGS, guest_bndcfgs) ++SHADOW_FIELD_RW(GUEST_BNDCFGS_HIGH, guest_bndcfgs) ++ ++/* 32-bits */ ++SHADOW_FIELD_RO(VM_INSTRUCTION_ERROR, vm_instruction_error) 
++SHADOW_FIELD_RO(VM_EXIT_REASON, vm_exit_reason) ++SHADOW_FIELD_RO(VM_EXIT_INTR_INFO, vm_exit_intr_info) ++SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code) ++SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field) ++SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code) ++SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len) ++SHADOW_FIELD_RO(VMX_INSTRUCTION_INFO, vmx_instruction_info) ++ ++/* Natural width */ ++SHADOW_FIELD_RO(EXIT_QUALIFICATION, exit_qualification) ++SHADOW_FIELD_RO(EXIT_IO_RCX, exit_io_rcx) ++SHADOW_FIELD_RO(EXIT_IO_RSI, exit_io_rsi) ++SHADOW_FIELD_RO(EXIT_IO_RDI, exit_io_rdi) ++SHADOW_FIELD_RO(EXIT_IO_RIP, exit_io_rip) ++SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS, guest_linear_address) ++ ++/* 64-bit */ ++SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS, guest_physical_address) ++SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH, guest_physical_address) ++ ++#undef EMULATED_FIELD_RW ++#undef SHADOW_FIELD_RW ++#undef SHADOW_FIELD_RO +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/ptdev.c b/arch/x86/kvm/vmx/pkvm/hyp/ptdev.c +new file mode 100644 +index 000000000000..409fd0af75e9 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/ptdev.c +@@ -0,0 +1,213 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* Copyright(c) 2022 Intel Corporation. */ ++ ++#include ++#include ++#include ++#include "pkvm_hyp.h" ++#include "iommu.h" ++#include "ptdev.h" ++#include "iommu_spgt.h" ++#include "bug.h" ++#include "pci.h" ++ ++#define MAX_PTDEV_NUM (PKVM_MAX_PDEV_NUM + PKVM_MAX_PASID_PDEV_NUM) ++static DEFINE_HASHTABLE(ptdev_hasht, 8); ++static DECLARE_BITMAP(ptdevs_bitmap, MAX_PTDEV_NUM); ++static struct pkvm_ptdev pkvm_ptdev[MAX_PTDEV_NUM]; ++static pkvm_spinlock_t ptdev_lock = { __ARCH_PKVM_SPINLOCK_UNLOCKED }; ++ ++struct pkvm_ptdev *pkvm_alloc_ptdev(u16 bdf, u32 pasid, bool coherency) ++{ ++ struct pkvm_ptdev *ptdev = NULL; ++ unsigned long index; ++ ++ pkvm_spin_lock(&ptdev_lock); ++ ++ index = find_next_zero_bit(ptdevs_bitmap, MAX_PTDEV_NUM, 0); ++ if (index < MAX_PTDEV_NUM) { ++ __set_bit(index, ptdevs_bitmap); ++ ptdev = &pkvm_ptdev[index]; ++ ptdev->bdf = bdf; ++ ptdev->pasid = pasid; ++ ptdev->iommu_coherency = coherency; ++ ptdev->index = index; ++ ptdev->pgt = pkvm_hyp->host_vm.ept; ++ INIT_LIST_HEAD(&ptdev->iommu_node); ++ INIT_LIST_HEAD(&ptdev->vm_node); ++ atomic_set(&ptdev->refcount, 1); ++ pkvm_spinlock_init(&ptdev->lock); ++ hash_add(ptdev_hasht, &ptdev->hnode, bdf); ++ } ++ ++ pkvm_spin_unlock(&ptdev_lock); ++ ++ return ptdev; ++} ++ ++struct pkvm_ptdev *pkvm_get_ptdev(u16 bdf, u32 pasid) ++{ ++ struct pkvm_ptdev *ptdev = NULL, *tmp; ++ ++ pkvm_spin_lock(&ptdev_lock); ++ ++ hash_for_each_possible(ptdev_hasht, tmp, hnode, bdf) { ++ if (match_ptdev(tmp, bdf, pasid)) { ++ ptdev = atomic_inc_not_zero(&tmp->refcount) ? 
tmp : NULL; ++ if (ptdev) ++ break; ++ } ++ } ++ ++ pkvm_spin_unlock(&ptdev_lock); ++ return ptdev; ++} ++ ++void pkvm_put_ptdev(struct pkvm_ptdev *ptdev) ++{ ++ if (!atomic_dec_and_test(&ptdev->refcount)) ++ return; ++ ++ pkvm_spin_lock(&ptdev_lock); ++ ++ hlist_del(&ptdev->hnode); ++ ++ __clear_bit(ptdev->index, ptdevs_bitmap); ++ ++ if (ptdev->pgt != pkvm_hyp->host_vm.ept) ++ pkvm_put_host_iommu_spgt(ptdev->pgt, ptdev->iommu_coherency); ++ ++ memset(ptdev, 0, sizeof(struct pkvm_ptdev)); ++ ++ pkvm_spin_unlock(&ptdev_lock); ++} ++ ++void pkvm_setup_ptdev_vpgt(struct pkvm_ptdev *ptdev, unsigned long root_gpa, ++ struct pkvm_mm_ops *mm_ops, struct pkvm_pgtable_ops *paging_ops, ++ struct pkvm_pgtable_cap *cap, bool shadowed) ++{ ++ pkvm_spin_lock(&ptdev->lock); ++ ++ if (ptdev->pgt != pkvm_hyp->host_vm.ept && ++ (!shadowed || root_gpa != ptdev->vpgt.root_pa) && ++ !ptdev_attached_to_vm(ptdev)) { ++ pkvm_put_host_iommu_spgt(ptdev->pgt, ptdev->iommu_coherency); ++ ptdev->pgt = pkvm_hyp->host_vm.ept; ++ } ++ ++ if (!root_gpa || root_gpa == INVALID_ADDR || !mm_ops || !paging_ops || !cap) { ++ memset(&ptdev->vpgt, 0, sizeof(struct pkvm_pgtable)); ++ goto out; ++ } ++ ++ ptdev->vpgt.root_pa = root_gpa; ++ PKVM_ASSERT(pkvm_pgtable_init(&ptdev->vpgt, mm_ops, paging_ops, cap, false) == 0); ++ ++ if (shadowed && ptdev->pgt == pkvm_hyp->host_vm.ept) { ++ ptdev->pgt = pkvm_get_host_iommu_spgt(root_gpa, ptdev->iommu_coherency); ++ PKVM_ASSERT(ptdev->pgt); ++ } ++out: ++ pkvm_spin_unlock(&ptdev->lock); ++} ++ ++void pkvm_setup_ptdev_did(struct pkvm_ptdev *ptdev, u16 did) ++{ ++ ptdev->did = did; ++} ++ ++static void pkvm_ptdev_cache_bar(struct pkvm_ptdev *ptdev) ++{ ++ u32 offset; ++ int i; ++ ++ for (i = 0; i < 6; i++) { ++ offset = 0x10 + 4 * i; ++ ptdev->bars[i] = pkvm_pci_cfg_space_read(ptdev->bdf, offset, 4); ++ } ++} ++ ++/* ++ * pkvm_detach_ptdev() - detach a ptdev from the shadow VM it is attached. ++ * Basically it reverts what pkvm_attach_ptdev() does. ++ * ++ * @ptdev: The target ptdev. ++ * @vm: The shadow VM which will be attached to. ++ */ ++void pkvm_detach_ptdev(struct pkvm_ptdev *ptdev, struct pkvm_shadow_vm *vm) ++{ ++ /* Reset what the attach API has set */ ++ pkvm_spin_lock(&ptdev->lock); ++ ptdev->shadow_vm_handle = 0; ++ ptdev->pgt = pkvm_hyp->host_vm.ept; ++ pkvm_spin_unlock(&ptdev->lock); ++ ++ pkvm_shadow_vm_unlink_ptdev(vm, &ptdev->vm_node, ++ ptdev->iommu_coherency); ++ pkvm_iommu_sync(ptdev->bdf, ptdev->pasid); ++ ++ pkvm_put_ptdev(ptdev); ++} ++ ++/* ++ * pkvm_attach_ptdev() - attach a ptdev to a shadow VM so it will be isolated ++ * from the primary VM. ++ * ++ * @bdf: The bdf of this ptdev. ++ * @pasid: The pasid of this ptdev. ++ * @vm: The shadow VM which will be attached to. ++ * ++ * FIXME: ++ * The passthrough devices attached to the protected VM is relying on KVM ++ * high to send vmcall so that pKVM can know which device should be isolated. ++ * But if KVM high has created a passthrough device for a protected VM without ++ * using this vmcall to notify pKVM, pKVM should still be able to isolate this ++ * passthrough device. To guarantee this, either needs pKVM to know the ++ * passthrough devices information to isolate them independently or needs ++ * protected VM to check with pKVM about its passthrough device info through ++ * some vmcall. Currently neither way is available. 
++ */ ++int pkvm_attach_ptdev(u16 bdf, u32 pasid, struct pkvm_shadow_vm *vm) ++{ ++ struct pkvm_ptdev *ptdev = pkvm_get_ptdev(bdf, pasid); ++ ++ if (!ptdev) { ++ ptdev = pkvm_alloc_ptdev(bdf, pasid, ++ pkvm_iommu_coherency(bdf, pasid)); ++ if (!ptdev) ++ return -ENODEV; ++ } ++ ++ pkvm_spin_lock(&ptdev->lock); ++ ++ if (cmpxchg(&ptdev->shadow_vm_handle, 0, vm->shadow_vm_handle) != 0) { ++ pkvm_err("%s: ptdev with bdf 0x%x pasid 0x%x is already attached\n", ++ __func__, bdf, pasid); ++ pkvm_spin_unlock(&ptdev->lock); ++ pkvm_put_ptdev(ptdev); ++ return -ENODEV; ++ } ++ ++ pkvm_ptdev_cache_bar(ptdev); ++ ++ PKVM_ASSERT(ptdev->pgt != &vm->pgstate_pgt); ++ if (ptdev->pgt != pkvm_hyp->host_vm.ept) ++ pkvm_put_host_iommu_spgt(ptdev->pgt, ptdev->iommu_coherency); ++ ++ /* ++ * Reset pgt of this ptdev to VM's pgstate_pgt so need to update ++ * IOMMU page table accordingly. ++ */ ++ ptdev->pgt = &vm->pgstate_pgt; ++ ++ pkvm_spin_unlock(&ptdev->lock); ++ ++ pkvm_shadow_vm_link_ptdev(vm, &ptdev->vm_node, ++ ptdev->iommu_coherency); ++ if (pkvm_iommu_sync(ptdev->bdf, ptdev->pasid)) { ++ pkvm_detach_ptdev(ptdev, vm); ++ return -ENODEV; ++ } ++ ++ return 0; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/ptdev.h b/arch/x86/kvm/vmx/pkvm/hyp/ptdev.h +new file mode 100644 +index 000000000000..bfefcf7346c1 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/ptdev.h +@@ -0,0 +1,53 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* Copyright(c) 2022 Intel Corporation. */ ++ ++#ifndef _PKVM_PTDEV_H_ ++#define _PKVM_PTDEV_H_ ++ ++#include "pkvm_hyp.h" ++#include "pgtable.h" ++ ++struct pkvm_ptdev { ++ atomic_t refcount; ++ struct hlist_node hnode; ++ u16 did; ++ u16 bdf; ++ u32 pasid; ++ unsigned long index; ++ struct list_head iommu_node; ++ bool iommu_coherency; ++ /* cached value of BARs when attach to shadow vm */ ++ u32 bars[6]; ++ ++ /* Represents the page table maintained by primary VM */ ++ struct pkvm_pgtable vpgt; ++ /* Represents the page table maintained by pKVM */ ++ struct pkvm_pgtable *pgt; ++ ++ pkvm_spinlock_t lock; ++ ++ int shadow_vm_handle; ++ struct list_head vm_node; ++}; ++ ++struct pkvm_ptdev *pkvm_alloc_ptdev(u16 bdf, u32 pasid, bool coherency); ++struct pkvm_ptdev *pkvm_get_ptdev(u16 bdf, u32 pasid); ++void pkvm_put_ptdev(struct pkvm_ptdev *ptdev); ++void pkvm_setup_ptdev_vpgt(struct pkvm_ptdev *ptdev, unsigned long root_gpa, ++ struct pkvm_mm_ops *mm_ops, struct pkvm_pgtable_ops *paging_ops, ++ struct pkvm_pgtable_cap *cap, bool shadowed); ++void pkvm_setup_ptdev_did(struct pkvm_ptdev *ptdev, u16 did); ++void pkvm_detach_ptdev(struct pkvm_ptdev *ptdev, struct pkvm_shadow_vm *vm); ++int pkvm_attach_ptdev(u16 bdf, u32 pasid, struct pkvm_shadow_vm *vm); ++ ++static inline bool match_ptdev(struct pkvm_ptdev *ptdev, u16 bdf, u32 pasid) ++{ ++ return ptdev && (ptdev->bdf == bdf) && (ptdev->pasid == pasid); ++} ++ ++static inline bool ptdev_attached_to_vm(struct pkvm_ptdev *ptdev) ++{ ++ /* Attached ptdev has non-zero shadow_vm_handle */ ++ return cmpxchg(&ptdev->shadow_vm_handle, 0, 0) != 0; ++} ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/trace.c b/arch/x86/kvm/vmx/pkvm/hyp/trace.c +new file mode 100644 +index 000000000000..c4ef27e4d1c1 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/trace.c +@@ -0,0 +1,117 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#include ++#include ++#include ++#include ++ ++struct vmexit_perf { ++ struct perf_data l1data; ++ struct perf_data l2data; ++ struct perf_data *cur; ++ bool on; ++ bool start; ++ int cpu; ++ 
pkvm_spinlock_t lock; ++}; ++static struct vmexit_perf hvcpu_perf[CONFIG_NR_CPUS]; ++ ++static inline unsigned long long pkvm_rdtsc_ordered(void) ++{ ++ DECLARE_ARGS(val, low, high); ++ ++ asm volatile("lfence;rdtsc" : EAX_EDX_RET(val, low, high)); ++ ++ return EAX_EDX_VAL(val, low, high); ++} ++ ++void trace_vmexit_start(struct kvm_vcpu *vcpu, bool nested_vmexit) ++{ ++ int cpu = vcpu->cpu; ++ struct vmexit_perf *perf = &hvcpu_perf[cpu]; ++ ++ if (!perf->on) ++ return; ++ ++ perf->start = true; ++ perf->cpu = cpu; ++ if (nested_vmexit) ++ perf->cur = &perf->l2data; ++ else ++ perf->cur = &perf->l1data; ++ ++ pkvm_spin_lock(&perf->lock); ++ perf->cur->tsc = pkvm_rdtsc_ordered(); ++ pkvm_spin_unlock(&perf->lock); ++} ++ ++void trace_vmexit_end(struct kvm_vcpu *vcpu, u32 index) ++{ ++ int cpu = vcpu->cpu; ++ struct vmexit_perf *perf = &hvcpu_perf[cpu]; ++ struct perf_data *perf_data = perf->cur; ++ unsigned long long cycles; ++ ++ if (!perf->on || !perf->start || !perf_data) ++ return; ++ ++ pkvm_spin_lock(&perf->lock); ++ cycles = pkvm_rdtsc_ordered() - perf_data->tsc; ++ perf_data->data.cycles[index] += cycles; ++ perf_data->data.total_cycles += cycles; ++ perf_data->data.total_count++; ++ perf_data->data.reasons[index]++; ++ pkvm_spin_unlock(&perf->lock); ++} ++ ++void pkvm_handle_set_vmexit_trace(struct kvm_vcpu *vcpu, bool en) ++{ ++ int cpu = vcpu->cpu; ++ struct vmexit_perf *perf = &hvcpu_perf[cpu]; ++ ++ if (en && !perf->on) { ++ perf->on = true; ++ pkvm_dbg("%s: CPU%d enable vmexit_trace\n", __func__, cpu); ++ memset(&perf->l1data, 0, sizeof(struct perf_data)); ++ memset(&perf->l2data, 0, sizeof(struct perf_data)); ++ return; ++ } ++ ++ if (!en && perf->on) { ++ perf->on = false; ++ perf->start = false; ++ pkvm_dbg("%s: CPU%d disable vmexit_trace\n", __func__, cpu); ++ return; ++ } ++} ++ ++void pkvm_handle_dump_vmexit_trace(unsigned long pa, unsigned long size) ++{ ++ void *out = pkvm_phys_to_virt(pa); ++ struct pkvm_host_vcpu *p; ++ struct vmexit_perf *perf; ++ int cpu, index; ++ ++ for (index = 0; index < CONFIG_NR_CPUS; index++) { ++ p = pkvm_hyp->host_vm.host_vcpus[index]; ++ if (!p) ++ continue; ++ ++ cpu = p->vmx.vcpu.cpu; ++ perf = &hvcpu_perf[cpu]; ++ ++ pkvm_spin_lock(&perf->lock); ++ if (size >= sizeof(struct vmexit_perf_dump)) { ++ struct vmexit_perf_dump *dump = out; ++ ++ memcpy(&dump->l1data, &perf->l1data, sizeof(struct perf_data)); ++ memcpy(&dump->l2data, &perf->l2data, sizeof(struct perf_data)); ++ dump->cpu = perf->cpu; ++ out += sizeof(struct vmexit_perf_dump); ++ size -= sizeof(struct vmexit_perf_dump); ++ } ++ pkvm_spin_unlock(&perf->lock); ++ } ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/trace.h b/arch/x86/kvm/vmx/pkvm/hyp/trace.h +new file mode 100644 +index 000000000000..970d2e770844 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/trace.h +@@ -0,0 +1,15 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _HYP_TRACE_H_ ++#define _HYP_TRACE_H_ ++ ++#include ++ ++void trace_vmexit_start(struct kvm_vcpu *vcpu, bool nested_vmexit); ++void trace_vmexit_end(struct kvm_vcpu *vcpu, u32 index); ++void pkvm_handle_set_vmexit_trace(struct kvm_vcpu *vcpu, bool en); ++void pkvm_handle_dump_vmexit_trace(unsigned long pa, unsigned long size); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c +new file mode 100644 +index 000000000000..77324f75424b +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c +@@ -0,0 +1,360 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * 
Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++#include ++#include "trace.h" ++#include "vmexit.h" ++ ++#include "pkvm_hyp.h" ++#include "vmsr.h" ++#include "nested.h" ++#include "ept.h" ++#include "iommu.h" ++#include "lapic.h" ++#include "io_emulate.h" ++#include "debug.h" ++ ++#define CR0 0 ++#define CR3 3 ++#define CR4 4 ++ ++#define MOV_TO_CR 0 ++ ++extern int __pkvm_init_finalise(struct kvm_vcpu *vcpu, ++ phys_addr_t phys, unsigned long size); ++ ++static void skip_emulated_instruction(void) ++{ ++ unsigned long rip; ++ ++ rip = vmcs_readl(GUEST_RIP); ++ rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); ++ vmcs_writel(GUEST_RIP, rip); ++} ++ ++static void handle_cpuid(struct kvm_vcpu *vcpu) ++{ ++ u32 eax, ebx, ecx, edx; ++ ++ eax = vcpu->arch.regs[VCPU_REGS_RAX]; ++ ecx = vcpu->arch.regs[VCPU_REGS_RCX]; ++ native_cpuid(&eax, &ebx, &ecx, &edx); ++ vcpu->arch.regs[VCPU_REGS_RAX] = eax; ++ vcpu->arch.regs[VCPU_REGS_RBX] = ebx; ++ vcpu->arch.regs[VCPU_REGS_RCX] = ecx; ++ vcpu->arch.regs[VCPU_REGS_RDX] = edx; ++} ++ ++static void handle_cr(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ unsigned long exit_qual, val; ++ int cr; ++ int type; ++ int reg; ++ unsigned long old_value; ++ ++ exit_qual = vmx->exit_qualification; ++ cr = exit_qual & 15; ++ type = (exit_qual >> 4) & 3; ++ reg = (exit_qual >> 8) & 15; ++ ++ switch (type) { ++ case MOV_TO_CR: ++ switch (cr) { ++ case CR0: ++ old_value = vmcs_readl(GUEST_CR0); ++ val = vcpu->arch.regs[reg]; ++ break; ++ case CR4: ++ old_value = vmcs_readl(GUEST_CR4); ++ val = vcpu->arch.regs[reg]; ++ /* ++ * VMXE bit is owned by host, others are owned by guest ++ * So only when guest is trying to modify VMXE bit it ++ * can cause vmexit and get here. ++ */ ++ vmcs_writel(CR4_READ_SHADOW, val); ++ break; ++ default: ++ break; ++ } ++ break; ++ default: ++ break; ++ } ++} ++ ++static unsigned long handle_vmcall(struct kvm_vcpu *vcpu) ++{ ++ u64 nr, a0, a1, a2, a3; ++ unsigned long ret = 0; ++ ++ nr = vcpu->arch.regs[VCPU_REGS_RAX]; ++ a0 = vcpu->arch.regs[VCPU_REGS_RBX]; ++ a1 = vcpu->arch.regs[VCPU_REGS_RCX]; ++ a2 = vcpu->arch.regs[VCPU_REGS_RDX]; ++ a3 = vcpu->arch.regs[VCPU_REGS_RSI]; ++ ++ switch (nr) { ++ case PKVM_HC_SET_VMEXIT_TRACE: ++ pkvm_handle_set_vmexit_trace(vcpu, a0); ++ break; ++ case PKVM_HC_DUMP_VMEXIT_TRACE: ++ pkvm_handle_dump_vmexit_trace(a0, a1); ++ break; ++ case PKVM_HC_INIT_FINALISE: ++ __pkvm_init_finalise(vcpu, a0, a1); ++ break; ++ case PKVM_HC_INIT_SHADOW_VM: ++ ret = __pkvm_init_shadow_vm(vcpu, a0, a1, a2); ++ break; ++ case PKVM_HC_INIT_SHADOW_VCPU: ++ ret = __pkvm_init_shadow_vcpu(vcpu, a0, a1, a2, a3); ++ break; ++ case PKVM_HC_TEARDOWN_SHADOW_VM: ++ ret = __pkvm_teardown_shadow_vm(a0); ++ break; ++ case PKVM_HC_TEARDOWN_SHADOW_VCPU: ++ ret = __pkvm_teardown_shadow_vcpu(a0); ++ break; ++ case PKVM_HC_MMIO_ACCESS: ++ ret = pkvm_access_iommu(a0, a1, a2, a3); ++ break; ++ case PKVM_HC_ACTIVATE_IOMMU: ++ ret = pkvm_activate_iommu(); ++ break; ++ case PKVM_HC_TLB_REMOTE_FLUSH_RANGE: ++ nested_invalidate_shadow_ept(a0, a1, a2); ++ break; ++ case PKVM_HC_SET_MMIO_VE: ++ pkvm_shadow_clear_suppress_ve(vcpu, a0); ++ break; ++ case PKVM_HC_ADD_PTDEV: ++ ret = pkvm_add_ptdev(a0, a1, a2); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++ ++static void handle_xsetbv(struct kvm_vcpu *vcpu) ++{ ++ u32 eax = (u32)(vcpu->arch.regs[VCPU_REGS_RAX] & -1u); ++ u32 edx = (u32)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u); ++ u32 ecx = (u32)(vcpu->arch.regs[VCPU_REGS_RCX] & -1u); ++ ++ asm 
volatile(".byte 0x0f,0x01,0xd1" ++ : : "a" (eax), "d" (edx), "c" (ecx)); ++} ++ ++static void handle_irq_window(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ u32 cpu_based_exec_ctrl = exec_controls_get(vmx); ++ ++ exec_controls_set(vmx, cpu_based_exec_ctrl & ~CPU_BASED_INTR_WINDOW_EXITING); ++ pkvm_dbg("%s: CPU%d clear irq_window_exiting\n", __func__, vcpu->cpu); ++} ++ ++static void handle_nmi_window(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ u32 cpu_based_exec_ctrl = exec_controls_get(vmx); ++ ++ exec_controls_set(vmx, cpu_based_exec_ctrl & ~CPU_BASED_NMI_WINDOW_EXITING); ++ pkvm_dbg("%s: CPU%d clear nmi_window_exiting\n", __func__, vcpu->cpu); ++} ++ ++static void handle_pending_events(struct kvm_vcpu *vcpu) ++{ ++ struct pkvm_host_vcpu *pkvm_host_vcpu = to_pkvm_hvcpu(vcpu); ++ ++ if (!is_guest_mode(vcpu) && pkvm_host_vcpu->pending_nmi) { ++ /* Inject if NMI is not blocked */ ++ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, ++ INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); ++ pkvm_host_vcpu->pending_nmi = false; ++ } ++ ++ if (kvm_check_request(PKVM_REQ_TLB_FLUSH_HOST_EPT, vcpu)) ++ pkvm_flush_host_ept(); ++ if (kvm_check_request(PKVM_REQ_TLB_FLUSH_SHADOW_EPT, vcpu)) ++ nested_flush_shadow_ept(vcpu); ++} ++ ++static inline void set_vcpu_mode(struct kvm_vcpu *vcpu, int mode) ++{ ++ vcpu->mode = mode; ++ /* ++ * Make sure vcpu->mode is set before checking/handling the pending ++ * requests. Pairs with kvm_vcpu_exiting_guest_mode(). ++ */ ++ smp_wmb(); ++} ++ ++/* we take use of kvm_vcpu structure, but not used all the fields */ ++int pkvm_main(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ int launch = 1; ++ ++ vcpu->mode = IN_GUEST_MODE; ++ ++ do { ++ bool skip_instruction = false, guest_exit = false; ++ ++ if (__pkvm_vmx_vcpu_run(vcpu->arch.regs, launch)) { ++ pkvm_err("%s: CPU%d run_vcpu failed with error 0x%x\n", ++ __func__, vcpu->cpu, vmcs_read32(VM_INSTRUCTION_ERROR)); ++ return -EINVAL; ++ } ++ ++ vcpu->arch.cr2 = native_read_cr2(); ++ ++ trace_vmexit_start(vcpu, is_guest_mode(vcpu) ? true : false); ++ ++ set_vcpu_mode(vcpu, OUTSIDE_GUEST_MODE); ++ ++ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); ++ vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); ++ ++ vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); ++ vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); ++ ++ if (is_guest_mode(vcpu)) { ++ guest_exit = true; ++ nested_vmexit(vcpu, &skip_instruction); ++ } else { ++ switch (vmx->exit_reason.full) { ++ case EXIT_REASON_INIT_SIGNAL: ++ /* ++ * INIT is used as kick when making a request. ++ * So just break the vmexits and go to pending ++ * events handling. 
++ */ ++ break; ++ case EXIT_REASON_CPUID: ++ handle_cpuid(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_CR_ACCESS: ++ pkvm_dbg("CPU%d vmexit_reason: CR_ACCESS.\n", vcpu->cpu); ++ handle_cr(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_MSR_READ: ++ pkvm_dbg("CPU%d vmexit_reason: MSR_READ 0x%lx\n", ++ vcpu->cpu, vcpu->arch.regs[VCPU_REGS_RCX]); ++ handle_read_msr(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_MSR_WRITE: ++ pkvm_dbg("CPU%d vmexit_reason: MSR_WRITE 0x%lx\n", ++ vcpu->cpu, vcpu->arch.regs[VCPU_REGS_RCX]); ++ handle_write_msr(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_VMLAUNCH: ++ handle_vmlaunch(vcpu); ++ break; ++ case EXIT_REASON_VMRESUME: ++ handle_vmresume(vcpu); ++ break; ++ case EXIT_REASON_VMON: ++ pkvm_dbg("CPU%d vmexit reason: VMXON.\n", vcpu->cpu); ++ handle_vmxon(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_VMOFF: ++ pkvm_dbg("CPU%d vmexit reason: VMXOFF.\n", vcpu->cpu); ++ handle_vmxoff(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_VMPTRLD: ++ pkvm_dbg("CPU%d vmexit reason: VMPTRLD.\n", vcpu->cpu); ++ handle_vmptrld(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_VMCLEAR: ++ pkvm_dbg("CPU%d vmexit reason: VMCLEAR.\n", vcpu->cpu); ++ handle_vmclear(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_VMREAD: ++ pkvm_dbg("CPU%d vmexit reason: VMREAD.\n", vcpu->cpu); ++ handle_vmread(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_VMWRITE: ++ pkvm_dbg("CPU%d vmexit reason: VMWRITE.\n", vcpu->cpu); ++ handle_vmwrite(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_XSETBV: ++ handle_xsetbv(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_VMCALL: ++ vcpu->arch.regs[VCPU_REGS_RAX] = handle_vmcall(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_INTERRUPT_WINDOW: ++ handle_irq_window(vcpu); ++ break; ++ case EXIT_REASON_NMI_WINDOW: ++ handle_nmi_window(vcpu); ++ break; ++ case EXIT_REASON_INVEPT: ++ handle_invept(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_INVVPID: ++ handle_invvpid(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_EPT_VIOLATION: ++ if (handle_host_ept_violation(vcpu, &skip_instruction)) ++ pkvm_err("pkvm: handle host ept violation failed"); ++ break; ++ case EXIT_REASON_IO_INSTRUCTION: ++ if (handle_host_pio(vcpu)) ++ pkvm_err("pkvm: handle host port I/O access failed."); ++ skip_instruction = true; ++ break; ++ default: ++ pkvm_dbg("CPU%d: Unsupported vmexit reason 0x%x.\n", vcpu->cpu, vmx->exit_reason.full); ++ skip_instruction = true; ++ break; ++ } ++ } ++ ++ if (skip_instruction) ++ skip_emulated_instruction(); ++handle_events: ++ handle_pending_events(vcpu); ++ ++ set_vcpu_mode(vcpu, IN_GUEST_MODE); ++ ++ if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)) ++ goto handle_events; ++ ++ /* ++ * L2 VMExit -> L1 VMEntry and L1 VMExit -> L1 VMEntry: vmresume. ++ * L2 VMExit -> L2 VMEntry: vmresume ++ * L1 VMExit -> L2 VMEntry: vmlaunch, as vmcs02 is cleared every time ++ */ ++ launch = !is_guest_mode(vcpu) ? 0 : (guest_exit ?
0 : 1); ++ ++ native_write_cr2(vcpu->arch.cr2); ++ trace_vmexit_end(vcpu, vmx->exit_reason.basic); ++ } while (1); ++ ++ return 0; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmexit.h b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.h +new file mode 100644 +index 000000000000..95a27c2ac112 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.h +@@ -0,0 +1,11 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#ifndef _PKVM_VMEXIT_H_ ++#define _PKVM_VMEXIT_H_ ++ ++int __pkvm_vmx_vcpu_run(unsigned long *regs, int launch); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmsr.c b/arch/x86/kvm/vmx/pkvm/hyp/vmsr.c +new file mode 100644 +index 000000000000..10a035aee7ec +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/vmsr.c +@@ -0,0 +1,120 @@ ++/* ++ * SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 ++ * Copyright (C) 2018-2022 Intel Corporation ++ */ ++ ++#include ++#include "cpu.h" ++#include "nested.h" ++#include "lapic.h" ++#include "debug.h" ++ ++#define INTERCEPT_DISABLE (0U) ++#define INTERCEPT_READ (1U << 0U) ++#define INTERCEPT_WRITE (1U << 1U) ++#define INTERCEPT_READ_WRITE (INTERCEPT_READ | INTERCEPT_WRITE) ++ ++static unsigned int emulated_ro_guest_msrs[] = { ++ LIST_OF_VMX_MSRS, ++}; ++ ++static unsigned int emulated_wo_guest_msrs[] = { ++ MSR_IA32_APICBASE, ++ (APIC_BASE_MSR + (APIC_ID >> 4)), ++}; ++ ++static void enable_msr_interception(u8 *bitmap, unsigned int msr_arg, unsigned int mode) ++{ ++ unsigned int read_offset = 0U; ++ unsigned int write_offset = 2048U; ++ unsigned int msr = msr_arg; ++ u8 msr_bit; ++ unsigned int msr_index; ++ ++ if ((msr <= 0x1FFFU) || ((msr >= 0xc0000000U) && (msr <= 0xc0001fffU))) { ++ if ((msr & 0xc0000000U) != 0U) { ++ read_offset = read_offset + 1024U; ++ write_offset = write_offset + 1024U; ++ } ++ ++ msr &= 0x1FFFU; ++ msr_bit = (u8)(1U << (msr & 0x7U)); ++ msr_index = msr >> 3U; ++ ++ if ((mode & INTERCEPT_READ) == INTERCEPT_READ) { ++ bitmap[read_offset + msr_index] |= msr_bit; ++ } else { ++ bitmap[read_offset + msr_index] &= ~msr_bit; ++ } ++ ++ if ((mode & INTERCEPT_WRITE) == INTERCEPT_WRITE) { ++ bitmap[write_offset + msr_index] |= msr_bit; ++ } else { ++ bitmap[write_offset + msr_index] &= ~msr_bit; ++ } ++ } else { ++ pkvm_err("%s, Invalid MSR: 0x%x", __func__, msr); ++ } ++} ++ ++int handle_read_msr(struct kvm_vcpu *vcpu) ++{ ++ unsigned long msr = vcpu->arch.regs[VCPU_REGS_RCX]; ++ int ret = 0; ++ u32 low = 0, high = 0; ++ u64 val; ++ ++ /* For non-supported MSRs, return low=high=0 by default */ ++ if (is_vmx_msr(msr)) { ++ ret = read_vmx_msr(vcpu, msr, &val); ++ if (!ret) { ++ low = (u32)val; ++ high = (u32)(val >> 32); ++ } ++ } ++ ++ pkvm_dbg("%s: CPU%d Value of msr 0x%lx: low=0x%x, high=0x%x\n", __func__, vcpu->cpu, msr, low, high); ++ ++ vcpu->arch.regs[VCPU_REGS_RAX] = low; ++ vcpu->arch.regs[VCPU_REGS_RDX] = high; ++ ++ return ret; ++} ++ ++int handle_write_msr(struct kvm_vcpu *vcpu) ++{ ++ unsigned long msr = vcpu->arch.regs[VCPU_REGS_RCX]; ++ u32 low, high; ++ u64 val; ++ int ret = 0; ++ ++ low = vcpu->arch.regs[VCPU_REGS_RAX]; ++ high = vcpu->arch.regs[VCPU_REGS_RDX]; ++ val = low | ((u64)high << 32); ++ ++ switch (msr) { ++ case MSR_IA32_APICBASE: ++ pkvm_apic_base_msr_write(vcpu, val); ++ break; ++ case APIC_BASE_MSR ... 
APIC_BASE_MSR + 0xff: ++ ret = pkvm_x2apic_msr_write(vcpu, msr, val); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++void init_msr_emulation(struct vcpu_vmx *vmx) ++{ ++ int i; ++ u8 *bitmap = (u8 *)vmx->loaded_vmcs->msr_bitmap; ++ ++ for (i = 0; i < ARRAY_SIZE(emulated_ro_guest_msrs); i++) { ++ enable_msr_interception(bitmap, emulated_ro_guest_msrs[i], INTERCEPT_READ); ++ } ++ ++ for (i = 0; i < ARRAY_SIZE(emulated_wo_guest_msrs); i++) ++ enable_msr_interception(bitmap, emulated_wo_guest_msrs[i], INTERCEPT_WRITE); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmsr.h b/arch/x86/kvm/vmx/pkvm/hyp/vmsr.h +new file mode 100644 +index 000000000000..2a8f947fb17a +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/vmsr.h +@@ -0,0 +1,11 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_VMSR_H_ ++#define _PKVM_VMSR_H_ ++ ++int handle_read_msr(struct kvm_vcpu *vcpu); ++int handle_write_msr(struct kvm_vcpu *vcpu); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmx.c b/arch/x86/kvm/vmx/pkvm/hyp/vmx.c +new file mode 100644 +index 000000000000..4ad38578d0e7 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/vmx.c +@@ -0,0 +1,79 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ */ ++ ++#include ++#include "cpu.h" ++ ++void init_contant_host_state_area(struct pkvm_pcpu *pcpu, int cpu) ++{ ++ unsigned long a; ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++ u32 high, low; ++ struct desc_ptr dt; ++ u16 selector; ++#endif ++ ++ vmcs_writel(HOST_CR0, native_read_cr0() & ~X86_CR0_TS); ++ vmcs_writel(HOST_CR3, pcpu->cr3); ++ vmcs_writel(HOST_CR4, native_read_cr4()); ++ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++ savesegment(cs, selector); ++ vmcs_write16(HOST_CS_SELECTOR, selector); ++ savesegment(ss, selector); ++ vmcs_write16(HOST_SS_SELECTOR, selector); ++ savesegment(ds, selector); ++ vmcs_write16(HOST_DS_SELECTOR, selector); ++ savesegment(es, selector); ++ vmcs_write16(HOST_ES_SELECTOR, selector); ++ savesegment(fs, selector); ++ vmcs_write16(HOST_FS_SELECTOR, selector); ++ pkvm_rdmsrl(MSR_FS_BASE, a); ++ vmcs_writel(HOST_FS_BASE, a); ++ savesegment(gs, selector); ++ vmcs_write16(HOST_GS_SELECTOR, selector); ++ pkvm_rdmsrl(MSR_GS_BASE, a); ++ vmcs_writel(HOST_GS_BASE, a); ++ ++ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); ++ vmcs_writel(HOST_TR_BASE, (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); ++ ++ native_store_gdt(&dt); ++ vmcs_writel(HOST_GDTR_BASE, dt.address); ++ vmcs_writel(HOST_IDTR_BASE, (unsigned long)(&pcpu->idt_page)); ++ ++ pkvm_rdmsr(MSR_IA32_SYSENTER_CS, low, high); ++ vmcs_write32(HOST_IA32_SYSENTER_CS, low); ++ ++ pkvm_rdmsrl(MSR_IA32_SYSENTER_ESP, a); ++ vmcs_writel(HOST_IA32_SYSENTER_ESP, a); ++ ++ pkvm_rdmsrl(MSR_IA32_SYSENTER_EIP, a); ++ vmcs_writel(HOST_IA32_SYSENTER_EIP, a); ++#else ++ vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); ++ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); ++ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); ++ vmcs_write16(HOST_ES_SELECTOR, 0); ++ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); ++ vmcs_write16(HOST_FS_SELECTOR, 0); ++ vmcs_write16(HOST_GS_SELECTOR, 0); ++ vmcs_writel(HOST_FS_BASE, 0); ++ vmcs_writel(HOST_GS_BASE, 0); ++ ++ vmcs_writel(HOST_TR_BASE, (unsigned long)&pcpu->tss); ++ vmcs_writel(HOST_GDTR_BASE, (unsigned long)(&pcpu->gdt_page)); ++ vmcs_writel(HOST_IDTR_BASE, (unsigned long)(&pcpu->idt_page)); ++ ++ vmcs_write16(HOST_GS_SELECTOR, __KERNEL_DS); ++ vmcs_writel(HOST_GS_BASE, cpu); ++#endif ++ ++ /* MSR area */ ++ pkvm_rdmsrl(MSR_EFER, a); ++ vmcs_write64(HOST_IA32_EFER, a); 
++ ++ pkvm_rdmsrl(MSR_IA32_CR_PAT, a); ++ vmcs_write64(HOST_IA32_PAT, a); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmx.h b/arch/x86/kvm/vmx/pkvm/hyp/vmx.h +new file mode 100644 +index 000000000000..40da630f3c95 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/vmx.h +@@ -0,0 +1,63 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#ifndef __PKVM_VMX_H ++#define __PKVM_VMX_H ++ ++#include "pkvm_hyp.h" ++ ++static inline u64 pkvm_construct_eptp(unsigned long root_hpa, int level) ++{ ++ u64 eptp = 0; ++ ++ if ((level == 4) && vmx_ept_has_4levels()) ++ eptp = VMX_EPTP_PWL_4; ++ else if ((level == 5) && vmx_ept_has_5levels()) ++ eptp = VMX_EPTP_PWL_5; ++ ++ if (vmx_ept_has_mt_wb()) ++ eptp |= VMX_EPTP_MT_WB; ++ ++ eptp |= (root_hpa & PAGE_MASK); ++ ++ return eptp; ++} ++ ++static inline void vmcs_load_track(struct vcpu_vmx *vmx, struct vmcs *vmcs) ++{ ++ struct pkvm_host_vcpu *pkvm_host_vcpu = vmx_to_pkvm_hvcpu(vmx); ++ ++ pkvm_host_vcpu->current_vmcs = vmcs; ++ barrier(); ++ vmcs_load(vmcs); ++} ++ ++static inline void vmcs_clear_track(struct vcpu_vmx *vmx, struct vmcs *vmcs) ++{ ++ struct pkvm_host_vcpu *pkvm_host_vcpu = vmx_to_pkvm_hvcpu(vmx); ++ ++ /* vmcs_clear might clear none current vmcs */ ++ if (pkvm_host_vcpu->current_vmcs == vmcs) ++ pkvm_host_vcpu->current_vmcs = NULL; ++ ++ barrier(); ++ vmcs_clear(vmcs); ++} ++ ++static inline void flush_ept(u64 eptp) ++{ ++ if (vmx_has_invept_context()) ++ __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); ++ else ++ __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); ++} ++ ++static inline u8 pkvm_virt_addr_bits(void) ++{ ++ return (vmcs_readl(GUEST_CR4) & X86_CR4_LA57) ? 57 : 48; ++} ++ ++void init_contant_host_state_area(struct pkvm_pcpu *pcpu, int cpu); ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmx_asm.S b/arch/x86/kvm/vmx/pkvm/hyp/vmx_asm.S +new file mode 100644 +index 000000000000..ad6ae1257a7a +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/vmx_asm.S +@@ -0,0 +1,186 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define WORD_SIZE (BITS_PER_LONG / 8) ++ ++#define VCPU_RAX (__VCPU_REGS_RAX * WORD_SIZE) ++#define VCPU_RCX (__VCPU_REGS_RCX * WORD_SIZE) ++#define VCPU_RDX (__VCPU_REGS_RDX * WORD_SIZE) ++#define VCPU_RBX (__VCPU_REGS_RBX * WORD_SIZE) ++#define VCPU_RBP (__VCPU_REGS_RBP * WORD_SIZE) ++#define VCPU_RSI (__VCPU_REGS_RSI * WORD_SIZE) ++#define VCPU_RDI (__VCPU_REGS_RDI * WORD_SIZE) ++ ++#define VCPU_R8 (__VCPU_REGS_R8 * WORD_SIZE) ++#define VCPU_R9 (__VCPU_REGS_R9 * WORD_SIZE) ++#define VCPU_R10 (__VCPU_REGS_R10 * WORD_SIZE) ++#define VCPU_R11 (__VCPU_REGS_R11 * WORD_SIZE) ++#define VCPU_R12 (__VCPU_REGS_R12 * WORD_SIZE) ++#define VCPU_R13 (__VCPU_REGS_R13 * WORD_SIZE) ++#define VCPU_R14 (__VCPU_REGS_R14 * WORD_SIZE) ++#define VCPU_R15 (__VCPU_REGS_R15 * WORD_SIZE) ++ ++#define HOST_RSP 0x6C14 ++ ++/** ++ * __vmenter - VM-Enter the current loaded VMCS ++ * ++ * Returns: ++ * %RFLAGS.CF is set on VM-Fail Invalid ++ * %RFLAGS.ZF is set on VM-Fail Valid ++ * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit ++ * ++ * Note that VMRESUME/VMLAUNCH fall-through and return directly if ++ * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump ++ * to vmx_vmexit. 
++ */ ++SYM_FUNC_START_LOCAL(__vmenter) ++ /* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */ ++ je 2f ++ ++1: vmresume ++ ANNOTATE_UNRET_SAFE ++ ret ++ ++2: vmlaunch ++ ANNOTATE_UNRET_SAFE ++ ret ++SYM_FUNC_END(__vmenter) ++ ++/** ++ * __pkvm_vmx_vmexit - Handle a VMX VM-Exit ++ * ++ * Returns: ++ * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit ++ * ++ * This is __vmenter's partner in crime. On a VM-Exit, control will jump ++ * here after hardware loads the host's state, i.e. this is the destination ++ * referred to by VMCS.HOST_RIP. ++ */ ++SYM_FUNC_START(__pkvm_vmx_vmexit) ++ ANNOTATE_UNRET_SAFE ++ ret ++SYM_FUNC_END(__pkvm_vmx_vmexit) ++ ++/** ++ * __pkvm_vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode ++ * @regs: unsigned long * (to guest registers) ++ * @launched: %true if the VMCS has been launched ++ * ++ * Returns: ++ * 0 on VM-Exit, 1 on VM-Fail ++ */ ++SYM_FUNC_START(__pkvm_vmx_vcpu_run) ++ push %_ASM_BP ++ mov %_ASM_SP, %_ASM_BP ++ push %r15 ++ push %r14 ++ push %r13 ++ push %r12 ++ ++ push %_ASM_BX ++ ++ push %_ASM_ARG1 ++ ++ /* record host RSP (0x6C14) */ ++ mov $HOST_RSP, %_ASM_BX ++ lea -WORD_SIZE(%_ASM_SP), %_ASM_CX ++ vmwrite %_ASM_CX, %_ASM_BX ++ ++ mov %_ASM_ARG1, %_ASM_CX ++ cmp $1, %_ASM_ARG2 ++ ++ mov VCPU_RAX(%_ASM_CX), %_ASM_AX ++ mov VCPU_RBX(%_ASM_CX), %_ASM_BX ++ mov VCPU_RDX(%_ASM_CX), %_ASM_DX ++ mov VCPU_RSI(%_ASM_CX), %_ASM_SI ++ mov VCPU_RDI(%_ASM_CX), %_ASM_DI ++ mov VCPU_RBP(%_ASM_CX), %_ASM_BP ++ mov VCPU_R8(%_ASM_CX), %r8 ++ mov VCPU_R9(%_ASM_CX), %r9 ++ mov VCPU_R10(%_ASM_CX), %r10 ++ mov VCPU_R11(%_ASM_CX), %r11 ++ mov VCPU_R12(%_ASM_CX), %r12 ++ mov VCPU_R13(%_ASM_CX), %r13 ++ mov VCPU_R14(%_ASM_CX), %r14 ++ mov VCPU_R15(%_ASM_CX), %r15 ++ ++ mov VCPU_RCX(%_ASM_CX), %_ASM_CX ++ ++ call __vmenter ++ ++ /* Jump on VM-Fail. */ ++ jbe 2f ++ ++ push %_ASM_CX ++ mov WORD_SIZE(%_ASM_SP), %_ASM_CX ++ ++ mov %_ASM_AX, VCPU_RAX(%_ASM_CX) ++ mov %_ASM_BX, VCPU_RBX(%_ASM_CX) ++ mov %_ASM_DX, VCPU_RDX(%_ASM_CX) ++ mov %_ASM_SI, VCPU_RSI(%_ASM_CX) ++ mov %_ASM_DI, VCPU_RDI(%_ASM_CX) ++ mov %_ASM_BP, VCPU_RBP(%_ASM_CX) ++ mov %r8 , VCPU_R8(%_ASM_CX) ++ mov %r9 , VCPU_R9(%_ASM_CX) ++ mov %r10, VCPU_R10(%_ASM_CX) ++ mov %r11, VCPU_R11(%_ASM_CX) ++ mov %r12, VCPU_R12(%_ASM_CX) ++ mov %r13, VCPU_R13(%_ASM_CX) ++ mov %r14, VCPU_R14(%_ASM_CX) ++ mov %r15, VCPU_R15(%_ASM_CX) ++ ++ pop VCPU_RCX(%_ASM_CX) ++ ++ /* Clear RAX to indicate VM-Exit (as opposed to VM-Fail). */ ++ xor %eax, %eax ++ ++ /* ++ * Clear all general purpose registers except RSP and RAX to prevent ++ * speculative use of the guest's values, even those that are reloaded ++ * via the stack. In theory, an L1 cache miss when restoring registers ++ * could lead to speculative execution with the guest's values. ++ * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially ++ * free. RSP and RAX are exempt as RSP is restored by hardware during ++ * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail. ++ */ ++1: xor %ebx, %ebx ++ xor %ecx, %ecx ++ xor %edx, %edx ++ xor %esi, %esi ++ xor %edi, %edi ++ xor %ebp, %ebp ++ xor %r8d, %r8d ++ xor %r9d, %r9d ++ xor %r10d, %r10d ++ xor %r11d, %r11d ++ xor %r12d, %r12d ++ xor %r13d, %r13d ++ xor %r14d, %r14d ++ xor %r15d, %r15d ++ ++ /* "POP" @regs. */ ++ add $WORD_SIZE, %_ASM_SP ++ pop %_ASM_BX ++ ++ pop %r12 ++ pop %r13 ++ pop %r14 ++ pop %r15 ++ ++ pop %_ASM_BP ++ ANNOTATE_UNRET_SAFE ++ ret ++ /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. 
*/ ++2: mov $1, %eax ++ jmp 1b ++SYM_FUNC_END(__pkvm_vmx_vcpu_run) +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmx_ops.h b/arch/x86/kvm/vmx/pkvm/hyp/vmx_ops.h +new file mode 100644 +index 000000000000..b99067af3a6b +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/vmx_ops.h +@@ -0,0 +1,173 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_VMX_OPS_H_ ++#define _PKVM_VMX_OPS_H_ ++ ++#include "memory.h" ++#include "debug.h" ++ ++static __always_inline unsigned long __vmcs_readl(unsigned long field) ++{ ++ unsigned long value; ++ ++#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT ++ asm_volatile_goto("1: vmread %[field], %[output]\n\t" ++ "jna %l[do_fail]\n\t" ++ : [output] "=r" (value) ++ : [field] "r" (field) ++ : "cc" ++ : do_fail); ++ ++ return value; ++ ++do_fail: ++ pkvm_err("pkvm: vmread failed: field=%lx\n", field); ++ return 0; ++#else ++ asm volatile ("vmread %%rdx, %%rax " ++ : "=a" (value) ++ : "d"(field) ++ : "cc"); ++ return value; ++#endif ++} ++ ++static __always_inline u16 vmcs_read16(unsigned long field) ++{ ++ vmcs_check16(field); ++ return __vmcs_readl(field); ++} ++ ++static __always_inline u32 vmcs_read32(unsigned long field) ++{ ++ vmcs_check32(field); ++ return __vmcs_readl(field); ++} ++ ++static __always_inline u64 vmcs_read64(unsigned long field) ++{ ++ vmcs_check64(field); ++ return __vmcs_readl(field); ++} ++ ++static __always_inline unsigned long vmcs_readl(unsigned long field) ++{ ++ vmcs_checkl(field); ++ return __vmcs_readl(field); ++} ++ ++static inline void pkvm_vmwrite_error(unsigned long field, unsigned long value) ++{ ++ pkvm_err("pkvm: vmwrite failed: field=%lx val=%lx err=%d\n", ++ field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); ++} ++ ++static inline void pkvm_vmclear_error(struct vmcs *vmcs, u64 phys_addr) ++{ ++ pkvm_err("pkvm: vmclear failed: %p/%llx\n", vmcs, phys_addr); ++} ++ ++static inline void pkvm_vmptrld_error(struct vmcs *vmcs, u64 phys_addr) ++{ ++ pkvm_err("pkvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr); ++} ++ ++static inline void pkvm_invvpid_error(unsigned long ext, u16 vpid, gva_t gva) ++{ ++ pkvm_err("pkvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", ++ ext, vpid, gva); ++} ++ ++static inline void pkvm_invept_error(unsigned long ext, u64 eptp, gpa_t gpa) ++{ ++ pkvm_err("pkvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", ++ ext, eptp, gpa); ++} ++ ++#define vmx_asm1(insn, op1, error_args...) \ ++do { \ ++ asm_volatile_goto(__stringify(insn) " %0\n\t" \ ++ ".byte 0x2e\n\t" /* branch not taken hint */ \ ++ "jna %l[error]\n\t" \ ++ : : op1 : "cc" : error); \ ++ return; \ ++error: \ ++ pkvm_##insn##_error(error_args); \ ++ return; \ ++} while (0) ++ ++#define vmx_asm2(insn, op1, op2, error_args...) 
\ ++do { \ ++ asm_volatile_goto(__stringify(insn) " %1, %0\n\t" \ ++ ".byte 0x2e\n\t" /* branch not taken hint */ \ ++ "jna %l[error]\n\t" \ ++ : : op1, op2 : "cc" : error); \ ++ return; \ ++error: \ ++ pkvm_##insn##_error(error_args); \ ++ return; \ ++} while (0) ++ ++static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) ++{ ++ vmx_asm2(vmwrite, "r"(field), "rm"(value), field, value); ++} ++ ++static __always_inline void vmcs_write16(unsigned long field, u16 value) ++{ ++ vmcs_check16(field); ++ __vmcs_writel(field, value); ++} ++ ++static __always_inline void vmcs_write32(unsigned long field, u32 value) ++{ ++ vmcs_check32(field); ++ __vmcs_writel(field, value); ++} ++ ++static __always_inline void vmcs_write64(unsigned long field, u64 value) ++{ ++ vmcs_check64(field); ++ __vmcs_writel(field, value); ++} ++ ++static __always_inline void vmcs_writel(unsigned long field, unsigned long value) ++{ ++ vmcs_checkl(field); ++ __vmcs_writel(field, value); ++} ++ ++static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) ++{ ++ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, ++ "vmcs_clear_bits does not support 64-bit fields"); ++ __vmcs_writel(field, __vmcs_readl(field) & ~mask); ++} ++ ++static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) ++{ ++ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, ++ "vmcs_set_bits does not support 64-bit fields"); ++ __vmcs_writel(field, __vmcs_readl(field) | mask); ++} ++ ++static inline void vmcs_clear(struct vmcs *vmcs) ++{ ++ u64 phys_addr = __pkvm_pa(vmcs); ++ ++ vmx_asm1(vmclear, "m"(phys_addr), vmcs, phys_addr); ++} ++ ++static inline void vmcs_load(struct vmcs *vmcs) ++{ ++ u64 phys_addr = __pkvm_pa(vmcs); ++ ++ vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr); ++} ++ ++void vpid_sync_context(int vpid); ++void vpid_sync_vcpu_addr(int vpid, gva_t addr); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/include/capabilities.h b/arch/x86/kvm/vmx/pkvm/include/capabilities.h +new file mode 100644 +index 000000000000..4f5c6695f509 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/include/capabilities.h +@@ -0,0 +1,95 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#ifndef _PKVM_CAPS_H_ ++#define _PKVM_CAPS_H_ ++ ++#ifdef __PKVM_HYP__ ++#define PKVM_HYP pkvm_hyp ++#else ++#define PKVM_HYP pkvm_sym(pkvm_hyp) ++#endif ++ ++static inline bool vmx_has_vmwrite_any_field(void) ++{ ++ return !!(PKVM_HYP->vmcs_config.nested.misc_low & ++ MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS); ++} ++ ++static inline bool vmx_ept_capability_check(u32 bit) ++{ ++ struct vmx_capability *vmx_cap = &PKVM_HYP->vmx_cap; ++ ++ return vmx_cap->ept & bit; ++} ++ ++static inline bool vmx_has_invept(void) ++{ ++ return vmx_ept_capability_check(VMX_EPT_INVEPT_BIT); ++} ++ ++static inline bool vmx_has_ept_execute_only(void) ++{ ++ return vmx_ept_capability_check(VMX_EPT_EXECUTE_ONLY_BIT); ++} ++ ++static inline bool vmx_ept_has_4levels(void) ++{ ++ return vmx_ept_capability_check(VMX_EPT_PAGE_WALK_4_BIT); ++} ++ ++static inline bool vmx_ept_has_5levels(void) ++{ ++ return vmx_ept_capability_check(VMX_EPT_PAGE_WALK_5_BIT); ++} ++ ++static inline bool vmx_ept_has_mt_wb(void) ++{ ++ return vmx_ept_capability_check(VMX_EPTP_WB_BIT); ++} ++ ++static inline bool vmx_ept_has_2m_page(void) ++{ ++ return vmx_ept_capability_check(VMX_EPT_2MB_PAGE_BIT); ++} ++ ++static inline bool vmx_ept_has_1g_page(void) ++{ ++ return 
vmx_ept_capability_check(VMX_EPT_1GB_PAGE_BIT); ++} ++ ++static inline bool vmx_has_invept_context(void) ++{ ++ return vmx_ept_capability_check(VMX_EPT_EXTENT_CONTEXT_BIT); ++} ++ ++static inline bool vmx_vpid_capability_check(u32 bit) ++{ ++ struct vmx_capability *vmx_cap = &PKVM_HYP->vmx_cap; ++ ++ return vmx_cap->vpid & bit; ++} ++ ++static inline bool vmx_has_invvpid(void) ++{ ++ return vmx_vpid_capability_check(VMX_VPID_INVVPID_BIT); ++} ++ ++static inline bool vmx_has_invvpid_individual_addr(void) ++{ ++ return vmx_vpid_capability_check(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT); ++} ++ ++static inline bool vmx_has_invvpid_single(void) ++{ ++ return vmx_vpid_capability_check(VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT); ++} ++ ++static inline bool vmx_has_invvpid_global(void) ++{ ++ return vmx_vpid_capability_check(VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT); ++} ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/include/pkvm.h b/arch/x86/kvm/vmx/pkvm/include/pkvm.h +new file mode 100644 +index 000000000000..9ba0678fc492 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/include/pkvm.h +@@ -0,0 +1,155 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#ifndef _PKVM_H_ ++#define _PKVM_H_ ++ ++#include ++#include ++ ++#define STACK_SIZE SZ_16K ++#define PKVM_MAX_IOMMU_NUM 32 ++#define PKVM_MAX_PASID_PDEV_NUM 32 ++#define PKVM_MAX_PDEV_NUM 512 ++#define PKVM_MAX_NORMAL_VM_NUM 8 ++#define PKVM_MAX_SECURE_VM_NUM 2 ++ ++struct pkvm_pgtable_cap { ++ int level; ++ int allowed_pgsz; ++ u64 table_prot; ++}; ++ ++struct idt_page { ++ gate_desc idt[IDT_ENTRIES]; ++} __aligned(PAGE_SIZE); ++ ++struct pkvm_pcpu { ++ u8 stack[STACK_SIZE] __aligned(16); ++ unsigned long cr3; ++ struct gdt_page gdt_page; ++ struct idt_page idt_page; ++ struct tss_struct tss; ++ void *lapic; ++}; ++ ++struct pkvm_host_vcpu { ++ struct vcpu_vmx vmx; ++ struct pkvm_pcpu *pcpu; ++ struct vmcs *vmxarea; ++ struct vmcs *current_vmcs; ++ ++ void *current_shadow_vcpu; ++ ++ bool pending_nmi; ++ u8 *io_bitmap; ++}; ++ ++struct pkvm_pci_info { ++ struct pci_mmcfg_region *mmcfg_table; ++ int mmcfg_table_size; ++}; ++ ++struct pkvm_host_vm { ++ struct pkvm_host_vcpu *host_vcpus[CONFIG_NR_CPUS]; ++ struct pkvm_pgtable *ept; ++ struct pkvm_pgtable *ept_notlbflush; ++ struct pkvm_pci_info pci_info; ++ u8 *io_bitmap; ++}; ++ ++struct pkvm_iommu_info { ++ u64 reg_phys; ++ u64 reg_size; ++}; ++ ++struct pkvm_hyp { ++ int num_cpus; ++ ++ struct vmx_capability vmx_cap; ++ struct vmcs_config vmcs_config; ++ ++ struct pkvm_pgtable_cap mmu_cap; ++ struct pkvm_pgtable_cap ept_cap; ++ ++ struct pkvm_pgtable *mmu; ++ ++ struct pkvm_pcpu *pcpus[CONFIG_NR_CPUS]; ++ ++ struct pkvm_host_vm host_vm; ++ ++ struct pkvm_iommu_info iommu_infos[PKVM_MAX_IOMMU_NUM]; ++ ++ /* ++ * IOMMU works in nested translation mode with sharing ++ * the EPT as second-level page table. So the page table ++ * level and large page size should be supported by both ++ * EPT and IOMMU. 
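++	 *
++	 * The negotiated results are stored in ept_iommu_pgt_level and
++	 * ept_iommu_pgsz_mask below by check_and_init_iommu().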
++ */ ++ int ept_iommu_pgt_level; ++ int ept_iommu_pgsz_mask; ++ ++ bool iommu_coherent; ++}; ++ ++static inline struct pkvm_host_vcpu *vmx_to_pkvm_hvcpu(struct vcpu_vmx *vmx) ++{ ++ return container_of(vmx, struct pkvm_host_vcpu, vmx); ++} ++ ++static inline struct pkvm_host_vcpu *to_pkvm_hvcpu(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ ++ return vmx_to_pkvm_hvcpu(vmx); ++} ++ ++struct pkvm_section { ++ unsigned long type; ++#define PKVM_RESERVED_MEMORY 0UL ++#define PKVM_CODE_DATA_SECTIONS 1UL ++#define KERNEL_DATA_SECTIONS 2UL ++ unsigned long addr; ++ unsigned long size; ++ u64 prot; ++}; ++ ++#define PKVM_PAGES (ALIGN(sizeof(struct pkvm_hyp), PAGE_SIZE) >> PAGE_SHIFT) ++#define PKVM_PCPU_PAGES (ALIGN(sizeof(struct pkvm_pcpu), PAGE_SIZE) >> PAGE_SHIFT) ++#define PKVM_HOST_VCPU_PAGES (ALIGN(sizeof(struct pkvm_host_vcpu), PAGE_SIZE) >> PAGE_SHIFT) ++#define PKVM_HOST_VCPU_VMCS_PAGES 3 /*vmxarea+vmcs+msr_bitmap*/ ++#define PKVM_EXTRA_PAGES 3 /*io_bitmap + mmcfg_table for host vm*/ ++ ++/* ++ * pkvm relocate its own text/data sections to some page aligned ++ * memory area. When creating the page table for pkvm, only create ++ * mapping for its own sections so that the other kernel functions ++ * won't be used and make the pkvm to be self contained. ++ */ ++extern char __pkvm_text_start[], __pkvm_text_end[]; ++extern char __pkvm_rodata_start[], __pkvm_rodata_end[]; ++extern char __pkvm_data_start[], __pkvm_data_end[]; ++extern char __pkvm_bss_start[], __pkvm_bss_end[]; ++ ++extern unsigned long pkvm_sym(__page_base_offset); ++extern unsigned long pkvm_sym(__symbol_base_offset); ++extern struct pkvm_hyp *pkvm_sym(pkvm_hyp); ++extern unsigned long pkvm_sym(__x86_clflush_size); ++ ++PKVM_DECLARE(void, __pkvm_vmx_vmexit(void)); ++PKVM_DECLARE(int, pkvm_main(struct kvm_vcpu *vcpu)); ++PKVM_DECLARE(void, init_contant_host_state_area(struct pkvm_pcpu *pcpu, int cpu)); ++PKVM_DECLARE(int, init_pci(struct pkvm_hyp *pkvm)); ++ ++PKVM_DECLARE(void *, pkvm_early_alloc_contig(unsigned int nr_pages)); ++PKVM_DECLARE(void *, pkvm_early_alloc_page(void)); ++PKVM_DECLARE(void, pkvm_early_alloc_init(void *virt, unsigned long size)); ++ ++PKVM_DECLARE(void, init_msr_emulation(struct vcpu_vmx *vmx)); ++ ++PKVM_DECLARE(void, noop_handler(void)); ++PKVM_DECLARE(void, nmi_handler(void)); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/include/pkvm_trace.h b/arch/x86/kvm/vmx/pkvm/include/pkvm_trace.h +new file mode 100644 +index 000000000000..a924e36eb869 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/include/pkvm_trace.h +@@ -0,0 +1,29 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_TRACE_H_ ++#define _PKVM_TRACE_H_ ++ ++struct vmexit_data { ++ u64 total_count; ++ u64 total_cycles; ++ u64 reasons[74]; ++ u64 cycles[74]; ++}; ++ ++struct perf_data { ++ struct vmexit_data data; ++ unsigned long long tsc; ++}; ++ ++struct vmexit_perf_dump { ++ struct perf_data l1data; ++ struct perf_data l2data; ++ int cpu; ++}; ++ ++#define PKVM_HC_SET_VMEXIT_TRACE 0xabcd0001 ++#define PKVM_HC_DUMP_VMEXIT_TRACE 0xabcd0002 ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/pkvm_constants.c b/arch/x86/kvm/vmx/pkvm/pkvm_constants.c +new file mode 100644 +index 000000000000..746129da4438 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/pkvm_constants.c +@@ -0,0 +1,26 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++#include ++#include "hyp/pkvm_hyp.h" ++#include "hyp/iommu_internal.h" ++ 
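++/*
++ * Constants generator in the style of the kernel's asm-offsets.c: each
++ * DEFINE() below emits an assembler-visible constant which Kbuild turns
++ * into the generated pkvm_constants.h header, keeping the page counts
++ * and structure sizes used for the early pkvm memory reservation in
++ * sync with the hyp structures.
++ */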
++int main(void) ++{ ++ DEFINE(PKVM_PERCPU_PAGES, PKVM_PCPU_PAGES + PKVM_HOST_VCPU_PAGES + PKVM_HOST_VCPU_VMCS_PAGES); ++ DEFINE(PKVM_GLOBAL_PAGES, PKVM_PAGES + PKVM_EXTRA_PAGES); ++ DEFINE(PKVM_VMEMMAP_ENTRY_SIZE, sizeof(struct pkvm_page)); ++ DEFINE(PKVM_SHADOW_VM_SIZE, sizeof(struct pkvm_shadow_vm) + pkvm_shadow_vcpu_array_size()); ++ DEFINE(PKVM_SHADOW_VCPU_STATE_SIZE, sizeof(struct shadow_vcpu_state)); ++ DEFINE(PKVM_IOMMU_NUM, PKVM_MAX_IOMMU_NUM); ++ DEFINE(PKVM_PASIDDEV_NUM, PKVM_MAX_PASID_PDEV_NUM); ++ DEFINE(PKVM_PDEV_NUM, PKVM_MAX_PDEV_NUM); ++ DEFINE(PKVM_IOMMU_QI_DESC_SIZE, PKVM_QI_DESC_ALIGNED_SIZE); ++ DEFINE(PKVM_IOMMU_QI_DESC_STATUS_SIZE, PKVM_QI_DESC_STATUS_ALIGNED_SIZE); ++ DEFINE(PKVM_MAX_VM_NUM, PKVM_MAX_NORMAL_VM_NUM + PKVM_MAX_SECURE_VM_NUM); ++ return 0; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/pkvm_constants.h b/arch/x86/kvm/vmx/pkvm/pkvm_constants.h +new file mode 100644 +index 000000000000..e6f2753b3d6a +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/pkvm_constants.h +@@ -0,0 +1,21 @@ ++#ifndef __PKVM_CONSTANTS_H__ ++#define __PKVM_CONSTANTS_H__ ++/* ++ * DO NOT MODIFY. ++ * ++ * This file was generated by Kbuild ++ */ ++ ++#define PKVM_PERCPU_PAGES 19 /* PKVM_PCPU_PAGES + PKVM_HOST_VCPU_PAGES + PKVM_HOST_VCPU_VMCS_PAGES */ ++#define PKVM_GLOBAL_PAGES 5 /* PKVM_PAGES + PKVM_EXTRA_PAGES */ ++#define PKVM_VMEMMAP_ENTRY_SIZE 4 /* sizeof(struct pkvm_page) */ ++#define PKVM_SHADOW_VM_SIZE 20480 /* sizeof(struct pkvm_shadow_vm) + pkvm_shadow_vcpu_array_size() */ ++#define PKVM_SHADOW_VCPU_STATE_SIZE 24576 /* sizeof(struct shadow_vcpu_state) */ ++#define PKVM_IOMMU_NUM 32 /* PKVM_MAX_IOMMU_NUM */ ++#define PKVM_PASIDDEV_NUM 32 /* PKVM_MAX_PASID_PDEV_NUM */ ++#define PKVM_PDEV_NUM 512 /* PKVM_MAX_PDEV_NUM */ ++#define PKVM_IOMMU_QI_DESC_SIZE 8192 /* PKVM_QI_DESC_ALIGNED_SIZE */ ++#define PKVM_IOMMU_QI_DESC_STATUS_SIZE 4096 /* PKVM_QI_DESC_STATUS_ALIGNED_SIZE */ ++#define PKVM_MAX_VM_NUM 10 /* PKVM_MAX_NORMAL_VM_NUM + PKVM_MAX_SECURE_VM_NUM */ ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/pkvm_debugfs.c b/arch/x86/kvm/vmx/pkvm/pkvm_debugfs.c +new file mode 100644 +index 000000000000..c6cd7f3656b0 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/pkvm_debugfs.c +@@ -0,0 +1,204 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#include ++#include ++#include ++#include ++ ++static void set_vmexit_trace_func(void *data) ++{ ++ u64 val; ++ ++ if (!data) ++ return; ++ ++ val = *(u64 *)data; ++ kvm_hypercall1(PKVM_HC_SET_VMEXIT_TRACE, val); ++} ++ ++static int set_vmexit_trace(void *data, u64 val) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ smp_call_function_single(cpu, set_vmexit_trace_func, &val, true); ++ ++ return 0; ++} ++DEFINE_SIMPLE_ATTRIBUTE(set_vmexit_trace_fops, NULL, set_vmexit_trace, "%llu\n"); ++ ++static struct trace_print_flags vmexit_reasons[] = { VMX_EXIT_REASONS, { -1, NULL }}; ++ ++static const char *get_vmexit_reason(int index) ++{ ++ struct trace_print_flags *p = vmexit_reasons; ++ ++ while (p->name) { ++ if (p->mask == index) ++ return p->name; ++ p++; ++ } ++ ++ return NULL; ++} ++ ++static void __pkvm_vmexit_perf_dump_percpu(struct vmexit_perf_dump *perf, ++ struct vmexit_perf_dump *count, ++ bool dump_l2) ++{ ++ struct perf_data *perf_data, *count_perf_data; ++ int cpu = perf->cpu; ++ int i; ++ ++ if (dump_l2) { ++ perf_data = &perf->l2data; ++ count_perf_data = count ? &count->l2data : NULL; ++ } else { ++ perf_data = &perf->l1data; ++ count_perf_data = count ? 
&count->l1data : NULL; ++ } ++ ++ for (i = 0 ; i < 74; i++) { ++ if (!perf_data->data.reasons[i]) ++ continue; ++ ++ pr_info("CPU%d vmexit_from_%s reason %s %lld cycles %lld each-handler-cycle %lld\n", ++ cpu, dump_l2 ? "l2" : "l1", get_vmexit_reason(i), ++ perf_data->data.reasons[i], perf_data->data.cycles[i], ++ perf_data->data.cycles[i] / perf_data->data.reasons[i]); ++ ++ if (count_perf_data) { ++ count_perf_data->data.reasons[i] += perf_data->data.reasons[i]; ++ count_perf_data->data.cycles[i] += perf_data->data.cycles[i]; ++ } ++ ++ if (need_resched()) ++ cond_resched(); ++ } ++ ++ if (perf_data->data.total_count) { ++ pr_info("CPU%d total_vmexit_from_%s %lld total_cycles %lld\n", ++ cpu, dump_l2 ? "l2" : "l1", ++ perf_data->data.total_count, ++ perf_data->data.total_cycles); ++ memset(perf_data, 0, sizeof(struct perf_data)); ++ } ++} ++ ++static void __pkvm_vmexit_perf_dump_summary(struct vmexit_perf_dump *perf, bool dump_l2) ++{ ++ struct perf_data *perf_data; ++ int i; ++ ++ if (dump_l2) ++ perf_data = &perf->l2data; ++ else ++ perf_data = &perf->l1data; ++ ++ for (i = 0 ; i < 74; i++) { ++ if (!perf_data->data.reasons[i]) ++ continue; ++ ++ pr_info("AllCPU: vmexit_from_%s reason %s %lld cycles %lld each-handler-cycle %lld\n", ++ dump_l2 ? "l2" : "l1", get_vmexit_reason(i), ++ perf_data->data.reasons[i], perf_data->data.cycles[i], ++ perf_data->data.cycles[i] / perf_data->data.reasons[i]); ++ ++ perf_data->data.total_count += perf_data->data.reasons[i]; ++ perf_data->data.total_cycles += perf_data->data.cycles[i]; ++ ++ if (need_resched()) ++ cond_resched(); ++ } ++ ++ pr_info("AllCPU: total_vmexit_from_%s %lld total_cycles %lld\n", ++ dump_l2 ? "l2" : "l1", ++ perf_data->data.total_count, ++ perf_data->data.total_cycles); ++} ++ ++static struct vmexit_perf_dump pkvm_perf; ++static void pkvm_dump_vmexit_trace(struct vmexit_perf_dump *hvcpu_perf) ++{ ++ struct vmexit_perf_dump *perf; ++ int cpu; ++ ++ memset(&pkvm_perf.l1data, 0, sizeof(struct perf_data)); ++ memset(&pkvm_perf.l2data, 0, sizeof(struct perf_data)); ++ ++ for (cpu = 0; cpu < num_possible_cpus(); cpu++) { ++ perf = &hvcpu_perf[cpu]; ++ ++ __pkvm_vmexit_perf_dump_percpu(perf, &pkvm_perf, false); ++ __pkvm_vmexit_perf_dump_percpu(perf, &pkvm_perf, true); ++ } ++ ++ __pkvm_vmexit_perf_dump_summary(&pkvm_perf, false); ++ __pkvm_vmexit_perf_dump_summary(&pkvm_perf, true); ++} ++ ++static int dump_vmexit_trace(void *data, u64 *val) ++{ ++ struct vmexit_perf_dump *hvcpu_perf; ++ unsigned long size = sizeof(struct vmexit_perf_dump) * num_possible_cpus(); ++ ++ hvcpu_perf = alloc_pages_exact(size, GFP_KERNEL_ACCOUNT); ++ ++ kvm_hypercall2(PKVM_HC_DUMP_VMEXIT_TRACE, __pa(hvcpu_perf), size); ++ barrier(); ++ ++ pkvm_dump_vmexit_trace(hvcpu_perf); ++ ++ free_pages_exact(hvcpu_perf, size); ++ ++ *val = 0; ++ return 0; ++} ++DEFINE_SIMPLE_ATTRIBUTE(dump_vmexit_trace_fops, dump_vmexit_trace, NULL, "%llu\n"); ++ ++struct debugfs_item { ++ const char *name; ++ umode_t mode; ++ const struct file_operations *fops; ++ struct dentry *dentry; ++}; ++ ++struct debugfs_item debugfs_files[] = { ++ { "set_vmexit_trace", 0222, &set_vmexit_trace_fops}, ++ { "dump_vmexit_trace", 0444, &dump_vmexit_trace_fops}, ++ { NULL } ++}; ++ ++static struct dentry *debugfs_dir; ++ ++void pkvm_init_debugfs(void) ++{ ++ struct debugfs_item *p; ++ ++ debugfs_dir = debugfs_create_dir("pkvm", NULL); ++ if (IS_ERR_OR_NULL(debugfs_dir)) { ++ pr_err("MCP_TEST: Can't create debugfs root entry\n"); ++ goto failed_dir; ++ } ++ ++ for (p = debugfs_files; p->name; ++p) { 
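++		/*
++		 * One debugfs file per debugfs_files[] entry under "pkvm"; any
++		 * failure falls through to the cleanup path below, which drops
++		 * the dentries created so far and the directory itself.
++		 */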
++ p->dentry = debugfs_create_file(p->name, p->mode, ++ debugfs_dir, ++ NULL, p->fops); ++ if (IS_ERR_OR_NULL(p->dentry)) ++ goto out_dir; ++ } ++ ++ return; ++ ++out_dir: ++ for (p = debugfs_files; p->dentry; ++p) { ++ debugfs_remove(p->dentry); ++ p->dentry = NULL; ++ } ++ debugfs_remove(debugfs_dir); ++failed_dir: ++ debugfs_dir = NULL; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/pkvm_host.c b/arch/x86/kvm/vmx/pkvm/pkvm_host.c +new file mode 100644 +index 000000000000..b46c9e07fb1e +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/pkvm_host.c +@@ -0,0 +1,1300 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include "pkvm_constants.h" ++#include ++ ++extern void pkvm_init_debugfs(void); ++ ++MODULE_LICENSE("GPL"); ++ ++struct pkvm_hyp *pkvm; ++ ++struct pkvm_deprivilege_param { ++ struct pkvm_hyp *pkvm; ++ int ret; ++}; ++DEFINE_PER_CPU_READ_MOSTLY(bool, pkvm_enabled); ++ ++#define is_aligned(POINTER, BYTE_COUNT) \ ++ (((uintptr_t)(const void *)(POINTER)) % (BYTE_COUNT) == 0) ++ ++static u16 pkvm_host_vpid = VMX_NR_VPIDS - 1; ++ ++struct gdt_page pkvm_gdt_page = { ++ .gdt = { ++ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), ++ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), ++ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), ++ [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), ++ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), ++ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), ++ }, ++}; ++ ++static int check_pci_device_count(void) ++{ ++ struct pci_dev *pdev = NULL; ++ int devs = 0, devs_with_pasid = 0; ++ ++ /* ++ * pkvm has reserved the memory for IOMMU during early boot, and that ++ * memory is estimated with PKVM_MAX_PDEV_NUM and PKVM_MAX_PASID_PDEV_NUM. ++ * The actual number larger than this will make IOMMU fail to create ++ * translation tables. ++ */ ++ for_each_pci_dev(pdev) { ++ if (pdev->pasid_cap) ++ devs_with_pasid++; ++ else ++ devs++; ++ } ++ ++ if (devs > PKVM_MAX_PDEV_NUM || ++ devs_with_pasid > PKVM_MAX_PASID_PDEV_NUM) { ++ pr_err("pkvm: Too many pdevs detected, actual %d %d max %d %d\n", ++ devs, devs_with_pasid, PKVM_MAX_PDEV_NUM, ++ PKVM_MAX_PASID_PDEV_NUM); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Check for the coherency of paging structures accessed through pasid table ++ * entries (in scalable mode) or context table entries (in legacy mode). ++ */ ++static inline bool is_iommu_coherent(u64 ecap) ++{ ++ return ecap_smts(ecap) ? !!ecap_smpwc(ecap) : !!ecap_coherent(ecap); ++} ++ ++__attribute__((optimize(0))) ++static int check_and_init_iommu(struct pkvm_hyp *pkvm) ++{ ++ struct pkvm_iommu_info *info; ++ struct dmar_drhd_unit *drhd; ++ int pgsz_mask = 1 << PG_LEVEL_4K; ++ int pgt_level = 0; ++ void __iomem *addr; ++ u64 reg_size; ++ u64 cap, ecap; ++ int index = 0, ret; ++ ++/* matches with IOMMU cap SAGAW bits */ ++#define PGT_4LEVEL BIT(2) ++#define PGT_5LEVEL BIT(3) ++ ++ ret = check_pci_device_count(); ++ if (ret) ++ return ret; ++ /* ++ * Some cases may require IOMMU and EPT to use both supported page ++ * table level and page size: ++ * ++ * 1) If IOMMU is working in nested translation of scalable-mode, ++ * pKVM may reuse EPT as the 2nd-level page table. 
++ * ++ * 2) If IOMMU is working in legacy mode and a device is working ++ * in IOMMU pass-through mode, pKVM may reuse EPT as the 2nd-level ++ * page table. ++ * ++ * For other cases, though not necessary to use both IOMMU and EPT ++ * supported page table level and page size, using the same size ++ * can simplify the implementation, as pKVM doesn't need to check ++ * IOMMU types of all devices before deciding whether it's necessary ++ * to use both IOMMU and EPT supported page table level and page size. ++ */ ++ if (pkvm->vmx_cap.ept & VMX_EPT_PAGE_WALK_4_BIT) ++ pgt_level |= PGT_4LEVEL; ++ ++ if (pkvm->vmx_cap.ept & VMX_EPT_PAGE_WALK_5_BIT) ++ pgt_level |= PGT_5LEVEL; ++ ++ if (pkvm->vmx_cap.ept & VMX_EPT_2MB_PAGE_BIT) ++ pgsz_mask |= 1 << PG_LEVEL_2M; ++ ++ if ((pkvm->vmx_cap.ept & VMX_EPT_1GB_PAGE_BIT)) ++ pgsz_mask |= 1 << PG_LEVEL_1G; ++ ++ pkvm->iommu_coherent = true; ++ for_each_drhd_unit(drhd) { ++ int level = 0, mask = 1 << PG_LEVEL_4K; ++ ++ if (index >= PKVM_MAX_IOMMU_NUM) { ++ pr_err("pkvm: too many IOMMU devices to be supported\n"); ++ return -ENOMEM; ++ } ++ ++ if (!drhd->reg_base_addr) { ++ pr_err("pkvm: dmar unit not valid\n"); ++ return -EINVAL; ++ } ++ ++ /* ++ * pkvm requires host IOMMU driver to work in scalable mode with ++ * first-level translation or legacy mode. ++ */ ++ if ((readl(drhd->iommu->reg + DMAR_GSTS_REG) & DMA_GSTS_TES) && ++ (readq(drhd->iommu->reg + DMAR_RTADDR_REG) & BIT(11))) { ++ pr_err("pkvm: drhd reg_base 0x%llx: scalable/legacy mode not enabled\n", ++ drhd->reg_base_addr); ++ return -EINVAL; ++ } ++ ++ addr = ioremap(drhd->reg_base_addr, VTD_PAGE_SIZE); ++ if (!addr) { ++ pr_err("pkvm: failed to map drhd reg physical addr 0x%llx\n", ++ drhd->reg_base_addr); ++ return -EINVAL; ++ } ++ ++ info = &pkvm->iommu_infos[index]; ++ cap = readq(addr + DMAR_CAP_REG); ++ ecap = readq(addr + DMAR_ECAP_REG); ++ iounmap(addr); ++ ++ /* ++ * If pkvm IOMMU works in scalable mode, it requires to use nested translation. ++ */ ++ if (ecap_smts(ecap) && !ecap_nest(ecap)) { ++ pr_err("pkvm: drhd reg_base 0x%llx: nested translation not supported\n", ++ drhd->reg_base_addr); ++ return -EINVAL; ++ } ++ ++ /* ++ * Check for the coherency of the paging structure access. ++ */ ++ if (!is_iommu_coherent(ecap)) ++ pkvm->iommu_coherent = false; ++ ++ info->reg_phys = drhd->reg_base_addr; ++ reg_size = max_t(u64, ecap_max_iotlb_offset(ecap), ++ cap_max_fault_reg_offset(cap)); ++ info->reg_size = max_t(u64, reg_size, VTD_PAGE_SIZE); ++ ++ if (cap_sagaw(cap) & PGT_4LEVEL) ++ level |= PGT_4LEVEL; ++ if (cap_sagaw(cap) & PGT_5LEVEL) ++ level |= PGT_5LEVEL; ++ ++ if (cap_super_page_val(cap) & BIT(0)) ++ mask |= 1 << PG_LEVEL_2M; ++ if (cap_super_page_val(cap) & BIT(1)) ++ mask |= 1 << PG_LEVEL_1G; ++ ++ /* Get the both supported page table level */ ++ pgt_level &= level; ++ pgsz_mask &= mask; ++ ++ index++; ++ } ++ ++ /* ++ * There may be no supported page table level for both IOMMU and EPT. ++ * But there will always be both supported page size, which is 4K. ++ */ ++ if (pgt_level == 0) { ++ pr_err("pkvm: no common page table level for IOMMU and EPT\n"); ++ return -EINVAL; ++ } ++ ++ /* By default to use 4level */ ++ pkvm->ept_iommu_pgt_level = pgt_level & PGT_4LEVEL ? 
4 : 5; ++ ++ pkvm->ept_iommu_pgsz_mask = pgsz_mask; ++ ++ return 0; ++} ++ ++u64 pkvm_total_reserve_pages(void) ++{ ++ u64 total; ++ ++ total = pkvm_data_struct_pages(PKVM_GLOBAL_PAGES, PKVM_PERCPU_PAGES, num_possible_cpus()); ++ total += pkvm_vmemmap_pages(PKVM_VMEMMAP_ENTRY_SIZE); ++ total += pkvm_mmu_pgtable_pages(); ++ total += host_ept_pgtable_pages(); ++ total += pkvm_iommu_pages(PKVM_MAX_PASID, PKVM_PASIDDEV_NUM, ++ PKVM_PDEV_NUM, PKVM_IOMMU_NUM, ++ PKVM_IOMMU_QI_DESC_SIZE, ++ PKVM_IOMMU_QI_DESC_STATUS_SIZE, ++ num_possible_cpus()); ++ total += pkvm_shadow_ept_pgtable_pages(PKVM_MAX_VM_NUM); ++ total += pkvm_host_shadow_iommu_pgtable_pages(PKVM_PDEV_NUM); ++ ++ return total; ++} ++ ++static struct vmcs *pkvm_alloc_vmcs(struct vmcs_config *vmcs_config_ptr) ++{ ++ struct vmcs *vmcs; ++ int pages = ALIGN(vmcs_config_ptr->size, PAGE_SIZE) >> PAGE_SHIFT; ++ ++ vmcs = pkvm_sym(pkvm_early_alloc_contig)(pages); ++ if (!vmcs) ++ return NULL; ++ ++ memset(vmcs, 0, vmcs_config_ptr->size); ++ vmcs->hdr.revision_id = vmcs_config_ptr->revision_id; /* vmcs revision id */ ++ ++ return vmcs; ++} ++ ++static void vmxon_setup_revid(void *vmxon_region) ++{ ++ u32 rev_id = 0; ++ u32 msr_high_value = 0; ++ ++ rdmsr(MSR_IA32_VMX_BASIC, rev_id, msr_high_value); ++ ++ memcpy(vmxon_region, &rev_id, 4); ++} ++ ++static inline void cr4_set_vmxe(void) ++{ ++ unsigned long cr4_value; ++ ++ cr4_value = __read_cr4(); ++ __write_cr4(cr4_value | X86_CR4_VMXE); ++} ++ ++static inline void cr4_clear_vmxe(void) ++{ ++ unsigned long cr4_value; ++ ++ cr4_value = __read_cr4(); ++ __write_cr4(cr4_value & ~(X86_CR4_VMXE)); ++} ++ ++static int pkvm_cpu_vmxon(u64 vmxon_pointer) ++{ ++ u64 msr; ++ ++ cr4_set_vmxe(); ++ asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" ++ _ASM_EXTABLE(1b, %l[fault]) ++ : : [vmxon_pointer] "m"(vmxon_pointer) ++ : : fault); ++ return 0; ++ ++fault: ++ WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", ++ rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 
0xdeadbeef : msr); ++ cr4_clear_vmxe(); ++ return -EFAULT; ++} ++ ++static int pkvm_cpu_vmxoff(void) ++{ ++ asm_volatile_goto("1: vmxoff\n\t" ++ _ASM_EXTABLE(1b, %l[fault]) ++ ::: "cc", "memory" : fault); ++ cr4_clear_vmxe(); ++ return 0; ++ ++fault: ++ cr4_clear_vmxe(); ++ return -EFAULT; ++} ++ ++static int pkvm_enable_vmx(struct pkvm_host_vcpu *vcpu) ++{ ++ u64 phys_addr; ++ ++ vcpu->vmxarea = pkvm_sym(pkvm_early_alloc_page)(); ++ if (!vcpu->vmxarea) ++ return -ENOMEM; ++ ++ phys_addr = __pa(vcpu->vmxarea); ++ if (!is_aligned(phys_addr, PAGE_SIZE)) ++ return -ENOMEM; ++ ++ /*setup revision id in vmxon region*/ ++ vmxon_setup_revid(vcpu->vmxarea); ++ ++ return pkvm_cpu_vmxon(phys_addr); ++} ++ ++static inline u32 get_ar(u16 sel) ++{ ++ u32 access_rights; ++ ++ if (sel == 0) { ++ access_rights = 0x10000; ++ } else { ++ asm ("lar %%ax, %%rax\n" ++ : "=a"(access_rights) : "a"(sel)); ++ access_rights = access_rights >> 8; ++ access_rights = access_rights & 0xf0ff; ++ } ++ ++ return access_rights; ++} ++ ++#define init_guestsegment(seg, SEG, base, limit) \ ++ do { \ ++ u16 sel; \ ++ u32 ar; \ ++ \ ++ savesegment(seg, sel); \ ++ ar = get_ar(sel); \ ++ vmcs_write16(GUEST_##SEG##_SELECTOR, sel); \ ++ vmcs_write32(GUEST_##SEG##_AR_BYTES, ar); \ ++ vmcs_writel(GUEST_##SEG##_BASE, base); \ ++ vmcs_write32(GUEST_##SEG##_LIMIT, limit); \ ++ } while (0) ++ ++static noinline void init_guest_state_area_from_native(int cpu) ++{ ++ u16 ldtr; ++ struct desc_ptr dt; ++ unsigned long msrl; ++ u32 high, low; ++ ++ /* load CR regiesters */ ++ vmcs_writel(GUEST_CR0, read_cr0() & ~X86_CR0_TS); ++ vmcs_writel(GUEST_CR3, __read_cr3()); ++ vmcs_writel(GUEST_CR4, native_read_cr4()); ++ ++ /* load cs/ss/ds/es */ ++ init_guestsegment(cs, CS, 0x0, 0xffffffff); ++ init_guestsegment(ss, SS, 0x0, 0xffffffff); ++ init_guestsegment(ds, DS, 0x0, 0xffffffff); ++ init_guestsegment(es, ES, 0x0, 0xffffffff); ++ ++ /* load fs/gs */ ++ rdmsrl(MSR_FS_BASE, msrl); ++ init_guestsegment(fs, FS, msrl, 0xffffffff); ++ rdmsrl(MSR_GS_BASE, msrl); ++ init_guestsegment(gs, GS, msrl, 0xffffffff); ++ ++ /* load GDTR */ ++ native_store_gdt(&dt); ++ vmcs_writel(GUEST_GDTR_BASE, dt.address); ++ vmcs_write32(GUEST_GDTR_LIMIT, dt.size); ++ ++ /* load TR */ ++ vmcs_write16(GUEST_TR_SELECTOR, GDT_ENTRY_TSS*8); ++ vmcs_write32(GUEST_TR_AR_BYTES, get_ar(GDT_ENTRY_TSS*8)); ++ vmcs_writel(GUEST_TR_BASE, (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); ++ vmcs_write32(GUEST_TR_LIMIT, __KERNEL_TSS_LIMIT); ++ ++ /* load LDTR */ ++ store_ldt(ldtr); ++ vmcs_write16(GUEST_LDTR_SELECTOR, ldtr); ++ vmcs_write32(GUEST_LDTR_AR_BYTES, 0x10000); ++ vmcs_writel(GUEST_LDTR_BASE, 0x0); ++ vmcs_write32(GUEST_LDTR_LIMIT, 0xffffffff); ++ ++ store_idt(&dt); ++ vmcs_writel(GUEST_IDTR_BASE, dt.address); ++ vmcs_write32(GUEST_IDTR_LIMIT, dt.size); ++ ++ /* set MSRs */ ++ vmcs_write64(GUEST_IA32_DEBUGCTL, 0); ++ ++ rdmsr(MSR_IA32_SYSENTER_CS, low, high); ++ vmcs_write32(GUEST_SYSENTER_CS, low); ++ ++ rdmsrl(MSR_IA32_SYSENTER_ESP, msrl); ++ vmcs_writel(GUEST_SYSENTER_ESP, msrl); ++ ++ rdmsrl(MSR_IA32_SYSENTER_EIP, msrl); ++ vmcs_writel(GUEST_SYSENTER_EIP, msrl); ++ ++ rdmsrl(MSR_EFER, msrl); ++ vmcs_write64(GUEST_IA32_EFER, msrl); ++ ++ rdmsrl(MSR_IA32_CR_PAT, msrl); ++ vmcs_write64(GUEST_IA32_PAT, msrl); ++} ++ ++static noinline void init_guest_state_area(struct pkvm_host_vcpu *vcpu, int cpu) ++{ ++ init_guest_state_area_from_native(cpu); ++ ++ /*Guest non register state*/ ++ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); ++ 
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); ++ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); ++ vmcs_write64(VMCS_LINK_POINTER, -1ull); ++} ++ ++static void init_host_state_area(struct pkvm_host_vcpu *vcpu, int cpu) ++{ ++ struct pkvm_pcpu *pcpu = vcpu->pcpu; ++ ++ pkvm_sym(init_contant_host_state_area)(pcpu, cpu); ++ ++ /*host RIP*/ ++ vmcs_writel(HOST_RIP, (unsigned long)pkvm_sym(__pkvm_vmx_vmexit)); ++} ++ ++static void init_execution_control(struct pkvm_host_vcpu *vcpu, ++ struct vmcs_config *vmcs_config_ptr, ++ struct vmx_capability *vmx_cap) ++{ ++ struct vcpu_vmx *vmx = &vcpu->vmx; ++ /* ++ * Fixed VPIDs for the host vCPUs, which implies that it could conflict ++ * with VPIDs from nested guests. ++ * ++ * It's safe because cached mappings used in non-root mode are associated ++ * with EP4TA, which is managed by pKVM and unique for every guest. ++ */ ++ if ((vmcs_config_ptr->cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_VPID) && ++ vmx_has_invvpid() && ++ (vmx_has_invvpid_single() || vmx_has_invvpid_global())) ++ vmcs_write16(VIRTUAL_PROCESSOR_ID, pkvm_host_vpid--); ++ ++ pin_controls_set(vmx, vmcs_config_ptr->pin_based_exec_ctrl); ++ exec_controls_set(vmx, vmcs_config_ptr->cpu_based_exec_ctrl); ++ secondary_exec_controls_set(vmx, vmcs_config_ptr->cpu_based_2nd_exec_ctrl); ++ /* disable EPT first, will enable after EPT pgtable created */ ++ secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_EPT); ++ ++ vmcs_write32(CR3_TARGET_COUNT, 0); ++ ++ vmcs_write32(EXCEPTION_BITMAP, 0); ++ ++ vmcs_write64(IO_BITMAP_A, __pa(vcpu->io_bitmap)); ++ vmcs_write64(IO_BITMAP_B, __pa(vcpu->io_bitmap) + PAGE_SIZE); ++ ++ pkvm_sym(init_msr_emulation(vmx)); ++ vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); ++ ++ /*guest owns the entire bits*/ ++ vmcs_writel(CR0_GUEST_HOST_MASK, 0); ++ ++ vmcs_writel(CR4_GUEST_HOST_MASK, X86_CR4_VMXE); ++} ++ ++static void init_vmexit_control(struct vcpu_vmx *vmx, struct vmcs_config *vmcs_config_ptr) ++{ ++ vm_exit_controls_set(vmx, vmcs_config_ptr->vmexit_ctrl); ++ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); ++} ++ ++static void init_vmentry_control(struct vcpu_vmx *vmx, struct vmcs_config *vmcs_config_ptr) ++{ ++ vm_entry_controls_set(vmx, vmcs_config_ptr->vmentry_ctrl); ++ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); ++ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); ++ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); ++} ++ ++static int pkvm_host_init_vmx(struct pkvm_host_vcpu *vcpu, int cpu) ++{ ++ struct vcpu_vmx *vmx = &vcpu->vmx; ++ int ret; ++ ++ ret = pkvm_enable_vmx(vcpu); ++ if (ret) ++ return ret; ++ ++ /* vmcs01: host vmcs in pKVM */ ++ vmx->vmcs01.vmcs = pkvm_alloc_vmcs(&pkvm->vmcs_config); ++ if (!vmx->vmcs01.vmcs) ++ return -ENOMEM; ++ ++ vmx->vmcs01.msr_bitmap = pkvm_sym(pkvm_early_alloc_page)(); ++ if (!vmx->vmcs01.msr_bitmap) { ++ pr_err("%s: No page for msr_bitmap\n", __func__); ++ return -ENOMEM; ++ } ++ ++ vcpu->io_bitmap = pkvm->host_vm.io_bitmap; ++ ++ vmx->loaded_vmcs = &vmx->vmcs01; ++ vmcs_load(vmx->loaded_vmcs->vmcs); ++ vcpu->current_vmcs = vmx->loaded_vmcs->vmcs; ++ ++ init_guest_state_area(vcpu, cpu); ++ init_host_state_area(vcpu, cpu); ++ init_execution_control(vcpu, &pkvm->vmcs_config, &pkvm->vmx_cap); ++ init_vmexit_control(vmx, &pkvm->vmcs_config); ++ init_vmentry_control(vmx, &pkvm->vmcs_config); ++ ++ return ret; ++} ++ ++static void pkvm_host_deinit_vmx(struct pkvm_host_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = &vcpu->vmx; ++ ++ pkvm_cpu_vmxoff(); ++ ++ if (vmx->vmcs01.vmcs) ++ vmx->vmcs01.vmcs = NULL; ++ ++ if 
(vmx->vmcs01.msr_bitmap) ++ vmx->vmcs01.msr_bitmap = NULL; ++} ++ ++static void pkvm_host_setup_nested_vmx_cap(struct pkvm_hyp *pkvm) ++{ ++ struct nested_vmx_msrs *msrs = &pkvm->vmcs_config.nested; ++ ++ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, ++ msrs->procbased_ctls_low, ++ msrs->procbased_ctls_high); ++ ++ rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, ++ &msrs->secondary_ctls_low, ++ &msrs->secondary_ctls_high); ++ ++ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ++ msrs->pinbased_ctls_low, ++ msrs->pinbased_ctls_high); ++ ++ rdmsrl_safe(MSR_IA32_VMX_VMFUNC, &msrs->vmfunc_controls); ++ ++ rdmsr(MSR_IA32_VMX_EXIT_CTLS, ++ msrs->exit_ctls_low, ++ msrs->exit_ctls_high); ++ ++ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, ++ msrs->entry_ctls_low, ++ msrs->entry_ctls_high); ++ ++ rdmsr(MSR_IA32_VMX_MISC, msrs->misc_low, msrs->misc_high); ++} ++ ++__attribute__((optimize(0))) ++static int pkvm_host_check_and_setup_vmx_cap(struct pkvm_hyp *pkvm) ++{ ++ struct vmcs_config *vmcs_config = &pkvm->vmcs_config; ++ struct vmx_capability *vmx_cap = &pkvm->vmx_cap; ++ int ret = 0; ++ struct vmcs_config_setting setting = { ++ .cpu_based_exec_ctrl_min = ++ CPU_BASED_USE_IO_BITMAPS | ++ CPU_BASED_USE_MSR_BITMAPS | ++ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS, ++ .cpu_based_exec_ctrl_opt = 0, ++ .cpu_based_2nd_exec_ctrl_min = ++ SECONDARY_EXEC_ENABLE_EPT | ++ SECONDARY_EXEC_SHADOW_VMCS, ++ .cpu_based_2nd_exec_ctrl_opt = ++ SECONDARY_EXEC_ENABLE_VPID | ++ SECONDARY_EXEC_ENABLE_INVPCID | ++ SECONDARY_EXEC_XSAVES | ++ SECONDARY_EXEC_ENABLE_RDTSCP | ++ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE, ++ .pin_based_exec_ctrl_min = 0, ++ .pin_based_exec_ctrl_opt = 0, ++ .vmexit_ctrl_min = ++ VM_EXIT_HOST_ADDR_SPACE_SIZE | ++ VM_EXIT_LOAD_IA32_EFER | ++ VM_EXIT_SAVE_IA32_PAT | ++ VM_EXIT_SAVE_IA32_EFER | ++ VM_EXIT_SAVE_DEBUG_CONTROLS, ++ .vmexit_ctrl_opt = 0, ++ .vmentry_ctrl_min = ++ VM_ENTRY_LOAD_DEBUG_CONTROLS | ++ VM_ENTRY_IA32E_MODE | ++ VM_ENTRY_LOAD_IA32_EFER, ++// VM_ENTRY_LOAD_IA32_PAT, ++ .vmentry_ctrl_opt = 0, ++ .has_broken_vmx_preemption_timer = false, ++ .perf_global_ctrl_workaround = false, ++ }; ++ ++ if (!boot_cpu_has(X86_FEATURE_VMX)) ++ return -EINVAL; ++ ++ if (__setup_vmcs_config(vmcs_config, vmx_cap, &setting) < 0) ++ return -EINVAL; ++ ++ pr_info("pin_based_exec_ctrl 0x%x\n", vmcs_config->pin_based_exec_ctrl); ++ pr_info("cpu_based_exec_ctrl 0x%x\n", vmcs_config->cpu_based_exec_ctrl); ++ pr_info("cpu_based_2nd_exec_ctrl 0x%x\n", vmcs_config->cpu_based_2nd_exec_ctrl); ++ pr_info("vmexit_ctrl 0x%x\n", vmcs_config->vmexit_ctrl); ++ pr_info("vmentry_ctrl 0x%x\n", vmcs_config->vmentry_ctrl); ++ ++ pkvm_host_setup_nested_vmx_cap(pkvm); ++ ++ return ret; ++} ++ ++static int pkvm_init_mmu(struct pkvm_hyp *pkvm) ++{ ++ int pgsz_mask = (1 << PG_LEVEL_2M) | (1 << PG_LEVEL_4K); ++ ++ if (boot_cpu_has(X86_FEATURE_GBPAGES)) ++ pgsz_mask |= 1 << PG_LEVEL_1G; ++ ++ /* record mmu pgtable cap for later mmu pgtable build */ ++ pkvm->mmu_cap.level = pgtable_l5_enabled() ? 5 : 4; ++ pkvm->mmu_cap.allowed_pgsz = pgsz_mask; ++ pkvm->mmu_cap.table_prot = (u64)_KERNPG_TABLE_NOENC; ++ ++ /* ++ * Use IOMMU acknowledged level and page size mask for ++ * EPT as IOMMU will use EPT as its second-level page ++ * table in nested translation. ++ */ ++ pkvm->ept_cap.level = pkvm->ept_iommu_pgt_level; ++ pkvm->ept_cap.allowed_pgsz = pkvm->ept_iommu_pgsz_mask; ++ pkvm->ept_cap.table_prot = VMX_EPT_RWX_MASK; ++ ++ /* ++ * __page_base_offset stores the offset for pkvm ++ * to translate VA to a PA. 
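++	 * It is the start of the direct map (__va(0)), so a direct-map VA
++	 * converts to a PA by subtracting this offset.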
++ * ++ * __symbol_base_offset stores the offset for pkvm ++ * to translate its symbole's VA to a PA. ++ */ ++ pkvm_sym(__page_base_offset) = (unsigned long)__va(0); ++ pkvm_sym(__symbol_base_offset) = (unsigned long)__pkvm_text_start - __pa_symbol(__pkvm_text_start); ++ ++ /* ++ * __x86_clflush_size stores the clflush size for ++ * pkvm to do the clfush at runtime. ++ */ ++ pkvm_sym(__x86_clflush_size) = boot_cpu_data.x86_clflush_size; ++ ++ return 0; ++} ++ ++static void init_gdt(struct pkvm_pcpu *pcpu) ++{ ++ pcpu->gdt_page = pkvm_gdt_page; ++} ++ ++static void init_idt(struct pkvm_pcpu *pcpu) ++{ ++ gate_desc *idt = pcpu->idt_page.idt; ++ struct idt_data d = { ++ .segment = __KERNEL_CS, ++ .bits.ist = 0, ++ .bits.zero = 0, ++ .bits.type = GATE_INTERRUPT, ++ .bits.dpl = 0, ++ .bits.p = 1, ++ }; ++ gate_desc desc; ++ int i; ++ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++ gate_desc *host_idt; ++ struct desc_ptr dt; ++ ++ store_idt(&dt); ++ host_idt = (gate_desc *)dt.address; ++ ++ /* reuse other exception handler but control nmi handler */ ++ for (i = 0; i <= X86_TRAP_IRET; i++) { ++ if (i == X86_TRAP_NMI) { ++ d.vector = i; ++ d.bits.ist = 0; ++ d.addr = (const void *)pkvm_sym(nmi_handler); ++ idt_init_desc(&desc, &d); ++ write_idt_entry(idt, i, &desc); ++ } else { ++ memcpy(&idt[i], &host_idt[i], sizeof(gate_desc)); ++ } ++ } ++#else ++ for (i = 0; i <= X86_TRAP_IRET; i++) { ++ d.vector = i; ++ d.bits.ist = 0; ++ if (i == X86_TRAP_NMI) ++ d.addr = (const void *)pkvm_sym(nmi_handler); ++ else ++ d.addr = (const void *)pkvm_sym(noop_handler); ++ idt_init_desc(&desc, &d); ++ write_idt_entry(idt, i, &desc); ++ } ++#endif ++} ++ ++static void init_tss(struct pkvm_pcpu *pcpu) ++{ ++ struct desc_struct *d = pcpu->gdt_page.gdt; ++ tss_desc tss; ++ ++ set_tssldt_descriptor(&tss, (unsigned long)&pcpu->tss, DESC_TSS, ++ __KERNEL_TSS_LIMIT); ++ ++ write_gdt_entry(d, GDT_ENTRY_TSS, &tss, DESC_TSS); ++} ++ ++static int pkvm_setup_pcpu(struct pkvm_hyp *pkvm, int cpu) ++{ ++ struct pkvm_pcpu *pcpu; ++ ++ if (cpu >= CONFIG_NR_CPUS) ++ return -ENOMEM; ++ ++ pcpu = pkvm_sym(pkvm_early_alloc_contig)(PKVM_PCPU_PAGES); ++ if (!pcpu) ++ return -ENOMEM; ++ ++ /* tmp use host cr3, switch to pkvm owned cr3 after de-privilege */ ++ pcpu->cr3 = __read_cr3(); ++ ++ init_gdt(pcpu); ++ init_idt(pcpu); ++ init_tss(pcpu); ++ ++ pkvm->pcpus[cpu] = pcpu; ++ ++ return 0; ++} ++ ++static int pkvm_host_setup_vcpu(struct pkvm_hyp *pkvm, int cpu) ++{ ++ struct pkvm_host_vcpu *pkvm_host_vcpu; ++ ++ if (cpu >= CONFIG_NR_CPUS) ++ return -ENOMEM; ++ ++ pkvm_host_vcpu = pkvm_sym(pkvm_early_alloc_contig)(PKVM_HOST_VCPU_PAGES); ++ if (!pkvm_host_vcpu) ++ return -ENOMEM; ++ ++ pkvm_host_vcpu->pcpu = pkvm->pcpus[cpu]; ++ pkvm_host_vcpu->vmx.vcpu.cpu = cpu; ++ ++ pkvm->host_vm.host_vcpus[cpu] = pkvm_host_vcpu; ++ ++ return 0; ++} ++ ++static void enable_feature_control(void) ++{ ++ u64 old, test_bits; ++ ++ rdmsrl(MSR_IA32_FEAT_CTL, old); ++ test_bits = FEAT_CTL_LOCKED; ++ test_bits |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; ++ ++ if ((old & test_bits) != test_bits) ++ wrmsrl(MSR_IA32_FEAT_CTL, old | test_bits); ++} ++ ++#define savegpr(gpr, value) \ ++ asm("mov %%" #gpr ",%0":"=r" (value) : : "memory") ++ ++static noinline int pkvm_host_run_vcpu(struct pkvm_host_vcpu *vcpu) ++{ ++ u64 guest_rsp, host_rsp; ++ unsigned long *regs = vcpu->vmx.vcpu.arch.regs; ++ volatile int ret = 0; ++ ++ /* ++ * prepare to RUN vcpu: ++ * ++ * - record gprs in vcpu.arch.regs[]: ++ * ++ * - record below guest vmcs fields: ++ * GUSET_RFLAGS - read from native ++ * ++ * - 
record below guest vmcs fields: ++ * GUSET_RFLAGS - read from native ++ * GUEST_RSP - read from native ++ * GUEST_RIP - vmentry_point ++ * ++ * - switch RSP to host_rsp ++ */ ++ savegpr(rax, regs[__VCPU_REGS_RAX]); ++ savegpr(rcx, regs[__VCPU_REGS_RCX]); ++ savegpr(rdx, regs[__VCPU_REGS_RDX]); ++ savegpr(rbx, regs[__VCPU_REGS_RBX]); ++ savegpr(rbp, regs[__VCPU_REGS_RBP]); ++ savegpr(rsi, regs[__VCPU_REGS_RSI]); ++ savegpr(rdi, regs[__VCPU_REGS_RDI]); ++ savegpr(r8, regs[__VCPU_REGS_R8]); ++ savegpr(r9, regs[__VCPU_REGS_R9]); ++ savegpr(r10, regs[__VCPU_REGS_R10]); ++ savegpr(r11, regs[__VCPU_REGS_R11]); ++ savegpr(r12, regs[__VCPU_REGS_R12]); ++ savegpr(r13, regs[__VCPU_REGS_R13]); ++ savegpr(r14, regs[__VCPU_REGS_R14]); ++ savegpr(r15, regs[__VCPU_REGS_R15]); ++ host_rsp = (u64)vcpu->pcpu->stack + STACK_SIZE; ++ asm volatile( ++ "pushfq\n" ++ "popq %%rax\n" ++ "movq %1, %%rdx\n" ++ "vmwrite %%rax, %%rdx\n" ++ "movq %%rsp, %%rax\n" ++ "movq %2, %%rdx\n" ++ "vmwrite %%rax, %%rdx\n" ++ "movq %%rax, %0\n" ++ "movq $vmentry_point, %%rax\n" ++ "movq %3, %%rdx\n" ++ "vmwrite %%rax, %%rdx\n" ++ "movq %4, %%rsp\n" ++ : "=m"(guest_rsp) ++ : "i"(GUEST_RFLAGS), "i"(GUEST_RSP) , "i"(GUEST_RIP), "m"(host_rsp) ++ : "rax", "rdx", "memory"); ++ ++ /* ++ * call pkvm_main to do vmlaunch. ++ * ++ * if pkvm_main return: ++ * vmlaunch fail - switch back to guest_rsp ++ * if pkvm_main not return: ++ * vmlaunch success: guest ret to vmentry_point ++ */ ++ ret = pkvm_sym(pkvm_main)(&vcpu->vmx.vcpu); ++ asm volatile( ++ "movq %0, %%rsp\n" ++ "vmentry_point:\n" ++ : : "m"(guest_rsp) :); ++ ++ return ret; ++} ++ ++static void pkvm_host_deprivilege_cpu(void *data) ++{ ++ struct pkvm_deprivilege_param *p = data; ++ unsigned long flags; ++ int cpu = get_cpu(), ret; ++ struct pkvm_host_vcpu *vcpu = ++ p->pkvm->host_vm.host_vcpus[cpu]; ++ ++ local_irq_save(flags); ++ ++ enable_feature_control(); ++ ++ ret = pkvm_host_init_vmx(vcpu, cpu); ++ if (ret) { ++ pr_err("%s: init vmx failed\n", __func__); ++ goto out; ++ } ++ ++ ret = pkvm_host_run_vcpu(vcpu); ++ if (ret == 0) { ++ pr_info("%s: CPU%d in guest mode\n", __func__, cpu); ++ goto ok; ++ } ++ ++out: ++ p->ret = ret; ++ pkvm_host_deinit_vmx(vcpu); ++ pr_err("%s: failed to deprivilege CPU%d\n", __func__, cpu); ++ ++ok: ++ local_irq_restore(flags); ++ ++ put_cpu(); ++} ++ ++/* ++ * Used in root mode to deprivilege CPUs ++ */ ++static int pkvm_host_deprivilege_cpus(struct pkvm_hyp *pkvm) ++{ ++ struct pkvm_deprivilege_param p = { ++ .pkvm = pkvm, ++ .ret = 0, ++ }; ++ ++ on_each_cpu(pkvm_host_deprivilege_cpu, &p, 1); ++ if (p.ret) { ++ /* ++ * TODO: ++ * We are here because some CPU failed to be deprivileged, so ++ * the failed CPU will stay in root mode. But the others already ++ * in the non-root mode. In this case, we should let non-root mode ++ * CPUs go back to root mode, then the system can still run natively ++ * without pKVM enabled. 
++ */ ++ pr_err("%s: WARNING - failed to deprivilege all CPUs!\n", __func__); ++ } else { ++ pr_info("%s: all cpus are in guest mode!\n", __func__); ++ } ++ ++ return p.ret; ++} ++ ++static int this_cpu_do_finalise_hc(struct pkvm_section *sections, unsigned long size) ++{ ++ int ret; ++ ++ local_irq_disable(); ++ ret = kvm_hypercall2(PKVM_HC_INIT_FINALISE, (unsigned long)sections, size); ++ if (!ret) ++ this_cpu_write(pkvm_enabled, true); ++ local_irq_enable(); ++ return ret; ++} ++ ++static void do_pkvm_finalise(void *data) ++{ ++ this_cpu_do_finalise_hc(NULL, 0); ++} ++ ++static int pkvm_init_finalise(void) ++{ ++ int ret, cpu; ++ int self = get_cpu(); ++ struct pkvm_section sections[] = { ++ /* ++ * NOTE: please ensure kernel section is put at the beginning, ++ * as we do section mapping by the order, while kernel data ++ * sections have overlap with pkvm ones, put the kernel section ++ * after pkvm one will make pkvm section readonly! ++ */ ++ { ++ /* ++ * Kernel section: addr is virtual, needed ++ * for pkvm to access kernel alias symbol ++ */ ++ .type = KERNEL_DATA_SECTIONS, ++ .addr = (unsigned long)_sdata, ++ .size = (unsigned long)(_edata - _sdata), ++ .prot = (u64)pgprot_val(PAGE_KERNEL_RO), ++ }, ++ { ++ /* ++ * Kernel section: addr is virtual, needed ++ * for pkvm to access kernel alias symbol ++ */ ++ .type = KERNEL_DATA_SECTIONS, ++ .addr = (unsigned long)__start_rodata, ++ .size = (unsigned long)(__end_rodata - __start_rodata), ++ .prot = (u64)pgprot_val(PAGE_KERNEL_RO), ++ }, ++ { ++ /* PKVM reserved memory: addr is physical */ ++ .type = PKVM_RESERVED_MEMORY, ++ .addr = (unsigned long)pkvm_mem_base, ++ .size = (unsigned long)pkvm_mem_size, ++ .prot = (u64)pgprot_val(PAGE_KERNEL), ++ }, ++ { ++ /* PKVM section: addr is virtual */ ++ .type = PKVM_CODE_DATA_SECTIONS, ++ .addr = (unsigned long)__pkvm_text_start, ++ .size = (unsigned long)(__pkvm_text_end - __pkvm_text_start), ++ .prot = (u64)pgprot_val(PAGE_KERNEL_EXEC), ++ }, ++ { ++ /* PKVM section: addr is virtual */ ++ .type = PKVM_CODE_DATA_SECTIONS, ++ .addr = (unsigned long)__pkvm_rodata_start, ++ .size = (unsigned long)(__pkvm_rodata_end - __pkvm_rodata_start), ++ .prot = (u64)pgprot_val(PAGE_KERNEL_RO), ++ }, ++ { ++ /* PKVM section: addr is virtual */ ++ .type = PKVM_CODE_DATA_SECTIONS, ++ .addr = (unsigned long)__pkvm_data_start, ++ .size = (unsigned long)(__pkvm_data_end - __pkvm_data_start), ++ .prot = (u64)pgprot_val(PAGE_KERNEL), ++ }, ++ { ++ /* PKVM section: addr is virtual */ ++ .type = PKVM_CODE_DATA_SECTIONS, ++ .addr = (unsigned long)__pkvm_bss_start, ++ .size = (unsigned long)(__pkvm_bss_end - __pkvm_bss_start), ++ .prot = (u64)pgprot_val(PAGE_KERNEL), ++ }, ++ }; ++ ++ /* ++ * First hypercall to recreate the pgtable for pkvm, and init ++ * memory pool for later use, on boot cpu. ++ * Input parameters are only needed for the first hypercall. ++ */ ++ ret = this_cpu_do_finalise_hc(sections, ARRAY_SIZE(sections)); ++ if (ret) { ++ pr_err("%s: pkvm finalise failed!\n", __func__); ++ goto out; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ if (cpu == self) ++ continue; ++ ++ /* ++ * Second hypercall to switch the mmu and ept pgtable ++ * for other cpus other than boot cpu. 
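++		 * They pass NULL/0, as the section list is only consumed by
++		 * the first hypercall on the boot CPU above.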
++ */ ++ ret = smp_call_function_single(cpu, do_pkvm_finalise, ++ NULL, true); ++ } ++ ++ ret = kvm_hypercall0(PKVM_HC_ACTIVATE_IOMMU); ++out: ++ put_cpu(); ++ ++ return ret; ++} ++ ++static int add_device_to_pkvm(struct device *dev, void *data) ++{ ++ struct kvm_protected_vm *pkvm = data; ++ struct pci_dev *pdev; ++ u16 devid; ++ ++ if (!dev_is_pci(dev)) ++ return 0; ++ ++ pdev = to_pci_dev(dev); ++ devid = PCI_DEVID(pdev->bus->number, pdev->devfn); ++ ++ return kvm_hypercall3(PKVM_HC_ADD_PTDEV, pkvm->shadow_vm_handle, devid, 0); ++} ++ ++static int pkvm_init_pci(struct pkvm_hyp *pkvm) ++{ ++ struct pci_mmcfg_region *data, *cfg; ++ int length = 0, max_region_num = PAGE_SIZE / sizeof(struct pci_mmcfg_region); ++ ++ data = pkvm_sym(pkvm_early_alloc_page)(); ++ ++ list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list, pci_mmcfg_lock_held()) { ++ if (length >= max_region_num) ++ return -ENOMEM; ++ memcpy(&data[length], cfg, sizeof(struct pci_mmcfg_region)); ++ length += 1; ++ } ++ ++ pkvm->host_vm.pci_info.mmcfg_table = data; ++ pkvm->host_vm.pci_info.mmcfg_table_size = length; ++ ++ pkvm_sym(init_pci)(pkvm); ++ ++ return 0; ++} ++ ++int kvm_arch_add_device_to_pkvm(struct kvm *kvm, struct iommu_group *grp) ++{ ++ int ret = 0; ++ ++ kvm_get_kvm(kvm); ++ ++ if (kvm->arch.vm_type == KVM_X86_PROTECTED_VM) ++ ret = iommu_group_for_each_dev(grp, &kvm->pkvm, ++ add_device_to_pkvm); ++ ++ kvm_put_kvm(kvm); ++ ++ return ret; ++} ++ ++int pkvm_init_shadow_vm(struct kvm *kvm) ++{ ++ struct kvm_protected_vm *pkvm = &kvm->pkvm; ++ size_t shadow_sz; ++ void *shadow_addr; ++ int ret; ++ ++ INIT_LIST_HEAD(&kvm->pkvm.pinned_pages); ++ ++ shadow_sz = PAGE_ALIGN(PKVM_SHADOW_VM_SIZE); ++ shadow_addr = alloc_pages_exact(shadow_sz, GFP_KERNEL_ACCOUNT); ++ if (!shadow_addr) ++ return -ENOMEM; ++ ++ ret = kvm_hypercall3(PKVM_HC_INIT_SHADOW_VM, (unsigned long)kvm, ++ (unsigned long)__pa(shadow_addr), shadow_sz); ++ if (ret < 0) ++ goto free_page; ++ ++ pkvm->shadow_vm_handle = ret; ++ ++ return 0; ++free_page: ++ free_pages_exact(shadow_addr, shadow_sz); ++ return ret; ++} ++ ++void pkvm_teardown_shadow_vm(struct kvm *kvm) ++{ ++ struct kvm_protected_vm *pkvm = &kvm->pkvm; ++ struct kvm_pinned_page *ppage, *n; ++ unsigned long pa; ++ ++ pa = kvm_hypercall1(PKVM_HC_TEARDOWN_SHADOW_VM, pkvm->shadow_vm_handle); ++ if (!pa) ++ return; ++ ++ free_pages_exact(__va(pa), PAGE_ALIGN(PKVM_SHADOW_VM_SIZE)); ++ ++ if (list_empty(&pkvm->pinned_pages)) ++ return; ++ ++ list_for_each_entry_safe(ppage, n, &pkvm->pinned_pages, list) { ++ list_del(&ppage->list); ++ put_page(ppage->page); ++ kfree(ppage); ++ } ++} ++ ++int pkvm_init_shadow_vcpu(struct kvm_vcpu *vcpu) ++{ ++ struct kvm_protected_vm *pkvm = &vcpu->kvm->pkvm; ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ s64 shadow_vcpu_handle; ++ size_t shadow_sz; ++ void *shadow_addr; ++ ++ shadow_sz = PAGE_ALIGN(PKVM_SHADOW_VCPU_STATE_SIZE); ++ shadow_addr = alloc_pages_exact(shadow_sz, GFP_KERNEL_ACCOUNT); ++ if (!shadow_addr) ++ return -ENOMEM; ++ ++ shadow_vcpu_handle = kvm_hypercall4(PKVM_HC_INIT_SHADOW_VCPU, ++ pkvm->shadow_vm_handle, (unsigned long)vmx, ++ (unsigned long)__pa(shadow_addr), shadow_sz); ++ if (shadow_vcpu_handle < 0) ++ goto free_page; ++ ++ vcpu->pkvm_shadow_vcpu_handle = shadow_vcpu_handle; ++ ++ return 0; ++ ++free_page: ++ free_pages_exact(shadow_addr, shadow_sz); ++ return -EINVAL; ++} ++ ++void pkvm_teardown_shadow_vcpu(struct kvm_vcpu *vcpu) ++{ ++ unsigned long pa = kvm_hypercall1(PKVM_HC_TEARDOWN_SHADOW_VCPU, ++ vcpu->pkvm_shadow_vcpu_handle); ++ ++ if (!pa) ++ 
return; ++ ++ free_pages_exact(__va(pa), PAGE_ALIGN(PKVM_SHADOW_VCPU_STATE_SIZE)); ++} ++ ++int pkvm_tlb_remote_flush_with_range(struct kvm *kvm, struct kvm_tlb_range *range) ++{ ++ int shadow_vm_handle = kvm->pkvm.shadow_vm_handle; ++ u64 start_gpa = 0; ++ u64 size = 0; ++ ++ if (shadow_vm_handle <= 0) ++ return -EOPNOTSUPP; ++ ++ if (range) { ++ start_gpa = range->start_gfn << PAGE_SHIFT; ++ size = range->pages * PAGE_SIZE; ++ } ++ ++ return kvm_hypercall3(PKVM_HC_TLB_REMOTE_FLUSH_RANGE, ++ shadow_vm_handle, start_gpa, size); ++} ++ ++int pkvm_tlb_remote_flush(struct kvm *kvm) ++{ ++ return pkvm_tlb_remote_flush_with_range(kvm, NULL); ++} ++ ++int pkvm_set_mmio_ve(struct kvm_vcpu *vcpu, unsigned long gfn) ++{ ++ if (vcpu->kvm->arch.vm_type == KVM_X86_PROTECTED_VM) { ++ kvm_hypercall1(PKVM_HC_SET_MMIO_VE, gfn); ++ return 1; ++ } ++ ++ return 0; ++} ++ ++static int pkvm_init_io_emulation(struct pkvm_hyp *pkvm) ++{ ++ pkvm->host_vm.io_bitmap = pkvm_sym(pkvm_early_alloc_contig)(2); ++ ++ if (!pkvm->host_vm.io_bitmap) { ++ pr_err("pkvm: %s: No page for io_bitmap\n", __func__); ++ return -ENOMEM; ++ } ++ ++ memset(pkvm->host_vm.io_bitmap, 0, 2 * PAGE_SIZE); ++ ++ return 0; ++} ++ ++int __init pkvm_init(void) ++{ ++ int ret = 0, cpu; ++ ++ if(pkvm_sym(pkvm_hyp)) { ++ pr_err("pkvm hypervisor is running!"); ++ return -EBUSY; ++ } ++ ++ if (!pkvm_mem_base) { ++ pr_err("pkvm required memory not get reseved!"); ++ ret = -ENOMEM; ++ goto out; ++ } ++ pkvm_sym(pkvm_early_alloc_init)(__va(pkvm_mem_base), ++ pkvm_data_struct_pages(PKVM_GLOBAL_PAGES, PKVM_PERCPU_PAGES, ++ num_possible_cpus()) << PAGE_SHIFT); ++ ++ /* pkvm hypervisor keeps same VA mapping as deprivileged host */ ++ pkvm = pkvm_sym(pkvm_hyp) = pkvm_sym(pkvm_early_alloc_contig)(PKVM_PAGES); ++ if (!pkvm) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ ret = pkvm_host_check_and_setup_vmx_cap(pkvm); ++ if (ret) ++ goto out; ++ ++ ret = check_and_init_iommu(pkvm); ++ if (ret) ++ goto out; ++ ++ ret = pkvm_init_mmu(pkvm); ++ if (ret) ++ goto out; ++ ++ ret = pkvm_init_io_emulation(pkvm); ++ if (ret) ++ goto out; ++ ++ ret = pkvm_init_pci(pkvm); ++ if (ret) ++ goto out; ++ ++ for_each_possible_cpu(cpu) { ++ ret = pkvm_setup_pcpu(pkvm, cpu); ++ if (ret) ++ goto out; ++ ret = pkvm_host_setup_vcpu(pkvm, cpu); ++ if (ret) ++ goto out; ++ } ++ ++ ret = pkvm_host_deprivilege_cpus(pkvm); ++ if (ret) ++ goto out; ++ ++ pkvm->num_cpus = num_possible_cpus(); ++ pkvm_init_debugfs(); ++ ++ return pkvm_init_finalise(); ++ ++out: ++ pkvm_sym(pkvm_hyp) = NULL; ++ return ret; ++} +diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c +index 2251b60920f8..6ab29b869914 100644 +--- a/arch/x86/kvm/vmx/vmcs12.c ++++ b/arch/x86/kvm/vmx/vmcs12.c +@@ -112,6 +112,8 @@ const unsigned short vmcs12_field_offsets[] = { + FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), + FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), + FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value), ++ FIELD(PLE_GAP, ple_gap), ++ FIELD(PLE_WINDOW, ple_window), + FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), + FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), + FIELD(CR0_READ_SHADOW, cr0_read_shadow), +@@ -150,5 +152,9 @@ const unsigned short vmcs12_field_offsets[] = { + FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), + FIELD(HOST_RSP, host_rsp), + FIELD(HOST_RIP, host_rip), ++ FIELD(EXIT_IO_RCX, exit_io_rcx), ++ FIELD(EXIT_IO_RSI, exit_io_rsi), ++ FIELD(EXIT_IO_RDI, exit_io_rdi), ++ FIELD(EXIT_IO_RIP, exit_io_rip), + }; + const unsigned int nr_vmcs12_fields = 
ARRAY_SIZE(vmcs12_field_offsets); +diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h +index 746129ddd5ae..86299ccc97e7 100644 +--- a/arch/x86/kvm/vmx/vmcs12.h ++++ b/arch/x86/kvm/vmx/vmcs12.h +@@ -117,7 +117,11 @@ struct __packed vmcs12 { + natural_width host_ia32_sysenter_eip; + natural_width host_rsp; + natural_width host_rip; +- natural_width paddingl[8]; /* room for future expansion */ ++ natural_width exit_io_rcx; ++ natural_width exit_io_rsi; ++ natural_width exit_io_rdi; ++ natural_width exit_io_rip; ++ natural_width paddingl[4]; /* room for future expansion */ + u32 pin_based_vm_exec_control; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; +@@ -165,7 +169,9 @@ struct __packed vmcs12 { + u32 guest_sysenter_cs; + u32 host_ia32_sysenter_cs; + u32 vmx_preemption_timer_value; +- u32 padding32[7]; /* room for future expansion */ ++ u32 ple_gap; ++ u32 ple_window; ++ u32 padding32[5]; /* room for future expansion */ + u16 virtual_processor_id; + u16 posted_intr_nv; + u16 guest_es_selector; +@@ -293,6 +299,10 @@ static inline void vmx_check_vmcs12_offsets(void) + CHECK_OFFSET(host_ia32_sysenter_eip, 656); + CHECK_OFFSET(host_rsp, 664); + CHECK_OFFSET(host_rip, 672); ++ CHECK_OFFSET(exit_io_rcx, 680); ++ CHECK_OFFSET(exit_io_rsi, 688); ++ CHECK_OFFSET(exit_io_rdi, 696); ++ CHECK_OFFSET(exit_io_rip, 704); + CHECK_OFFSET(pin_based_vm_exec_control, 744); + CHECK_OFFSET(cpu_based_vm_exec_control, 748); + CHECK_OFFSET(exception_bitmap, 752); +@@ -340,6 +350,8 @@ static inline void vmx_check_vmcs12_offsets(void) + CHECK_OFFSET(guest_sysenter_cs, 920); + CHECK_OFFSET(host_ia32_sysenter_cs, 924); + CHECK_OFFSET(vmx_preemption_timer_value, 928); ++ CHECK_OFFSET(ple_gap, 932); ++ CHECK_OFFSET(ple_window, 936); + CHECK_OFFSET(virtual_processor_id, 960); + CHECK_OFFSET(posted_intr_nv, 962); + CHECK_OFFSET(guest_es_selector, 964); +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 31a10d774df6..816e3be927f4 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -48,6 +48,7 @@ + #include + #include + #include ++#include + + #include "capabilities.h" + #include "cpuid.h" +@@ -66,6 +67,7 @@ + #include "vmcs12.h" + #include "vmx.h" + #include "x86.h" ++#include "vmx_lib.h" + + MODULE_AUTHOR("Qumranet"); + MODULE_LICENSE("GPL"); +@@ -119,9 +121,6 @@ module_param(nested, bool, S_IRUGO); + bool __read_mostly enable_pml = 1; + module_param_named(pml, enable_pml, bool, S_IRUGO); + +-static bool __read_mostly error_on_inconsistent_vmcs_config = true; +-module_param(error_on_inconsistent_vmcs_config, bool, 0444); +- + static bool __read_mostly dump_invalid_vmcs = 0; + module_param(dump_invalid_vmcs, bool, 0644); + +@@ -2536,6 +2535,11 @@ static void vmx_hardware_disable(void) + intel_pt_handle_vmx(0); + } + ++void free_vmcs(struct vmcs *vmcs) ++{ ++ free_page((unsigned long)vmcs); ++} ++ + /* + * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID + * directly instead of going through cpu_has(), to ensure KVM is trapping +@@ -2571,192 +2575,31 @@ static bool cpu_has_perf_global_ctrl_bug(void) + return false; + } + +-static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, +- u32 msr, u32 *result) +-{ +- u32 vmx_msr_low, vmx_msr_high; +- u32 ctl = ctl_min | ctl_opt; +- +- rdmsr(msr, vmx_msr_low, vmx_msr_high); +- +- ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ +- ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ +- +- /* Ensure minimum (required) set of control bits are supported. 
*/ +- if (ctl_min & ~ctl) +- return -EIO; +- +- *result = ctl; +- return 0; +-} +- +-static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +-{ +- u64 allowed; +- +- rdmsrl(msr, allowed); +- +- return ctl_opt & allowed; +-} +- + static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, + struct vmx_capability *vmx_cap) + { +- u32 vmx_msr_low, vmx_msr_high; +- u32 _pin_based_exec_control = 0; +- u32 _cpu_based_exec_control = 0; +- u32 _cpu_based_2nd_exec_control = 0; +- u64 _cpu_based_3rd_exec_control = 0; +- u32 _vmexit_control = 0; +- u32 _vmentry_control = 0; +- u64 misc_msr; +- int i; +- +- /* +- * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. +- * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always +- * intercepts writes to PAT and EFER, i.e. never enables those controls. +- */ +- struct { +- u32 entry_control; +- u32 exit_control; +- } const vmcs_entry_exit_pairs[] = { +- { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, +- { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, +- { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, +- { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, +- { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, ++ struct vmcs_config_setting setting = { ++ .cpu_based_exec_ctrl_min = KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, ++ .cpu_based_exec_ctrl_opt = KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, ++ .cpu_based_2nd_exec_ctrl_min = KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, ++ .cpu_based_2nd_exec_ctrl_opt = KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, ++ .pin_based_exec_ctrl_min = KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, ++ .pin_based_exec_ctrl_opt = KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, ++ .vmexit_ctrl_min = KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, ++ .vmexit_ctrl_opt = KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, ++ .vmentry_ctrl_min = KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, ++ .vmentry_ctrl_opt = KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, ++ .has_broken_vmx_preemption_timer = cpu_has_broken_vmx_preemption_timer(), ++ .perf_global_ctrl_workaround = cpu_has_perf_global_ctrl_bug(), + }; +- +- memset(vmcs_conf, 0, sizeof(*vmcs_conf)); +- +- if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, +- KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, +- MSR_IA32_VMX_PROCBASED_CTLS, +- &_cpu_based_exec_control)) +- return -EIO; +- if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { +- if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, +- KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, +- MSR_IA32_VMX_PROCBASED_CTLS2, +- &_cpu_based_2nd_exec_control)) +- return -EIO; +- } +-#ifndef CONFIG_X86_64 +- if (!(_cpu_based_2nd_exec_control & +- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) +- _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; +-#endif +- +- if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) +- _cpu_based_2nd_exec_control &= ~( +- SECONDARY_EXEC_APIC_REGISTER_VIRT | +- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | +- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); +- +- rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, +- &vmx_cap->ept, &vmx_cap->vpid); +- +- if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) && +- vmx_cap->ept) { +- pr_warn_once("EPT CAP should not exist if not support " +- "1-setting enable EPT VM-execution control\n"); +- +- if (error_on_inconsistent_vmcs_config) +- return -EIO; +- +- vmx_cap->ept = 0; +- } +- if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && +- vmx_cap->vpid) { +- pr_warn_once("VPID CAP should not exist if not support " 
+- "1-setting enable VPID VM-execution control\n"); +- +- if (error_on_inconsistent_vmcs_config) +- return -EIO; +- +- vmx_cap->vpid = 0; +- } ++ int ret; + + if (!cpu_has_sgx()) +- _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING; +- +- if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) +- _cpu_based_3rd_exec_control = +- adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL, +- MSR_IA32_VMX_PROCBASED_CTLS3); +- +- if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, +- KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, +- MSR_IA32_VMX_EXIT_CTLS, +- &_vmexit_control)) +- return -EIO; +- +- if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, +- KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, +- MSR_IA32_VMX_PINBASED_CTLS, +- &_pin_based_exec_control)) +- return -EIO; ++ setting.cpu_based_2nd_exec_ctrl_opt &= ~SECONDARY_EXEC_ENCLS_EXITING; + +- if (cpu_has_broken_vmx_preemption_timer()) +- _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; +- if (!(_cpu_based_2nd_exec_control & +- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) +- _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; +- +- if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, +- KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, +- MSR_IA32_VMX_ENTRY_CTLS, +- &_vmentry_control)) +- return -EIO; +- +- for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { +- u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; +- u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; +- +- if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) +- continue; +- +- pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", +- _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); +- +- if (error_on_inconsistent_vmcs_config) +- return -EIO; +- +- _vmentry_control &= ~n_ctrl; +- _vmexit_control &= ~x_ctrl; +- } +- +- rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); +- +- /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ +- if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) +- return -EIO; +- +-#ifdef CONFIG_X86_64 +- /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ +- if (vmx_msr_high & (1u<<16)) +- return -EIO; +-#endif +- +- /* Require Write-Back (WB) memory type for VMCS accesses. 
*/ +- if (((vmx_msr_high >> 18) & 15) != 6) +- return -EIO; +- +- rdmsrl(MSR_IA32_VMX_MISC, misc_msr); +- +- vmcs_conf->size = vmx_msr_high & 0x1fff; +- vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; +- +- vmcs_conf->revision_id = vmx_msr_low; +- +- vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; +- vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; +- vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; +- vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; +- vmcs_conf->vmexit_ctrl = _vmexit_control; +- vmcs_conf->vmentry_ctrl = _vmentry_control; +- vmcs_conf->misc = misc_msr; ++ ret = __setup_vmcs_config(vmcs_conf, vmx_cap, &setting); ++ if (ret < 0) ++ return ret; + + return 0; + } +@@ -2784,11 +2627,6 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) + return vmcs; + } + +-void free_vmcs(struct vmcs *vmcs) +-{ +- free_page((unsigned long)vmcs); +-} +- + /* + * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded + */ +@@ -4847,18 +4685,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) + + static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) + { +- exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); ++ _vmx_enable_irq_window(to_vmx(vcpu)); + } + + static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) + { +- if (!enable_vnmi || +- vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { +- vmx_enable_irq_window(vcpu); +- return; +- } +- +- exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); ++ _vmx_enable_nmi_window(to_vmx(vcpu), enable_vnmi); + } + + static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) +@@ -7319,6 +7151,8 @@ static void vmx_vcpu_free(struct kvm_vcpu *vcpu) + free_vpid(vmx->vpid); + nested_vmx_free_vcpu(vcpu); + free_loaded_vmcs(vmx->loaded_vmcs); ++ ++ pkvm_teardown_shadow_vcpu(vcpu); + } + + static int vmx_vcpu_create(struct kvm_vcpu *vcpu) +@@ -7416,7 +7250,7 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) + WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], + __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + +- return 0; ++ return pkvm_init_shadow_vcpu(vcpu); + + free_vmcs: + free_loaded_vmcs(vmx->loaded_vmcs); +@@ -7427,6 +7261,15 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) + return err; + } + ++static bool vmx_is_vm_type_supported(unsigned long type) ++{ ++#ifdef CONFIG_PKVM_INTEL ++ if (type == KVM_X86_PROTECTED_VM) ++ return true; ++#endif ++ return type == KVM_X86_DEFAULT_VM; ++} ++ + #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" + #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. 
See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" + +@@ -7458,7 +7301,13 @@ static int vmx_vm_init(struct kvm *kvm) + break; + } + } +- return 0; ++ ++ return pkvm_init_shadow_vm(kvm); ++} ++ ++static void vmx_vm_free(struct kvm *kvm) ++{ ++ pkvm_teardown_shadow_vm(kvm); + } + + static int __init vmx_check_processor_compat(void) +@@ -8106,9 +7955,11 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { + .hardware_disable = vmx_hardware_disable, + .has_emulated_msr = vmx_has_emulated_msr, + ++ .is_vm_type_supported = vmx_is_vm_type_supported, + .vm_size = sizeof(struct kvm_vmx), + .vm_init = vmx_vm_init, + .vm_destroy = vmx_vm_destroy, ++ .vm_free = vmx_vm_free, + + .vcpu_precreate = vmx_vcpu_precreate, + .vcpu_create = vmx_vcpu_create, +@@ -8385,6 +8236,17 @@ static __init int hardware_setup(void) + } + #endif + ++#if IS_ENABLED(CONFIG_PKVM_INTEL) ++ if (!enable_ept || vmx_x86_ops.tlb_remote_flush || ++ vmx_x86_ops.tlb_remote_flush_with_range) { ++ pr_err_ratelimited("kvm: EPT or tlb_remote_flush ops not available to pKVM-IA\n"); ++ return -EOPNOTSUPP; ++ } ++ vmx_x86_ops.tlb_remote_flush = pkvm_tlb_remote_flush; ++ vmx_x86_ops.tlb_remote_flush_with_range = ++ pkvm_tlb_remote_flush_with_range; ++#endif ++ + if (!cpu_has_vmx_ple()) { + ple_gap = 0; + ple_window = 0; +@@ -8496,6 +8358,9 @@ static __init int hardware_setup(void) + } + + static struct kvm_x86_init_ops vmx_init_ops __initdata = { ++#ifdef CONFIG_PKVM_INTEL ++ .pkvm_init = pkvm_init, ++#endif + .cpu_has_kvm_support = cpu_has_kvm_support, + .disabled_by_bios = vmx_disabled_by_bios, + .check_processor_compatibility = vmx_check_processor_compat, +diff --git a/arch/x86/kvm/vmx/vmx_lib.h b/arch/x86/kvm/vmx/vmx_lib.h +new file mode 100644 +index 000000000000..38bae15db417 +--- /dev/null ++++ b/arch/x86/kvm/vmx/vmx_lib.h +@@ -0,0 +1,241 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#ifndef __KVM_X86_VMX_LIB_H ++#define __KVM_X86_VMX_LIB_H ++ ++static bool __read_mostly error_on_inconsistent_vmcs_config = true; ++module_param(error_on_inconsistent_vmcs_config, bool, 0444); ++ ++#ifndef __PKVM_HYP__ ++struct vmcs_config_setting { ++ u32 cpu_based_exec_ctrl_min; ++ u32 cpu_based_exec_ctrl_opt; ++ u32 cpu_based_2nd_exec_ctrl_min; ++ u32 cpu_based_2nd_exec_ctrl_opt; ++ u32 pin_based_exec_ctrl_min; ++ u32 pin_based_exec_ctrl_opt; ++ u32 vmexit_ctrl_min; ++ u32 vmexit_ctrl_opt; ++ u32 vmentry_ctrl_min; ++ u32 vmentry_ctrl_opt; ++ bool has_broken_vmx_preemption_timer; ++ bool perf_global_ctrl_workaround; ++}; ++ ++ ++static inline u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) ++{ ++ u64 allowed; ++ ++ rdmsrl(msr, allowed); ++ ++ return ctl_opt & allowed; ++} ++ ++__attribute__((optimize(0))) ++static inline int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, ++ u32 msr, u32 *result) ++{ ++ u32 vmx_msr_low, vmx_msr_high; ++ u32 ctl = ctl_min | ctl_opt; ++ ++ rdmsr(msr, vmx_msr_low, vmx_msr_high); ++ ++ ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ ++ ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ ++ ++ /* Ensure minimum (required) set of control bits are supported. 
*/ ++ if (ctl_min & ~ctl) ++ return -EIO; ++ ++ *result = ctl; ++ return 0; ++} ++ ++__attribute__((optimize(1))) ++static inline int __setup_vmcs_config(struct vmcs_config *vmcs_conf, ++ struct vmx_capability *vmx_cap, ++ struct vmcs_config_setting *setting) ++{ ++ u32 vmx_msr_low, vmx_msr_high; ++ u32 min, opt; ++ u32 _pin_based_exec_control = 0; ++ u32 _cpu_based_exec_control = 0; ++ u32 _cpu_based_2nd_exec_control = 0; ++ u64 _cpu_based_3rd_exec_control = 0; ++ u32 _vmexit_control = 0; ++ u32 _vmentry_control = 0; ++ u64 misc_msr; ++ int i; ++ ++ /* ++ * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. ++ * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always ++ * intercepts writes to PAT and EFER, i.e. never enables those controls. ++ */ ++ struct { ++ u32 entry_control; ++ u32 exit_control; ++ } const vmcs_entry_exit_pairs[] = { ++ { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, ++ { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, ++ { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, ++ { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, ++ { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, ++ }; ++ ++ memset(vmcs_conf, 0, sizeof(*vmcs_conf)); ++ ++ min = setting->cpu_based_exec_ctrl_min; ++ opt = setting->cpu_based_exec_ctrl_opt; ++ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, ++ &_cpu_based_exec_control) < 0) ++ return -EIO; ++ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { ++ min = setting->cpu_based_2nd_exec_ctrl_min; ++ opt = setting->cpu_based_2nd_exec_ctrl_opt; ++ if (adjust_vmx_controls(min, opt, ++ MSR_IA32_VMX_PROCBASED_CTLS2, ++ &_cpu_based_2nd_exec_control) < 0) ++ return -EIO; ++ } ++#ifndef CONFIG_X86_64 ++ if (!(_cpu_based_2nd_exec_control & ++ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) ++ _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; ++#endif ++ ++ if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) ++ _cpu_based_2nd_exec_control &= ~( ++ SECONDARY_EXEC_APIC_REGISTER_VIRT | ++ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | ++ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); ++ ++ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, ++ &vmx_cap->ept, &vmx_cap->vpid); ++ ++ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) && ++ vmx_cap->ept) { ++ pr_warn_once("EPT CAP should not exist if not support " ++ "1-setting enable EPT VM-execution control\n"); ++ ++ if (error_on_inconsistent_vmcs_config) ++ return -EIO; ++ ++ vmx_cap->ept = 0; ++ } ++ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && ++ vmx_cap->vpid) { ++ pr_warn_once("VPID CAP should not exist if not support " ++ "1-setting enable VPID VM-execution control\n"); ++ ++ if (error_on_inconsistent_vmcs_config) ++ return -EIO; ++ ++ vmx_cap->vpid = 0; ++ } ++ ++ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) ++ _cpu_based_3rd_exec_control = ++ adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL, ++ MSR_IA32_VMX_PROCBASED_CTLS3); ++ ++ ++ min = setting->vmexit_ctrl_min; ++ opt = setting->vmexit_ctrl_opt; ++ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, ++ &_vmexit_control) < 0) ++ return -EIO; ++ ++ min = setting->pin_based_exec_ctrl_min; ++ opt = setting->pin_based_exec_ctrl_opt; ++ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, ++ &_pin_based_exec_control) < 0) ++ return -EIO; ++ ++ if (setting->has_broken_vmx_preemption_timer) ++ _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; ++ if (!(_cpu_based_2nd_exec_control & ++ 
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) ++ _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; ++ ++ min = setting->vmentry_ctrl_min; ++ opt = setting->vmentry_ctrl_opt; ++ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, ++ &_vmentry_control) < 0) ++ return -EIO; ++ ++ if (setting->perf_global_ctrl_workaround) { ++ _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; ++ _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; ++ pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " ++ "does not work properly. Using workaround\n"); ++ } ++ ++ for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { ++ u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; ++ u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; ++ ++ if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) ++ continue; ++ ++ pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", ++ _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); ++ ++ if (error_on_inconsistent_vmcs_config) ++ return -EIO; ++ ++ _vmentry_control &= ~n_ctrl; ++ _vmexit_control &= ~x_ctrl; ++ } ++ ++ rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); ++ ++ /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ ++ if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) ++ return -EIO; ++ ++#ifdef CONFIG_X86_64 ++ /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ ++ if (vmx_msr_high & (1u<<16)) ++ return -EIO; ++#endif ++ ++ /* Require Write-Back (WB) memory type for VMCS accesses. */ ++ if (((vmx_msr_high >> 18) & 15) != 6) ++ return -EIO; ++ ++ rdmsrl(MSR_IA32_VMX_MISC, misc_msr); ++ ++ vmcs_conf->size = vmx_msr_high & 0x1fff; ++ vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; ++ ++ vmcs_conf->revision_id = vmx_msr_low; ++ ++ vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; ++ vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; ++ vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; ++ vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; ++ vmcs_conf->vmexit_ctrl = _vmexit_control; ++ vmcs_conf->vmentry_ctrl = _vmentry_control; ++ ++ return 0; ++} ++#endif /* !__PKVM_HYP__*/ ++ ++static inline void _vmx_enable_irq_window(struct vcpu_vmx *vmx) ++{ ++ exec_controls_setbit(vmx, CPU_BASED_INTR_WINDOW_EXITING); ++} ++ ++static inline void _vmx_enable_nmi_window(struct vcpu_vmx *vmx, bool vnmi_enabled) ++{ ++ if (!vnmi_enabled || ++ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { ++ _vmx_enable_irq_window(vmx); ++ return; ++ } ++ ++ exec_controls_setbit(vmx, CPU_BASED_NMI_WINDOW_EXITING); ++} ++ ++#endif +diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h +index ec268df83ed6..6ca9ef128033 100644 +--- a/arch/x86/kvm/vmx/vmx_ops.h ++++ b/arch/x86/kvm/vmx/vmx_ops.h +@@ -67,6 +67,12 @@ static __always_inline void vmcs_checkl(unsigned long field) + "Natural width accessor invalid for 32-bit field"); + } + ++#ifdef __PKVM_HYP__ ++ ++#include "pkvm/hyp/vmx_ops.h" ++ ++#else ++ + static __always_inline unsigned long __vmcs_readl(unsigned long field) + { + unsigned long value; +@@ -278,6 +284,7 @@ static inline void vmcs_load(struct vmcs *vmcs) + + vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr); + } ++#endif /*__PKVM_HYP__*/ + + static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) + { +@@ -312,6 +319,12 @@ static inline void vpid_sync_vcpu_global(void) + __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); + } + ++static inline void ept_sync_global(void) ++{ ++ __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); ++} ++ ++#ifndef 
__PKVM_HYP__ + static inline void vpid_sync_context(int vpid) + { + if (cpu_has_vmx_invvpid_single()) +@@ -331,11 +344,6 @@ static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr) + vpid_sync_context(vpid); + } + +-static inline void ept_sync_global(void) +-{ +- __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); +-} +- + static inline void ept_sync_context(u64 eptp) + { + if (cpu_has_vmx_invept_context()) +@@ -343,5 +351,6 @@ static inline void ept_sync_context(u64 eptp) + else + ept_sync_global(); + } ++#endif /* __PKVM_HYP__ */ + + #endif /* __KVM_X86_VMX_INSN_H */ +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 4d6baae1ae74..a8f7fb5729bd 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -4549,6 +4549,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = kvm_caps.has_notify_vmexit; + break; ++ case KVM_CAP_VM_TYPES: ++ r = BIT(KVM_X86_DEFAULT_VM); ++ if (static_call(kvm_x86_is_vm_type_supported)(KVM_X86_PROTECTED_VM)) ++ r |= BIT(KVM_X86_PROTECTED_VM); ++ break; + default: + break; + } +@@ -9404,6 +9409,14 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) + return -EEXIST; + } + ++#ifdef CONFIG_PKVM_INTEL ++ r = ops->pkvm_init(); ++ if (r) { ++ pr_err_ratelimited("kvm: pkvm init fail\n"); ++ return r; ++ } ++#endif ++ + if (!ops->cpu_has_kvm_support()) { + pr_err_ratelimited("kvm: no hardware support for '%s'\n", + ops->runtime_ops->name); +@@ -9702,11 +9715,53 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu) + return kvm_skip_emulated_instruction(vcpu); + } + ++int kvm_pkvm_hypercall(struct kvm_vcpu *vcpu) ++{ ++ unsigned long val, nr; ++ int size; ++ gpa_t gpa; ++ int ret; ++ ++ nr = kvm_rax_read(vcpu); ++ gpa = kvm_rbx_read(vcpu); ++ size = kvm_rcx_read(vcpu); ++ val = kvm_rdx_read(vcpu); ++ ++ /* ++ * Reuse the sev_es handler to emulate the mmio. ++ */ ++ switch (nr) { ++ case PKVM_GHC_IOREAD: ++ vcpu->mmio_is_write = 0; ++ ret = kvm_sev_es_mmio_read(vcpu, gpa, size, ++ &vcpu->arch.regs[VCPU_REGS_RAX]); ++ break; ++ case PKVM_GHC_IOWRITE: ++ vcpu->mmio_is_write = 1; ++ ret = kvm_sev_es_mmio_write(vcpu, gpa, size, &val); ++ break; ++ default: ++ ret = 1; ++ break; ++ } ++ ++ /* ++ * We assume calling this function will always succeed which will update ++ * the GUEST_RIP to skip the current instruction. 
++ */ ++ static_call(kvm_x86_skip_emulated_instruction)(vcpu); ++ ++ return ret; ++} ++ + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) + { + unsigned long nr, a0, a1, a2, a3, ret; + int op_64_bit; + ++ if (vcpu->kvm->arch.vm_type == KVM_X86_PROTECTED_VM) ++ return kvm_pkvm_hypercall(vcpu); ++ + if (kvm_xen_hypercall_enabled(vcpu->kvm)) + return kvm_xen_hypercall(vcpu); + +@@ -12446,9 +12501,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) + int ret; + unsigned long flags; + +- if (type) ++ if (!static_call(kvm_x86_is_vm_type_supported)(type)) + return -EINVAL; + ++ kvm->arch.vm_type = type; ++ + ret = kvm_page_track_init(kvm); + if (ret) + goto out; +@@ -12641,6 +12698,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) + kvm_page_track_cleanup(kvm); + kvm_xen_destroy_vm(kvm); + kvm_hv_destroy_vm(kvm); ++ static_call_cond(kvm_x86_vm_free)(kvm); + } + + static void memslot_rmap_free(struct kvm_memory_slot *slot) +diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c +index 5f0ce77a259d..0d619878a0aa 100644 +--- a/arch/x86/mm/pat/set_memory.c ++++ b/arch/x86/mm/pat/set_memory.c +@@ -32,6 +32,7 @@ + #include + #include + #include ++#include + + #include "../mm_internal.h" + +@@ -2121,6 +2122,9 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) + + static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) + { ++ if (pkvm_is_protected_guest()) ++ return pkvm_set_mem_host_visibility(addr, numpages, enc); ++ + if (hv_is_isolation_supported()) + return hv_set_mem_host_visibility(addr, numpages, !enc); + +diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c +index 1f925285104e..30d9954ffb60 100644 +--- a/drivers/iommu/intel/debugfs.c ++++ b/drivers/iommu/intel/debugfs.c +@@ -136,13 +136,13 @@ static int iommu_regset_show(struct seq_file *m, void *unused) + */ + raw_spin_lock_irqsave(&iommu->register_lock, flag); + for (i = 0 ; i < ARRAY_SIZE(iommu_regs_32); i++) { +- value = dmar_readl(iommu->reg + iommu_regs_32[i].offset); ++ value = dmar_readl(iommu, iommu_regs_32[i].offset); + seq_printf(m, "%-16s\t0x%02x\t\t0x%016llx\n", + iommu_regs_32[i].regs, iommu_regs_32[i].offset, + value); + } + for (i = 0 ; i < ARRAY_SIZE(iommu_regs_64); i++) { +- value = dmar_readq(iommu->reg + iommu_regs_64[i].offset); ++ value = dmar_readq(iommu, iommu_regs_64[i].offset); + seq_printf(m, "%-16s\t0x%02x\t\t0x%016llx\n", + iommu_regs_64[i].regs, iommu_regs_64[i].offset, + value); +@@ -250,7 +250,7 @@ static void ctx_tbl_walk(struct seq_file *m, struct intel_iommu *iommu, u16 bus) + tbl_wlk.ctx_entry = context; + m->private = &tbl_wlk; + +- if (dmar_readq(iommu->reg + DMAR_RTADDR_REG) & DMA_RTADDR_SMT) { ++ if (dmar_readq(iommu, DMAR_RTADDR_REG) & DMA_RTADDR_SMT) { + pasid_dir_ptr = context->lo & VTD_PAGE_MASK; + pasid_dir_size = get_pasid_dir_size(context); + pasid_dir_walk(m, pasid_dir_ptr, pasid_dir_size); +@@ -288,7 +288,7 @@ static int dmar_translation_struct_show(struct seq_file *m, void *unused) + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { +- sts = dmar_readl(iommu->reg + DMAR_GSTS_REG); ++ sts = dmar_readl(iommu, DMAR_GSTS_REG); + if (!(sts & DMA_GSTS_TES)) { + seq_printf(m, "DMA Remapping is not enabled on %s\n", + iommu->name); +@@ -441,8 +441,8 @@ static int invalidation_queue_show(struct seq_file *m, void *unused) + raw_spin_lock_irqsave(&qi->q_lock, flags); + seq_printf(m, " Base: 0x%llx\tHead: %lld\tTail: %lld\n", + (u64)virt_to_phys(qi->desc), +- dmar_readq(iommu->reg + DMAR_IQH_REG) >> 
shift, +- dmar_readq(iommu->reg + DMAR_IQT_REG) >> shift); ++ dmar_readq(iommu, DMAR_IQH_REG) >> shift, ++ dmar_readq(iommu, DMAR_IQT_REG) >> shift); + invalidation_queue_entry_show(m, iommu); + raw_spin_unlock_irqrestore(&qi->q_lock, flags); + seq_putc(m, '\n'); +@@ -523,7 +523,7 @@ static int ir_translation_struct_show(struct seq_file *m, void *unused) + seq_printf(m, "Remapped Interrupt supported on IOMMU: %s\n", + iommu->name); + +- sts = dmar_readl(iommu->reg + DMAR_GSTS_REG); ++ sts = dmar_readl(iommu, DMAR_GSTS_REG); + if (iommu->ir_table && (sts & DMA_GSTS_IRES)) { + irta = virt_to_phys(iommu->ir_table->base); + seq_printf(m, " IR table address:%llx\n", irta); +diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c +index f800989ea046..36a22c0a6f30 100644 +--- a/drivers/iommu/intel/dmar.c ++++ b/drivers/iommu/intel/dmar.c +@@ -878,7 +878,7 @@ static int __ref + dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg) + { + struct acpi_dmar_hardware_unit *drhd; +- void __iomem *addr; ++ struct intel_iommu iommu; + u64 cap, ecap; + + drhd = (void *)entry; +@@ -887,22 +887,23 @@ dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg) + return -EINVAL; + } + ++ iommu.reg_phys = drhd->address; + if (arg) +- addr = ioremap(drhd->address, VTD_PAGE_SIZE); ++ iommu.reg = ioremap(drhd->address, VTD_PAGE_SIZE); + else +- addr = early_ioremap(drhd->address, VTD_PAGE_SIZE); +- if (!addr) { ++ iommu.reg = early_ioremap(drhd->address, VTD_PAGE_SIZE); ++ if (!iommu.reg) { + pr_warn("Can't validate DRHD address: %llx\n", drhd->address); + return -EINVAL; + } + +- cap = dmar_readq(addr + DMAR_CAP_REG); +- ecap = dmar_readq(addr + DMAR_ECAP_REG); ++ cap = dmar_readq(&iommu, DMAR_CAP_REG); ++ ecap = dmar_readq(&iommu, DMAR_ECAP_REG); + + if (arg) +- iounmap(addr); ++ iounmap(iommu.reg); + else +- early_iounmap(addr, VTD_PAGE_SIZE); ++ early_iounmap(iommu.reg, VTD_PAGE_SIZE); + + if (cap == (uint64_t)-1 && ecap == (uint64_t)-1) { + warn_invalid_dmar(drhd->address, " returns all ones"); +@@ -981,16 +982,19 @@ static int map_iommu(struct intel_iommu *iommu, u64 phys_addr) + goto release; + } + +- iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG); +- iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG); ++ iommu->cap = dmar_readq(iommu, DMAR_CAP_REG); ++ iommu->ecap = dmar_readq(iommu, DMAR_ECAP_REG); + + if (iommu->cap == (uint64_t)-1 && iommu->ecap == (uint64_t)-1) { + err = -EINVAL; + warn_invalid_dmar(phys_addr, " returns all ones"); + goto unmap; + } ++#ifdef CONFIG_PKVM_INTEL ++ pkvm_update_iommu_virtual_caps(&iommu->cap, &iommu->ecap); ++#endif + if (ecap_vcs(iommu->ecap)) +- iommu->vccap = dmar_readq(iommu->reg + DMAR_VCCAP_REG); ++ iommu->vccap = dmar_readq(iommu, DMAR_VCCAP_REG); + + /* the registers might be more than one page */ + map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap), +@@ -1087,7 +1091,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) + + iommu->node = NUMA_NO_NODE; + +- ver = readl(iommu->reg + DMAR_VER_REG); ++ ver = dmar_readl(iommu, DMAR_VER_REG); + pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n", + iommu->name, + (unsigned long long)drhd->reg_base_addr, +@@ -1096,7 +1100,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) + (unsigned long long)iommu->ecap); + + /* Reflect status in gcmd */ +- sts = readl(iommu->reg + DMAR_GSTS_REG); ++ sts = dmar_readl(iommu, DMAR_GSTS_REG); + if (sts & DMA_GSTS_IRES) + iommu->gcmd |= DMA_GCMD_IRE; + if (sts & DMA_GSTS_TES) +@@ -1211,8 +1215,8 @@ static const char 
*qi_type_string(u8 type) + + static void qi_dump_fault(struct intel_iommu *iommu, u32 fault) + { +- unsigned int head = dmar_readl(iommu->reg + DMAR_IQH_REG); +- u64 iqe_err = dmar_readq(iommu->reg + DMAR_IQER_REG); ++ unsigned int head = dmar_readl(iommu, DMAR_IQH_REG); ++ u64 iqe_err = dmar_readq(iommu, DMAR_IQER_REG); + struct qi_desc *desc = iommu->qi->desc + head; + + if (fault & DMA_FSTS_IQE) +@@ -1250,7 +1254,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) + if (qi->desc_status[wait_index] == QI_ABORT) + return -EAGAIN; + +- fault = readl(iommu->reg + DMAR_FSTS_REG); ++ fault = dmar_readl(iommu, DMAR_FSTS_REG); + if (fault & (DMA_FSTS_IQE | DMA_FSTS_ITE | DMA_FSTS_ICE)) + qi_dump_fault(iommu, fault); + +@@ -1260,7 +1264,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) + * is cleared. + */ + if (fault & DMA_FSTS_IQE) { +- head = readl(iommu->reg + DMAR_IQH_REG); ++ head = dmar_readl(iommu, DMAR_IQH_REG); + if ((head >> shift) == index) { + struct qi_desc *desc = qi->desc + head; + +@@ -1271,7 +1275,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) + */ + memcpy(desc, qi->desc + (wait_index << shift), + 1 << shift); +- writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG); ++ dmar_writel(iommu, DMAR_FSTS_REG, DMA_FSTS_IQE); + pr_info("Invalidation Queue Error (IQE) cleared\n"); + return -EINVAL; + } +@@ -1282,13 +1286,13 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) + * No new descriptors are fetched until the ITE is cleared. + */ + if (fault & DMA_FSTS_ITE) { +- head = readl(iommu->reg + DMAR_IQH_REG); ++ head = dmar_readl(iommu, DMAR_IQH_REG); + head = ((head >> shift) - 1 + QI_LENGTH) % QI_LENGTH; + head |= 1; +- tail = readl(iommu->reg + DMAR_IQT_REG); ++ tail = dmar_readl(iommu, DMAR_IQT_REG); + tail = ((tail >> shift) - 1 + QI_LENGTH) % QI_LENGTH; + +- writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG); ++ dmar_writel(iommu, DMAR_FSTS_REG, DMA_FSTS_ITE); + pr_info("Invalidation Time-out Error (ITE) cleared\n"); + + do { +@@ -1302,7 +1306,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) + } + + if (fault & DMA_FSTS_ICE) { +- writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG); ++ dmar_writel(iommu, DMAR_FSTS_REG, DMA_FSTS_ICE); + pr_info("Invalidation Completion Error (ICE) cleared\n"); + } + +@@ -1393,7 +1397,7 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, + * update the HW tail register indicating the presence of + * new descriptors. + */ +- writel(qi->free_head << shift, iommu->reg + DMAR_IQT_REG); ++ dmar_writel(iommu, DMAR_IQT_REG, qi->free_head << shift); + + while (qi->desc_status[wait_index] != QI_DONE) { + /* +@@ -1621,22 +1625,22 @@ void dmar_disable_qi(struct intel_iommu *iommu) + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + +- sts = readl(iommu->reg + DMAR_GSTS_REG); ++ sts = dmar_readl(iommu, DMAR_GSTS_REG); + if (!(sts & DMA_GSTS_QIES)) + goto end; + + /* + * Give a chance to HW to complete the pending invalidation requests. 
+ */ +- while ((readl(iommu->reg + DMAR_IQT_REG) != +- readl(iommu->reg + DMAR_IQH_REG)) && ++ while ((dmar_readl(iommu, DMAR_IQT_REG) != ++ dmar_readl(iommu, DMAR_IQH_REG)) && + (DMAR_OPERATION_TIMEOUT > (get_cycles() - start_time))) + cpu_relax(); + + iommu->gcmd &= ~DMA_GCMD_QIE; +- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd); + +- IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, ++ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl, + !(sts & DMA_GSTS_QIES), sts); + end: + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); +@@ -1665,15 +1669,15 @@ static void __dmar_enable_qi(struct intel_iommu *iommu) + raw_spin_lock_irqsave(&iommu->register_lock, flags); + + /* write zero to the tail reg */ +- writel(0, iommu->reg + DMAR_IQT_REG); ++ dmar_writel(iommu, DMAR_IQT_REG, 0); + +- dmar_writeq(iommu->reg + DMAR_IQA_REG, val); ++ dmar_writeq(iommu, DMAR_IQA_REG, val); + + iommu->gcmd |= DMA_GCMD_QIE; +- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd); + + /* Make sure hardware complete it */ +- IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_QIES), sts); ++ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl, (sts & DMA_GSTS_QIES), sts); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + } +@@ -1865,9 +1869,9 @@ void dmar_msi_unmask(struct irq_data *data) + + /* unmask it */ + raw_spin_lock_irqsave(&iommu->register_lock, flag); +- writel(0, iommu->reg + reg); ++ dmar_writel(iommu, reg, 0); + /* Read a reg to force flush the post write */ +- readl(iommu->reg + reg); ++ dmar_readl(iommu, reg); + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } + +@@ -1879,9 +1883,9 @@ void dmar_msi_mask(struct irq_data *data) + + /* mask it */ + raw_spin_lock_irqsave(&iommu->register_lock, flag); +- writel(DMA_FECTL_IM, iommu->reg + reg); ++ dmar_writel(iommu, reg, DMA_FECTL_IM); + /* Read a reg to force flush the post write */ +- readl(iommu->reg + reg); ++ dmar_readl(iommu, reg); + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } + +@@ -1892,9 +1896,9 @@ void dmar_msi_write(int irq, struct msi_msg *msg) + unsigned long flag; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); +- writel(msg->data, iommu->reg + reg + 4); +- writel(msg->address_lo, iommu->reg + reg + 8); +- writel(msg->address_hi, iommu->reg + reg + 12); ++ dmar_writel(iommu, reg + 4, msg->data); ++ dmar_writel(iommu, reg + 8, msg->address_lo); ++ dmar_writel(iommu, reg + 12, msg->address_hi); + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } + +@@ -1905,9 +1909,9 @@ void dmar_msi_read(int irq, struct msi_msg *msg) + unsigned long flag; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); +- msg->data = readl(iommu->reg + reg + 4); +- msg->address_lo = readl(iommu->reg + reg + 8); +- msg->address_hi = readl(iommu->reg + reg + 12); ++ msg->data = dmar_readl(iommu, reg + 4); ++ msg->address_lo = dmar_readl(iommu, reg + 8); ++ msg->address_hi = dmar_readl(iommu, reg + 12); + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } + +@@ -1959,7 +1963,7 @@ irqreturn_t dmar_fault(int irq, void *dev_id) + DEFAULT_RATELIMIT_BURST); + + raw_spin_lock_irqsave(&iommu->register_lock, flag); +- fault_status = readl(iommu->reg + DMAR_FSTS_REG); ++ fault_status = dmar_readl(iommu, DMAR_FSTS_REG); + if (fault_status && __ratelimit(&rs)) + pr_err("DRHD: handling fault status reg %x\n", fault_status); + +@@ -1981,7 +1985,7 @@ irqreturn_t dmar_fault(int irq, void *dev_id) + bool pasid_present; + 
+ /* highest 32 bits */ +- data = readl(iommu->reg + reg + ++ data = dmar_readl(iommu, reg + + fault_index * PRIMARY_FAULT_REG_LEN + 12); + if (!(data & DMA_FRCD_F)) + break; +@@ -1991,19 +1995,19 @@ irqreturn_t dmar_fault(int irq, void *dev_id) + type = dma_frcd_type(data); + + pasid = dma_frcd_pasid_value(data); +- data = readl(iommu->reg + reg + ++ data = dmar_readl(iommu, reg + + fault_index * PRIMARY_FAULT_REG_LEN + 8); + source_id = dma_frcd_source_id(data); + + pasid_present = dma_frcd_pasid_present(data); +- guest_addr = dmar_readq(iommu->reg + reg + ++ guest_addr = dmar_readq(iommu, reg + + fault_index * PRIMARY_FAULT_REG_LEN); + guest_addr = dma_frcd_page_addr(guest_addr); + } + + /* clear the fault */ +- writel(DMA_FRCD_F, iommu->reg + reg + +- fault_index * PRIMARY_FAULT_REG_LEN + 12); ++ dmar_writel(iommu, reg + ++ fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + +@@ -2019,8 +2023,8 @@ irqreturn_t dmar_fault(int irq, void *dev_id) + raw_spin_lock_irqsave(&iommu->register_lock, flag); + } + +- writel(DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_PRO, +- iommu->reg + DMAR_FSTS_REG); ++ dmar_writel(iommu, DMAR_FSTS_REG, ++ DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_PRO); + + unlock_exit: + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); +@@ -2073,8 +2077,8 @@ int __init enable_drhd_fault_handling(void) + * Clear any previous faults. + */ + dmar_fault(iommu->irq, iommu); +- fault_status = readl(iommu->reg + DMAR_FSTS_REG); +- writel(fault_status, iommu->reg + DMAR_FSTS_REG); ++ fault_status = dmar_readl(iommu, DMAR_FSTS_REG); ++ dmar_writel(iommu, DMAR_FSTS_REG, fault_status); + } + + return 0; +diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c +index 5c4f5aa8e87e..f706d7c36207 100644 +--- a/drivers/iommu/intel/iommu.c ++++ b/drivers/iommu/intel/iommu.c +@@ -309,7 +309,7 @@ static void init_translation_status(struct intel_iommu *iommu) + { + u32 gsts; + +- gsts = readl(iommu->reg + DMAR_GSTS_REG); ++ gsts = dmar_readl(iommu, DMAR_GSTS_REG); + if (gsts & DMA_GSTS_TES) + iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; + } +@@ -1227,13 +1227,13 @@ static void iommu_set_root_entry(struct intel_iommu *iommu) + addr |= DMA_RTADDR_SMT; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); +- dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); ++ dmar_writeq(iommu, DMAR_RTADDR_REG, addr); + +- writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd | DMA_GCMD_SRTP); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +- readl, (sts & DMA_GSTS_RTPS), sts); ++ dmar_readl, (sts & DMA_GSTS_RTPS), sts); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + +@@ -1259,11 +1259,11 @@ void iommu_flush_write_buffer(struct intel_iommu *iommu) + return; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); +- writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd | DMA_GCMD_WBF); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +- readl, (!(val & DMA_GSTS_WBFS)), val); ++ dmar_readl, (!(val & DMA_GSTS_WBFS)), val); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } +@@ -1293,7 +1293,7 @@ static void __iommu_flush_context(struct intel_iommu *iommu, + val |= DMA_CCMD_ICC; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); +- dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); ++ dmar_writeq(iommu, DMAR_CCMD_REG, val); + + /* 
Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, +@@ -1341,8 +1341,8 @@ static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, + raw_spin_lock_irqsave(&iommu->register_lock, flag); + /* Note: Only uses first TLB reg currently */ + if (val_iva) +- dmar_writeq(iommu->reg + tlb_offset, val_iva); +- dmar_writeq(iommu->reg + tlb_offset + 8, val); ++ dmar_writeq(iommu, tlb_offset, val_iva); ++ dmar_writeq(iommu, tlb_offset + 8, val); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, tlb_offset + 8, +@@ -1619,13 +1619,13 @@ static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) + return; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); +- pmen = readl(iommu->reg + DMAR_PMEN_REG); ++ pmen = dmar_readl(iommu, DMAR_PMEN_REG); + pmen &= ~DMA_PMEN_EPM; +- writel(pmen, iommu->reg + DMAR_PMEN_REG); ++ dmar_writel(iommu, DMAR_PMEN_REG, pmen); + + /* wait for the protected region status bit to clear */ + IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, +- readl, !(pmen & DMA_PMEN_PRS), pmen); ++ dmar_readl, !(pmen & DMA_PMEN_PRS), pmen); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + } +@@ -1637,11 +1637,11 @@ static void iommu_enable_translation(struct intel_iommu *iommu) + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + iommu->gcmd |= DMA_GCMD_TE; +- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +- readl, (sts & DMA_GSTS_TES), sts); ++ dmar_readl, (sts & DMA_GSTS_TES), sts); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + } +@@ -1657,11 +1657,11 @@ static void iommu_disable_translation(struct intel_iommu *iommu) + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + iommu->gcmd &= ~DMA_GCMD_TE; +- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +- readl, (!(sts & DMA_GSTS_TES)), sts); ++ dmar_readl, (!(sts & DMA_GSTS_TES)), sts); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } +@@ -2764,7 +2764,7 @@ static int copy_translation_tables(struct intel_iommu *iommu) + int bus, ret; + bool new_ext, ext; + +- rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); ++ rtaddr_reg = dmar_readq(iommu, DMAR_RTADDR_REG); + ext = !!(rtaddr_reg & DMA_RTADDR_SMT); + new_ext = !!sm_supported(iommu); + +@@ -3171,13 +3171,13 @@ static int iommu_suspend(void) + raw_spin_lock_irqsave(&iommu->register_lock, flag); + + iommu->iommu_state[SR_DMAR_FECTL_REG] = +- readl(iommu->reg + DMAR_FECTL_REG); ++ dmar_readl(iommu, DMAR_FECTL_REG); + iommu->iommu_state[SR_DMAR_FEDATA_REG] = +- readl(iommu->reg + DMAR_FEDATA_REG); ++ dmar_readl(iommu, DMAR_FEDATA_REG); + iommu->iommu_state[SR_DMAR_FEADDR_REG] = +- readl(iommu->reg + DMAR_FEADDR_REG); ++ dmar_readl(iommu, DMAR_FEADDR_REG); + iommu->iommu_state[SR_DMAR_FEUADDR_REG] = +- readl(iommu->reg + DMAR_FEUADDR_REG); ++ dmar_readl(iommu, DMAR_FEUADDR_REG); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } +@@ -3202,14 +3202,14 @@ static void iommu_resume(void) + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + +- writel(iommu->iommu_state[SR_DMAR_FECTL_REG], +- iommu->reg + DMAR_FECTL_REG); +- writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], +- iommu->reg + DMAR_FEDATA_REG); +- writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], +- iommu->reg + DMAR_FEADDR_REG); +- 
writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], +- iommu->reg + DMAR_FEUADDR_REG); ++ dmar_writel(iommu, DMAR_FECTL_REG, ++ iommu->iommu_state[SR_DMAR_FECTL_REG]); ++ dmar_writel(iommu, DMAR_FEDATA_REG, ++ iommu->iommu_state[SR_DMAR_FEDATA_REG]); ++ dmar_writel(iommu, DMAR_FEADDR_REG, ++ iommu->iommu_state[SR_DMAR_FEADDR_REG]); ++ dmar_writel(iommu, DMAR_FEUADDR_REG, ++ iommu->iommu_state[SR_DMAR_FEUADDR_REG]); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } +@@ -3785,7 +3785,7 @@ static ssize_t version_show(struct device *dev, + struct device_attribute *attr, char *buf) + { + struct intel_iommu *iommu = dev_to_intel_iommu(dev); +- u32 ver = readl(iommu->reg + DMAR_VER_REG); ++ u32 ver = dmar_readl(iommu, DMAR_VER_REG); + return sprintf(buf, "%d:%d\n", + DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); + } +diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h +index c99cb715bd9a..0cc852fa9713 100644 +--- a/drivers/iommu/intel/iommu.h ++++ b/drivers/iommu/intel/iommu.h +@@ -135,10 +135,18 @@ + + #define OFFSET_STRIDE (9) + +-#define dmar_readq(a) readq(a) +-#define dmar_writeq(a,v) writeq(v,a) +-#define dmar_readl(a) readl(a) +-#define dmar_writel(a, v) writel(v, a) ++#ifdef CONFIG_PKVM_INTEL ++#include ++#define dmar_readq(iommu, o) pkvm_readq((iommu)->reg, (iommu)->reg_phys, o) ++#define dmar_writeq(iommu, o, v) pkvm_writeq((iommu)->reg, (iommu)->reg_phys, o, v) ++#define dmar_readl(iommu, o) pkvm_readl((iommu)->reg, (iommu)->reg_phys, o) ++#define dmar_writel(iommu, o, v) pkvm_writel((iommu)->reg, (iommu)->reg_phys, o, v) ++#else ++#define dmar_readq(iommu, o) readq((iommu)->reg + o) ++#define dmar_writeq(iommu, o, v) writeq(v, (iommu)->reg + o) ++#define dmar_readl(iommu, o) readl((iommu)->reg + o) ++#define dmar_writel(iommu, o, v) writel(v, (iommu)->reg + o) ++#endif + + #define DMAR_VER_MAJOR(v) (((v) & 0xf0) >> 4) + #define DMAR_VER_MINOR(v) ((v) & 0x0f) +@@ -313,7 +321,7 @@ + do { \ + cycles_t start_time = get_cycles(); \ + while (1) { \ +- sts = op(iommu->reg + offset); \ ++ sts = op(iommu, offset); \ + if (cond) \ + break; \ + if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\ +diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c +index 5962bb5027d0..65cff7667ee4 100644 +--- a/drivers/iommu/intel/irq_remapping.c ++++ b/drivers/iommu/intel/irq_remapping.c +@@ -97,7 +97,7 @@ static void init_ir_status(struct intel_iommu *iommu) + { + u32 gsts; + +- gsts = readl(iommu->reg + DMAR_GSTS_REG); ++ gsts = dmar_readl(iommu, DMAR_GSTS_REG); + if (gsts & DMA_GSTS_IRES) + iommu->flags |= VTD_FLAG_IRQ_REMAP_PRE_ENABLED; + } +@@ -437,7 +437,7 @@ static int iommu_load_old_irte(struct intel_iommu *iommu) + u64 irta; + + /* Check whether the old ir-table has the same size as ours */ +- irta = dmar_readq(iommu->reg + DMAR_IRTA_REG); ++ irta = dmar_readq(iommu, DMAR_IRTA_REG); + if ((irta & INTR_REMAP_TABLE_REG_SIZE_MASK) + != INTR_REMAP_TABLE_REG_SIZE) + return -EINVAL; +@@ -480,14 +480,14 @@ static void iommu_set_irq_remapping(struct intel_iommu *iommu, int mode) + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + +- dmar_writeq(iommu->reg + DMAR_IRTA_REG, ++ dmar_writeq(iommu, DMAR_IRTA_REG, + (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE); + + /* Set interrupt-remapping table pointer */ +- writel(iommu->gcmd | DMA_GCMD_SIRTP, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd | DMA_GCMD_SIRTP); + + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +- readl, (sts & DMA_GSTS_IRTPS), sts); ++ 
dmar_readl, (sts & DMA_GSTS_IRTPS), sts); + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + /* +@@ -507,16 +507,16 @@ static void iommu_enable_irq_remapping(struct intel_iommu *iommu) + + /* Enable interrupt-remapping */ + iommu->gcmd |= DMA_GCMD_IRE; +- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd); + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +- readl, (sts & DMA_GSTS_IRES), sts); ++ dmar_readl, (sts & DMA_GSTS_IRES), sts); + + /* Block compatibility-format MSIs */ + if (sts & DMA_GSTS_CFIS) { + iommu->gcmd &= ~DMA_GCMD_CFI; +- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd); + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +- readl, !(sts & DMA_GSTS_CFIS), sts); ++ dmar_readl, !(sts & DMA_GSTS_CFIS), sts); + } + + /* +@@ -686,15 +686,15 @@ static void iommu_disable_irq_remapping(struct intel_iommu *iommu) + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + +- sts = readl(iommu->reg + DMAR_GSTS_REG); ++ sts = dmar_readl(iommu, DMAR_GSTS_REG); + if (!(sts & DMA_GSTS_IRES)) + goto end; + + iommu->gcmd &= ~DMA_GCMD_IRE; +- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd); + + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +- readl, !(sts & DMA_GSTS_IRES), sts); ++ dmar_readl, !(sts & DMA_GSTS_IRES), sts); + + end: + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); +diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c +index 3f03039e5cce..dcf0b02187e7 100644 +--- a/drivers/iommu/intel/pasid.c ++++ b/drivers/iommu/intel/pasid.c +@@ -34,7 +34,7 @@ int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid) + u64 res; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); +- dmar_writeq(iommu->reg + DMAR_VCMD_REG, VCMD_CMD_ALLOC); ++ dmar_writeq(iommu, DMAR_VCMD_REG, VCMD_CMD_ALLOC); + IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, + !(res & VCMD_VRSP_IP), res); + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); +@@ -64,7 +64,7 @@ void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid) + u64 res; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); +- dmar_writeq(iommu->reg + DMAR_VCMD_REG, ++ dmar_writeq(iommu, DMAR_VCMD_REG, + VCMD_CMD_OPERAND(pasid) | VCMD_CMD_FREE); + IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, + !(res & VCMD_VRSP_IP), res); +diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c +index 03b25358946c..326ff8f03f68 100644 +--- a/drivers/iommu/intel/svm.c ++++ b/drivers/iommu/intel/svm.c +@@ -107,9 +107,9 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) + iommu->name); + goto free_iopfq; + } +- dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); +- dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); +- dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); ++ dmar_writeq(iommu, DMAR_PQH_REG, 0ULL); ++ dmar_writeq(iommu, DMAR_PQT_REG, 0ULL); ++ dmar_writeq(iommu, DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); + + init_completion(&iommu->prq_complete); + +@@ -130,9 +130,9 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) + + int intel_svm_finish_prq(struct intel_iommu *iommu) + { +- dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); +- dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); +- dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL); ++ dmar_writeq(iommu, DMAR_PQH_REG, 0ULL); ++ dmar_writeq(iommu, DMAR_PQT_REG, 0ULL); ++ dmar_writeq(iommu, DMAR_PQA_REG, 0ULL); + + if (iommu->pr_irq) { + free_irq(iommu->pr_irq, iommu); +@@ -536,8 +536,8 @@ static void 
intel_svm_drain_prq(struct device *dev, u32 pasid) + */ + prq_retry: + reinit_completion(&iommu->prq_complete); +- tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; +- head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; ++ tail = dmar_readq(iommu, DMAR_PQT_REG) & PRQ_RING_MASK; ++ head = dmar_readq(iommu, DMAR_PQH_REG) & PRQ_RING_MASK; + while (head != tail) { + struct page_req_dsc *req; + +@@ -585,7 +585,7 @@ static void intel_svm_drain_prq(struct device *dev, u32 pasid) + qi_retry: + reinit_completion(&iommu->prq_complete); + qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); +- if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { ++ if (dmar_readl(iommu, DMAR_PRS_REG) & DMA_PRS_PRO) { + wait_for_completion(&iommu->prq_complete); + goto qi_retry; + } +@@ -702,10 +702,10 @@ static irqreturn_t prq_event_thread(int irq, void *d) + * Clear PPR bit before reading head/tail registers, to ensure that + * we get a new interrupt if needed. + */ +- writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); ++ dmar_writel(iommu, DMAR_PRS_REG, DMA_PRS_PPR); + +- tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; +- head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; ++ tail = dmar_readq(iommu, DMAR_PQT_REG) & PRQ_RING_MASK; ++ head = dmar_readq(iommu, DMAR_PQH_REG) & PRQ_RING_MASK; + handled = (head != tail); + while (head != tail) { + req = &iommu->prq[head / sizeof(*req)]; +@@ -762,20 +762,20 @@ static irqreturn_t prq_event_thread(int irq, void *d) + head = (head + sizeof(*req)) & PRQ_RING_MASK; + } + +- dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); ++ dmar_writeq(iommu, DMAR_PQH_REG, tail); + + /* + * Clear the page request overflow bit and wake up all threads that + * are waiting for the completion of this handling. + */ +- if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { ++ if (dmar_readl(iommu, DMAR_PRS_REG) & DMA_PRS_PRO) { + pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n", + iommu->name); +- head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; +- tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; ++ head = dmar_readq(iommu, DMAR_PQH_REG) & PRQ_RING_MASK; ++ tail = dmar_readq(iommu, DMAR_PQT_REG) & PRQ_RING_MASK; + if (head == tail) { + iopf_queue_discard_partial(iommu->iopf_queue); +- writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); ++ dmar_writel(iommu, DMAR_PRS_REG, DMA_PRS_PRO); + pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", + iommu->name); + } +diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h +index 7ad6f51b3d91..17ae144de43d 100644 +--- a/include/asm-generic/vmlinux.lds.h ++++ b/include/asm-generic/vmlinux.lds.h +@@ -439,6 +439,22 @@ + __end_ro_after_init = .; + #endif + ++#ifdef CONFIG_PKVM_INTEL ++#include ++#define PKVM_RODATA \ ++ PKVM_SECTION_NAME(.rodata) : \ ++ AT(ADDR(PKVM_SECTION_NAME(.rodata)) - LOAD_OFFSET) { \ ++ . = ALIGN(PAGE_SIZE); \ ++ __pkvm_rodata_start = .; \ ++ *(PKVM_SECTION_NAME(.rodata)) \ ++ *(PKVM_SECTION_NAME(.data..ro_after_init)) \ ++ . = ALIGN(PAGE_SIZE); \ ++ __pkvm_rodata_end = .; \ ++ } ++#else ++#define PKVM_RODATA ++#endif ++ + /* + * .kcfi_traps contains a list KCFI trap locations. + */ +diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h +new file mode 100644 +index 000000000000..81da7107e3bd +--- /dev/null ++++ b/include/linux/intel-iommu.h +@@ -0,0 +1,863 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++ * Copyright © 2006-2015, Intel Corporation. 
++ * ++ * Authors: Ashok Raj ++ * Anil S Keshavamurthy ++ * David Woodhouse ++ */ ++ ++#ifndef _INTEL_IOMMU_H_ ++#define _INTEL_IOMMU_H_ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++/* ++ * VT-d hardware uses 4KiB page size regardless of host page size. ++ */ ++#define VTD_PAGE_SHIFT (12) ++#define VTD_PAGE_SIZE (1UL << VTD_PAGE_SHIFT) ++#define VTD_PAGE_MASK (((u64)-1) << VTD_PAGE_SHIFT) ++#define VTD_PAGE_ALIGN(addr) (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK) ++ ++#define VTD_STRIDE_SHIFT (9) ++#define VTD_STRIDE_MASK (((u64)-1) << VTD_STRIDE_SHIFT) ++ ++#define DMA_PTE_READ BIT_ULL(0) ++#define DMA_PTE_WRITE BIT_ULL(1) ++#define DMA_PTE_LARGE_PAGE BIT_ULL(7) ++#define DMA_PTE_SNP BIT_ULL(11) ++ ++#define DMA_FL_PTE_PRESENT BIT_ULL(0) ++#define DMA_FL_PTE_US BIT_ULL(2) ++#define DMA_FL_PTE_ACCESS BIT_ULL(5) ++#define DMA_FL_PTE_DIRTY BIT_ULL(6) ++#define DMA_FL_PTE_XD BIT_ULL(63) ++ ++#define ADDR_WIDTH_5LEVEL (57) ++#define ADDR_WIDTH_4LEVEL (48) ++ ++#define CONTEXT_TT_MULTI_LEVEL 0 ++#define CONTEXT_TT_DEV_IOTLB 1 ++#define CONTEXT_TT_PASS_THROUGH 2 ++#define CONTEXT_PASIDE BIT_ULL(3) ++ ++/* ++ * Intel IOMMU register specification per version 1.0 public spec. ++ */ ++#define DMAR_VER_REG 0x0 /* Arch version supported by this IOMMU */ ++#define DMAR_CAP_REG 0x8 /* Hardware supported capabilities */ ++#define DMAR_ECAP_REG 0x10 /* Extended capabilities supported */ ++#define DMAR_GCMD_REG 0x18 /* Global command register */ ++#define DMAR_GSTS_REG 0x1c /* Global status register */ ++#define DMAR_RTADDR_REG 0x20 /* Root entry table */ ++#define DMAR_CCMD_REG 0x28 /* Context command reg */ ++#define DMAR_FSTS_REG 0x34 /* Fault Status register */ ++#define DMAR_FECTL_REG 0x38 /* Fault control register */ ++#define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */ ++#define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */ ++#define DMAR_FEUADDR_REG 0x44 /* Upper address register */ ++#define DMAR_AFLOG_REG 0x58 /* Advanced Fault control */ ++#define DMAR_PMEN_REG 0x64 /* Enable Protected Memory Region */ ++#define DMAR_PLMBASE_REG 0x68 /* PMRR Low addr */ ++#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */ ++#define DMAR_PHMBASE_REG 0x70 /* pmrr high base addr */ ++#define DMAR_PHMLIMIT_REG 0x78 /* pmrr high limit */ ++#define DMAR_IQH_REG 0x80 /* Invalidation queue head register */ ++#define DMAR_IQT_REG 0x88 /* Invalidation queue tail register */ ++#define DMAR_IQ_SHIFT 4 /* Invalidation queue head/tail shift */ ++#define DMAR_IQA_REG 0x90 /* Invalidation queue addr register */ ++#define DMAR_ICS_REG 0x9c /* Invalidation complete status register */ ++#define DMAR_IQER_REG 0xb0 /* Invalidation queue error record register */ ++#define DMAR_IRTA_REG 0xb8 /* Interrupt remapping table addr register */ ++#define DMAR_PQH_REG 0xc0 /* Page request queue head register */ ++#define DMAR_PQT_REG 0xc8 /* Page request queue tail register */ ++#define DMAR_PQA_REG 0xd0 /* Page request queue address register */ ++#define DMAR_PRS_REG 0xdc /* Page request status register */ ++#define DMAR_PECTL_REG 0xe0 /* Page request event control register */ ++#define DMAR_PEDATA_REG 0xe4 /* Page request event interrupt data register */ ++#define DMAR_PEADDR_REG 0xe8 /* Page request event interrupt addr register */ ++#define DMAR_PEUADDR_REG 0xec /* Page request event Upper address register */ ++#define DMAR_MTRRCAP_REG 0x100 /* MTRR capability register */ ++#define 
DMAR_MTRRDEF_REG 0x108 /* MTRR default type register */ ++#define DMAR_MTRR_FIX64K_00000_REG 0x120 /* MTRR Fixed range registers */ ++#define DMAR_MTRR_FIX16K_80000_REG 0x128 ++#define DMAR_MTRR_FIX16K_A0000_REG 0x130 ++#define DMAR_MTRR_FIX4K_C0000_REG 0x138 ++#define DMAR_MTRR_FIX4K_C8000_REG 0x140 ++#define DMAR_MTRR_FIX4K_D0000_REG 0x148 ++#define DMAR_MTRR_FIX4K_D8000_REG 0x150 ++#define DMAR_MTRR_FIX4K_E0000_REG 0x158 ++#define DMAR_MTRR_FIX4K_E8000_REG 0x160 ++#define DMAR_MTRR_FIX4K_F0000_REG 0x168 ++#define DMAR_MTRR_FIX4K_F8000_REG 0x170 ++#define DMAR_MTRR_PHYSBASE0_REG 0x180 /* MTRR Variable range registers */ ++#define DMAR_MTRR_PHYSMASK0_REG 0x188 ++#define DMAR_MTRR_PHYSBASE1_REG 0x190 ++#define DMAR_MTRR_PHYSMASK1_REG 0x198 ++#define DMAR_MTRR_PHYSBASE2_REG 0x1a0 ++#define DMAR_MTRR_PHYSMASK2_REG 0x1a8 ++#define DMAR_MTRR_PHYSBASE3_REG 0x1b0 ++#define DMAR_MTRR_PHYSMASK3_REG 0x1b8 ++#define DMAR_MTRR_PHYSBASE4_REG 0x1c0 ++#define DMAR_MTRR_PHYSMASK4_REG 0x1c8 ++#define DMAR_MTRR_PHYSBASE5_REG 0x1d0 ++#define DMAR_MTRR_PHYSMASK5_REG 0x1d8 ++#define DMAR_MTRR_PHYSBASE6_REG 0x1e0 ++#define DMAR_MTRR_PHYSMASK6_REG 0x1e8 ++#define DMAR_MTRR_PHYSBASE7_REG 0x1f0 ++#define DMAR_MTRR_PHYSMASK7_REG 0x1f8 ++#define DMAR_MTRR_PHYSBASE8_REG 0x200 ++#define DMAR_MTRR_PHYSMASK8_REG 0x208 ++#define DMAR_MTRR_PHYSBASE9_REG 0x210 ++#define DMAR_MTRR_PHYSMASK9_REG 0x218 ++#define DMAR_VCCAP_REG 0xe30 /* Virtual command capability register */ ++#define DMAR_VCMD_REG 0xe00 /* Virtual command register */ ++#define DMAR_VCRSP_REG 0xe10 /* Virtual command response register */ ++ ++#define DMAR_IQER_REG_IQEI(reg) FIELD_GET(GENMASK_ULL(3, 0), reg) ++#define DMAR_IQER_REG_ITESID(reg) FIELD_GET(GENMASK_ULL(47, 32), reg) ++#define DMAR_IQER_REG_ICESID(reg) FIELD_GET(GENMASK_ULL(63, 48), reg) ++ ++#define OFFSET_STRIDE (9) ++ ++#define dmar_readq(a) readq(a) ++#define dmar_writeq(a,v) writeq(v,a) ++#define dmar_readl(a) readl(a) ++#define dmar_writel(a, v) writel(v, a) ++ ++#define DMAR_VER_MAJOR(v) (((v) & 0xf0) >> 4) ++#define DMAR_VER_MINOR(v) ((v) & 0x0f) ++ ++/* ++ * Decoding Capability Register ++ */ ++#define cap_5lp_support(c) (((c) >> 60) & 1) ++#define cap_pi_support(c) (((c) >> 59) & 1) ++#define cap_fl1gp_support(c) (((c) >> 56) & 1) ++#define cap_read_drain(c) (((c) >> 55) & 1) ++#define cap_write_drain(c) (((c) >> 54) & 1) ++#define cap_max_amask_val(c) (((c) >> 48) & 0x3f) ++#define cap_num_fault_regs(c) ((((c) >> 40) & 0xff) + 1) ++#define cap_pgsel_inv(c) (((c) >> 39) & 1) ++ ++#define cap_super_page_val(c) (((c) >> 34) & 0xf) ++#define cap_super_offset(c) (((find_first_bit(&cap_super_page_val(c), 4)) \ ++ * OFFSET_STRIDE) + 21) ++ ++#define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16) ++#define cap_max_fault_reg_offset(c) \ ++ (cap_fault_reg_offset(c) + cap_num_fault_regs(c) * 16) ++ ++#define cap_zlr(c) (((c) >> 22) & 1) ++#define cap_isoch(c) (((c) >> 23) & 1) ++#define cap_mgaw(c) ((((c) >> 16) & 0x3f) + 1) ++#define cap_sagaw(c) (((c) >> 8) & 0x1f) ++#define cap_caching_mode(c) (((c) >> 7) & 1) ++#define cap_phmr(c) (((c) >> 6) & 1) ++#define cap_plmr(c) (((c) >> 5) & 1) ++#define cap_rwbf(c) (((c) >> 4) & 1) ++#define cap_afl(c) (((c) >> 3) & 1) ++#define cap_ndoms(c) (((unsigned long)1) << (4 + 2 * ((c) & 0x7))) ++/* ++ * Extended Capability Register ++ */ ++ ++#define ecap_rps(e) (((e) >> 49) & 0x1) ++#define ecap_smpwc(e) (((e) >> 48) & 0x1) ++#define ecap_flts(e) (((e) >> 47) & 0x1) ++#define ecap_slts(e) (((e) >> 46) & 0x1) ++#define ecap_slads(e) (((e) >> 45) & 0x1) 
++#define ecap_vcs(e) (((e) >> 44) & 0x1) ++#define ecap_smts(e) (((e) >> 43) & 0x1) ++#define ecap_dit(e) (((e) >> 41) & 0x1) ++#define ecap_pds(e) (((e) >> 42) & 0x1) ++#define ecap_pasid(e) (((e) >> 40) & 0x1) ++#define ecap_pss(e) (((e) >> 35) & 0x1f) ++#define ecap_eafs(e) (((e) >> 34) & 0x1) ++#define ecap_nwfs(e) (((e) >> 33) & 0x1) ++#define ecap_srs(e) (((e) >> 31) & 0x1) ++#define ecap_ers(e) (((e) >> 30) & 0x1) ++#define ecap_prs(e) (((e) >> 29) & 0x1) ++#define ecap_broken_pasid(e) (((e) >> 28) & 0x1) ++#define ecap_dis(e) (((e) >> 27) & 0x1) ++#define ecap_nest(e) (((e) >> 26) & 0x1) ++#define ecap_mts(e) (((e) >> 25) & 0x1) ++#define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16) ++#define ecap_max_iotlb_offset(e) (ecap_iotlb_offset(e) + 16) ++#define ecap_coherent(e) ((e) & 0x1) ++#define ecap_qis(e) ((e) & 0x2) ++#define ecap_pass_through(e) (((e) >> 6) & 0x1) ++#define ecap_eim_support(e) (((e) >> 4) & 0x1) ++#define ecap_ir_support(e) (((e) >> 3) & 0x1) ++#define ecap_dev_iotlb_support(e) (((e) >> 2) & 0x1) ++#define ecap_max_handle_mask(e) (((e) >> 20) & 0xf) ++#define ecap_sc_support(e) (((e) >> 7) & 0x1) /* Snooping Control */ ++ ++/* Virtual command interface capability */ ++#define vccap_pasid(v) (((v) & DMA_VCS_PAS)) /* PASID allocation */ ++ ++/* IOTLB_REG */ ++#define DMA_TLB_FLUSH_GRANU_OFFSET 60 ++#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60) ++#define DMA_TLB_DSI_FLUSH (((u64)2) << 60) ++#define DMA_TLB_PSI_FLUSH (((u64)3) << 60) ++#define DMA_TLB_IIRG(type) ((type >> 60) & 3) ++#define DMA_TLB_IAIG(val) (((val) >> 57) & 3) ++#define DMA_TLB_READ_DRAIN (((u64)1) << 49) ++#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48) ++#define DMA_TLB_DID(id) (((u64)((id) & 0xffff)) << 32) ++#define DMA_TLB_IVT (((u64)1) << 63) ++#define DMA_TLB_IH_NONLEAF (((u64)1) << 6) ++#define DMA_TLB_MAX_SIZE (0x3f) ++ ++/* INVALID_DESC */ ++#define DMA_CCMD_INVL_GRANU_OFFSET 61 ++#define DMA_ID_TLB_GLOBAL_FLUSH (((u64)1) << 4) ++#define DMA_ID_TLB_DSI_FLUSH (((u64)2) << 4) ++#define DMA_ID_TLB_PSI_FLUSH (((u64)3) << 4) ++#define DMA_ID_TLB_READ_DRAIN (((u64)1) << 7) ++#define DMA_ID_TLB_WRITE_DRAIN (((u64)1) << 6) ++#define DMA_ID_TLB_DID(id) (((u64)((id & 0xffff) << 16))) ++#define DMA_ID_TLB_IH_NONLEAF (((u64)1) << 6) ++#define DMA_ID_TLB_ADDR(addr) (addr) ++#define DMA_ID_TLB_ADDR_MASK(mask) (mask) ++ ++/* PMEN_REG */ ++#define DMA_PMEN_EPM (((u32)1)<<31) ++#define DMA_PMEN_PRS (((u32)1)<<0) ++ ++/* GCMD_REG */ ++#define DMA_GCMD_TE (((u32)1) << 31) ++#define DMA_GCMD_SRTP (((u32)1) << 30) ++#define DMA_GCMD_SFL (((u32)1) << 29) ++#define DMA_GCMD_EAFL (((u32)1) << 28) ++#define DMA_GCMD_WBF (((u32)1) << 27) ++#define DMA_GCMD_QIE (((u32)1) << 26) ++#define DMA_GCMD_SIRTP (((u32)1) << 24) ++#define DMA_GCMD_IRE (((u32) 1) << 25) ++#define DMA_GCMD_CFI (((u32) 1) << 23) ++ ++/* GSTS_REG */ ++#define DMA_GSTS_TES (((u32)1) << 31) ++#define DMA_GSTS_RTPS (((u32)1) << 30) ++#define DMA_GSTS_FLS (((u32)1) << 29) ++#define DMA_GSTS_AFLS (((u32)1) << 28) ++#define DMA_GSTS_WBFS (((u32)1) << 27) ++#define DMA_GSTS_QIES (((u32)1) << 26) ++#define DMA_GSTS_IRTPS (((u32)1) << 24) ++#define DMA_GSTS_IRES (((u32)1) << 25) ++#define DMA_GSTS_CFIS (((u32)1) << 23) ++ ++/* DMA_RTADDR_REG */ ++#define DMA_RTADDR_SMT (((u64)1) << 10) ++ ++/* CCMD_REG */ ++#define DMA_CCMD_ICC (((u64)1) << 63) ++#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61) ++#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61) ++#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61) ++#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32) ++#define 
DMA_CCMD_MASK_NOBIT 0 ++#define DMA_CCMD_MASK_1BIT 1 ++#define DMA_CCMD_MASK_2BIT 2 ++#define DMA_CCMD_MASK_3BIT 3 ++#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16) ++#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff)) ++ ++/* FECTL_REG */ ++#define DMA_FECTL_IM (((u32)1) << 31) ++ ++/* FSTS_REG */ ++#define DMA_FSTS_PFO (1 << 0) /* Primary Fault Overflow */ ++#define DMA_FSTS_PPF (1 << 1) /* Primary Pending Fault */ ++#define DMA_FSTS_IQE (1 << 4) /* Invalidation Queue Error */ ++#define DMA_FSTS_ICE (1 << 5) /* Invalidation Completion Error */ ++#define DMA_FSTS_ITE (1 << 6) /* Invalidation Time-out Error */ ++#define DMA_FSTS_PRO (1 << 7) /* Page Request Overflow */ ++#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff) ++ ++/* FRCD_REG, 32 bits access */ ++#define DMA_FRCD_F (((u32)1) << 31) ++#define dma_frcd_type(d) ((d >> 30) & 1) ++#define dma_frcd_fault_reason(c) (c & 0xff) ++#define dma_frcd_source_id(c) (c & 0xffff) ++#define dma_frcd_pasid_value(c) (((c) >> 8) & 0xfffff) ++#define dma_frcd_pasid_present(c) (((c) >> 31) & 1) ++/* low 64 bit */ ++#define dma_frcd_page_addr(d) (d & (((u64)-1) << PAGE_SHIFT)) ++ ++/* PRS_REG */ ++#define DMA_PRS_PPR ((u32)1) ++#define DMA_PRS_PRO ((u32)2) ++ ++#define DMA_VCS_PAS ((u64)1) ++ ++#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ ++do { \ ++ cycles_t start_time = get_cycles(); \ ++ while (1) { \ ++ sts = op(iommu->reg + offset); \ ++ if (cond) \ ++ break; \ ++ if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\ ++ panic("DMAR hardware is malfunctioning\n"); \ ++ cpu_relax(); \ ++ } \ ++} while (0) ++ ++#define QI_LENGTH 256 /* queue length */ ++ ++enum { ++ QI_FREE, ++ QI_IN_USE, ++ QI_DONE, ++ QI_ABORT ++}; ++ ++#define QI_CC_TYPE 0x1 ++#define QI_IOTLB_TYPE 0x2 ++#define QI_DIOTLB_TYPE 0x3 ++#define QI_IEC_TYPE 0x4 ++#define QI_IWD_TYPE 0x5 ++#define QI_EIOTLB_TYPE 0x6 ++#define QI_PC_TYPE 0x7 ++#define QI_DEIOTLB_TYPE 0x8 ++#define QI_PGRP_RESP_TYPE 0x9 ++#define QI_PSTRM_RESP_TYPE 0xa ++ ++#define QI_IEC_SELECTIVE (((u64)1) << 4) ++#define QI_IEC_IIDEX(idx) (((u64)(idx & 0xffff) << 32)) ++#define QI_IEC_IM(m) (((u64)(m & 0x1f) << 27)) ++ ++#define QI_IWD_STATUS_DATA(d) (((u64)d) << 32) ++#define QI_IWD_STATUS_WRITE (((u64)1) << 5) ++#define QI_IWD_FENCE (((u64)1) << 6) ++#define QI_IWD_PRQ_DRAIN (((u64)1) << 7) ++ ++#define QI_IOTLB_DID(did) (((u64)did) << 16) ++#define QI_IOTLB_DR(dr) (((u64)dr) << 7) ++#define QI_IOTLB_DW(dw) (((u64)dw) << 6) ++#define QI_IOTLB_GRAN(gran) (((u64)gran) >> (DMA_TLB_FLUSH_GRANU_OFFSET-4)) ++#define QI_IOTLB_ADDR(addr) (((u64)addr) & VTD_PAGE_MASK) ++#define QI_IOTLB_IH(ih) (((u64)ih) << 6) ++#define QI_IOTLB_AM(am) (((u8)am) & 0x3f) ++ ++#define QI_CC_FM(fm) (((u64)fm) << 48) ++#define QI_CC_SID(sid) (((u64)sid) << 32) ++#define QI_CC_DID(did) (((u64)did) << 16) ++#define QI_CC_GRAN(gran) (((u64)gran) >> (DMA_CCMD_INVL_GRANU_OFFSET-4)) ++ ++#define QI_DEV_IOTLB_SID(sid) ((u64)((sid) & 0xffff) << 32) ++#define QI_DEV_IOTLB_QDEP(qdep) (((qdep) & 0x1f) << 16) ++#define QI_DEV_IOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) ++#define QI_DEV_IOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \ ++ ((u64)((pfsid >> 4) & 0xfff) << 52)) ++#define QI_DEV_IOTLB_SIZE 1 ++#define QI_DEV_IOTLB_MAX_INVS 32 ++ ++#define QI_PC_PASID(pasid) (((u64)pasid) << 32) ++#define QI_PC_DID(did) (((u64)did) << 16) ++#define QI_PC_GRAN(gran) (((u64)gran) << 4) ++ ++/* PASID cache invalidation granu */ ++#define QI_PC_ALL_PASIDS 0 ++#define QI_PC_PASID_SEL 1 ++#define QI_PC_GLOBAL 3 ++ ++#define 
QI_EIOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) ++#define QI_EIOTLB_IH(ih) (((u64)ih) << 6) ++#define QI_EIOTLB_AM(am) (((u64)am) & 0x3f) ++#define QI_EIOTLB_PASID(pasid) (((u64)pasid) << 32) ++#define QI_EIOTLB_DID(did) (((u64)did) << 16) ++#define QI_EIOTLB_GRAN(gran) (((u64)gran) << 4) ++ ++/* QI Dev-IOTLB inv granu */ ++#define QI_DEV_IOTLB_GRAN_ALL 1 ++#define QI_DEV_IOTLB_GRAN_PASID_SEL 0 ++ ++#define QI_DEV_EIOTLB_ADDR(a) ((u64)(a) & VTD_PAGE_MASK) ++#define QI_DEV_EIOTLB_SIZE (((u64)1) << 11) ++#define QI_DEV_EIOTLB_PASID(p) ((u64)((p) & 0xfffff) << 32) ++#define QI_DEV_EIOTLB_SID(sid) ((u64)((sid) & 0xffff) << 16) ++#define QI_DEV_EIOTLB_QDEP(qd) ((u64)((qd) & 0x1f) << 4) ++#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \ ++ ((u64)((pfsid >> 4) & 0xfff) << 52)) ++#define QI_DEV_EIOTLB_MAX_INVS 32 ++ ++/* Page group response descriptor QW0 */ ++#define QI_PGRP_PASID_P(p) (((u64)(p)) << 4) ++#define QI_PGRP_PDP(p) (((u64)(p)) << 5) ++#define QI_PGRP_RESP_CODE(res) (((u64)(res)) << 12) ++#define QI_PGRP_DID(rid) (((u64)(rid)) << 16) ++#define QI_PGRP_PASID(pasid) (((u64)(pasid)) << 32) ++ ++/* Page group response descriptor QW1 */ ++#define QI_PGRP_LPIG(x) (((u64)(x)) << 2) ++#define QI_PGRP_IDX(idx) (((u64)(idx)) << 3) ++ ++ ++#define QI_RESP_SUCCESS 0x0 ++#define QI_RESP_INVALID 0x1 ++#define QI_RESP_FAILURE 0xf ++ ++#define QI_GRAN_NONG_PASID 2 ++#define QI_GRAN_PSI_PASID 3 ++ ++#define qi_shift(iommu) (DMAR_IQ_SHIFT + !!ecap_smts((iommu)->ecap)) ++ ++struct qi_desc { ++ u64 qw0; ++ u64 qw1; ++ u64 qw2; ++ u64 qw3; ++}; ++ ++struct q_inval { ++ raw_spinlock_t q_lock; ++ void *desc; /* invalidation queue */ ++ int *desc_status; /* desc status */ ++ int free_head; /* first free entry */ ++ int free_tail; /* last free entry */ ++ int free_cnt; ++}; ++ ++struct dmar_pci_notify_info; ++ ++#ifdef CONFIG_IRQ_REMAP ++/* 1MB - maximum possible interrupt remapping table size */ ++#define INTR_REMAP_PAGE_ORDER 8 ++#define INTR_REMAP_TABLE_REG_SIZE 0xf ++#define INTR_REMAP_TABLE_REG_SIZE_MASK 0xf ++ ++#define INTR_REMAP_TABLE_ENTRIES 65536 ++ ++struct irq_domain; ++ ++struct ir_table { ++ struct irte *base; ++ unsigned long *bitmap; ++}; ++ ++void intel_irq_remap_add_device(struct dmar_pci_notify_info *info); ++#else ++static inline void ++intel_irq_remap_add_device(struct dmar_pci_notify_info *info) { } ++#endif ++ ++struct iommu_flush { ++ void (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid, ++ u8 fm, u64 type); ++ void (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr, ++ unsigned int size_order, u64 type); ++}; ++ ++enum { ++ SR_DMAR_FECTL_REG, ++ SR_DMAR_FEDATA_REG, ++ SR_DMAR_FEADDR_REG, ++ SR_DMAR_FEUADDR_REG, ++ MAX_SR_DMAR_REGS ++}; ++ ++#define VTD_FLAG_TRANS_PRE_ENABLED (1 << 0) ++#define VTD_FLAG_IRQ_REMAP_PRE_ENABLED (1 << 1) ++#define VTD_FLAG_SVM_CAPABLE (1 << 2) ++ ++extern int intel_iommu_sm; ++extern spinlock_t device_domain_lock; ++ ++#define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap)) ++#define pasid_supported(iommu) (sm_supported(iommu) && \ ++ ecap_pasid((iommu)->ecap)) ++ ++struct pasid_entry; ++struct pasid_state_entry; ++struct page_req_dsc; ++ ++/* ++ * 0: Present ++ * 1-11: Reserved ++ * 12-63: Context Ptr (12 - (haw-1)) ++ * 64-127: Reserved ++ */ ++struct root_entry { ++ u64 lo; ++ u64 hi; ++}; ++ ++/* ++ * low 64 bits: ++ * 0: present ++ * 1: fault processing disable ++ * 2-3: translation type ++ * 12-63: address space root ++ * high 64 bits: ++ * 0-2: address width ++ * 3-6: aval ++ * 8-23: domain 
id ++ */ ++struct context_entry { ++ u64 lo; ++ u64 hi; ++}; ++ ++/* si_domain contains mulitple devices */ ++#define DOMAIN_FLAG_STATIC_IDENTITY BIT(0) ++ ++/* ++ * When VT-d works in the scalable mode, it allows DMA translation to ++ * happen through either first level or second level page table. This ++ * bit marks that the DMA translation for the domain goes through the ++ * first level page table, otherwise, it goes through the second level. ++ */ ++#define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(1) ++ ++/* ++ * Domain represents a virtual machine which demands iommu nested ++ * translation mode support. ++ */ ++#define DOMAIN_FLAG_NESTING_MODE BIT(2) ++ ++struct dmar_domain { ++ int nid; /* node id */ ++ ++ unsigned int iommu_refcnt[DMAR_UNITS_SUPPORTED]; ++ /* Refcount of devices per iommu */ ++ ++ ++ u16 iommu_did[DMAR_UNITS_SUPPORTED]; ++ /* Domain ids per IOMMU. Use u16 since ++ * domain ids are 16 bit wide according ++ * to VT-d spec, section 9.3 */ ++ ++ u8 has_iotlb_device: 1; ++ u8 iommu_coherency: 1; /* indicate coherency of iommu access */ ++ u8 iommu_snooping: 1; /* indicate snooping control feature */ ++ ++ struct list_head devices; /* all devices' list */ ++ struct list_head subdevices; /* all subdevices' list */ ++ struct iova_domain iovad; /* iova's that belong to this domain */ ++ ++ struct dma_pte *pgd; /* virtual address */ ++ int gaw; /* max guest address width */ ++ ++ /* adjusted guest address width, 0 is level 2 30-bit */ ++ int agaw; ++ ++ int flags; /* flags to find out type of domain */ ++ int iommu_superpage;/* Level of superpages supported: ++ 0 == 4KiB (no superpages), 1 == 2MiB, ++ 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ ++ u64 max_addr; /* maximum mapped address */ ++ ++ u32 default_pasid; /* ++ * The default pasid used for non-SVM ++ * traffic on mediated devices. ++ */ ++ ++ struct iommu_domain domain; /* generic domain data structure for ++ iommu core */ ++}; ++ ++struct intel_iommu { ++ void __iomem *reg; /* Pointer to hardware regs, virtual addr */ ++ u64 reg_phys; /* physical address of hw register set */ ++ u64 reg_size; /* size of hw register set */ ++ u64 cap; ++ u64 ecap; ++ u64 vccap; ++ u32 gcmd; /* Holds TE, EAFL. 
Don't need SRTP, SFL, WBF */ ++ raw_spinlock_t register_lock; /* protect register handling */ ++ int seq_id; /* sequence id of the iommu */ ++ int agaw; /* agaw of this iommu */ ++ int msagaw; /* max sagaw of this iommu */ ++ unsigned int irq, pr_irq; ++ u16 segment; /* PCI segment# */ ++ unsigned char name[13]; /* Device Name */ ++ ++#ifdef CONFIG_INTEL_IOMMU ++ unsigned long *domain_ids; /* bitmap of domains */ ++ struct dmar_domain ***domains; /* ptr to domains */ ++ unsigned long *copied_tables; /* bitmap of copied tables */ ++ spinlock_t lock; /* protect context, domain ids */ ++ struct root_entry *root_entry; /* virtual address */ ++ ++ struct iommu_flush flush; ++#endif ++#ifdef CONFIG_INTEL_IOMMU_SVM ++ struct page_req_dsc *prq; ++ unsigned char prq_name[16]; /* Name for PRQ interrupt */ ++ struct completion prq_complete; ++ struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */ ++#endif ++ struct iopf_queue *iopf_queue; ++ unsigned char iopfq_name[16]; ++ struct q_inval *qi; /* Queued invalidation info */ ++ u32 *iommu_state; /* Store iommu states between suspend and resume.*/ ++ ++#ifdef CONFIG_IRQ_REMAP ++ struct ir_table *ir_table; /* Interrupt remapping info */ ++ struct irq_domain *ir_domain; ++ struct irq_domain *ir_msi_domain; ++#endif ++ struct iommu_device iommu; /* IOMMU core code handle */ ++ int node; ++ u32 flags; /* Software defined flags */ ++ ++ struct dmar_drhd_unit *drhd; ++ void *perf_statistic; ++}; ++ ++/* Per subdevice private data */ ++struct subdev_domain_info { ++ struct list_head link_phys; /* link to phys device siblings */ ++ struct list_head link_domain; /* link to domain siblings */ ++ struct device *pdev; /* physical device derived from */ ++ struct dmar_domain *domain; /* aux-domain */ ++ int users; /* user count */ ++}; ++ ++/* PCI domain-device relationship */ ++struct device_domain_info { ++ struct list_head link; /* link to domain siblings */ ++ struct list_head global; /* link to global list */ ++ struct list_head table; /* link to pasid table */ ++ struct list_head subdevices; /* subdevices sibling */ ++ u32 segment; /* PCI segment number */ ++ u8 bus; /* PCI bus number */ ++ u8 devfn; /* PCI devfn number */ ++ u16 pfsid; /* SRIOV physical function source ID */ ++ u8 pasid_supported:3; ++ u8 pasid_enabled:1; ++ u8 pri_supported:1; ++ u8 pri_enabled:1; ++ u8 ats_supported:1; ++ u8 ats_enabled:1; ++ u8 auxd_enabled:1; /* Multiple domains per device */ ++ u8 ats_qdep; ++ struct device *dev; /* it's NULL for PCIe-to-PCI bridge */ ++ struct intel_iommu *iommu; /* IOMMU used by this device */ ++ struct dmar_domain *domain; /* pointer to domain */ ++ struct pasid_table *pasid_table; /* pasid table */ ++}; ++ ++static inline void __iommu_flush_cache( ++ struct intel_iommu *iommu, void *addr, int size) ++{ ++ if (!ecap_coherent(iommu->ecap)) ++ clflush_cache_range(addr, size); ++} ++ ++/* Convert generic struct iommu_domain to private struct dmar_domain */ ++static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) ++{ ++ return container_of(dom, struct dmar_domain, domain); ++} ++ ++/* ++ * 0: readable ++ * 1: writable ++ * 2-6: reserved ++ * 7: super page ++ * 8-10: available ++ * 11: snoop behavior ++ * 12-63: Host physical address ++ */ ++struct dma_pte { ++ u64 val; ++}; ++ ++static inline void dma_clear_pte(struct dma_pte *pte) ++{ ++ pte->val = 0; ++} ++ ++static inline u64 dma_pte_addr(struct dma_pte *pte) ++{ ++#ifdef CONFIG_64BIT ++ return pte->val & VTD_PAGE_MASK & (~DMA_FL_PTE_XD); ++#else ++ /* Must 
have a full atomic 64-bit read */ ++ return __cmpxchg64(&pte->val, 0ULL, 0ULL) & ++ VTD_PAGE_MASK & (~DMA_FL_PTE_XD); ++#endif ++} ++ ++static inline bool dma_pte_present(struct dma_pte *pte) ++{ ++ return (pte->val & 3) != 0; ++} ++ ++static inline bool dma_pte_superpage(struct dma_pte *pte) ++{ ++ return (pte->val & DMA_PTE_LARGE_PAGE); ++} ++ ++static inline int first_pte_in_page(struct dma_pte *pte) ++{ ++ return !((unsigned long)pte & ~VTD_PAGE_MASK); ++} ++ ++static inline bool context_present(struct context_entry *context) ++{ ++ return (context->lo & 1); ++} ++ ++extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev); ++extern int dmar_find_matched_atsr_unit(struct pci_dev *dev); ++ ++extern int dmar_enable_qi(struct intel_iommu *iommu); ++extern void dmar_disable_qi(struct intel_iommu *iommu); ++extern int dmar_reenable_qi(struct intel_iommu *iommu); ++extern void qi_global_iec(struct intel_iommu *iommu); ++ ++extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, ++ u8 fm, u64 type); ++extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, ++ unsigned int size_order, u64 type); ++extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, ++ u16 qdep, u64 addr, unsigned mask); ++ ++void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, ++ unsigned long npages, bool ih); ++ ++void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, ++ u32 pasid, u16 qdep, u64 addr, ++ unsigned int size_order); ++void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, ++ u32 pasid); ++ ++int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, ++ unsigned int count, unsigned long options); ++/* ++ * Options used in qi_submit_sync: ++ * QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8. 
++ */ ++#define QI_OPT_WAIT_DRAIN BIT(0) ++ ++extern int dmar_ir_support(void); ++ ++void *alloc_pgtable_page(int node); ++void free_pgtable_page(void *vaddr); ++struct intel_iommu *domain_get_iommu(struct dmar_domain *domain); ++int for_each_device_domain(int (*fn)(struct device_domain_info *info, ++ void *data), void *data); ++void iommu_flush_write_buffer(struct intel_iommu *iommu); ++int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev); ++struct dmar_domain *find_domain(struct device *dev); ++struct device_domain_info *get_domain_info(struct device *dev); ++struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); ++ ++#ifdef CONFIG_INTEL_IOMMU_SVM ++extern void intel_svm_check(struct intel_iommu *iommu); ++extern int intel_svm_enable_prq(struct intel_iommu *iommu); ++extern int intel_svm_finish_prq(struct intel_iommu *iommu); ++int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, ++ struct iommu_gpasid_bind_data *data); ++int intel_svm_unbind_gpasid(struct device *dev, u32 pasid); ++struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, ++ void *drvdata); ++void intel_svm_unbind(struct iommu_sva *handle); ++u32 intel_svm_get_pasid(struct iommu_sva *handle); ++int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, ++ struct iommu_page_response *msg); ++ ++struct intel_svm_dev { ++ struct list_head list; ++ struct rcu_head rcu; ++ struct device *dev; ++ struct intel_iommu *iommu; ++ struct iommu_sva sva; ++ unsigned long prq_seq_number; ++ u32 pasid; ++ int users; ++ u16 did; ++ u16 dev_iotlb:1; ++ u16 sid, qdep; ++}; ++ ++struct intel_svm { ++ struct mmu_notifier notifier; ++ struct mm_struct *mm; ++ ++ unsigned int flags; ++ u32 pasid; ++ int gpasid; /* In case that guest PASID is different from host PASID */ ++ struct list_head devs; ++}; ++#else ++static inline void intel_svm_check(struct intel_iommu *iommu) {} ++#endif ++ ++#ifdef CONFIG_INTEL_IOMMU_DEBUGFS ++void intel_iommu_debugfs_init(void); ++#else ++static inline void intel_iommu_debugfs_init(void) {} ++#endif /* CONFIG_INTEL_IOMMU_DEBUGFS */ ++ ++extern const struct attribute_group *intel_iommu_groups[]; ++struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, ++ u8 devfn, int alloc); ++ ++#ifdef CONFIG_INTEL_IOMMU ++extern int iommu_calculate_agaw(struct intel_iommu *iommu); ++extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); ++extern int dmar_disabled; ++extern int intel_iommu_enabled; ++extern int intel_iommu_gfx_mapped; ++#else ++static inline int iommu_calculate_agaw(struct intel_iommu *iommu) ++{ ++ return 0; ++} ++static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu) ++{ ++ return 0; ++} ++#define dmar_disabled (1) ++#define intel_iommu_enabled (0) ++#endif ++ ++static inline const char *decode_prq_descriptor(char *str, size_t size, ++ u64 dw0, u64 dw1, u64 dw2, u64 dw3) ++{ ++ char *buf = str; ++ int bytes; ++ ++ bytes = snprintf(buf, size, ++ "rid=0x%llx addr=0x%llx %c%c%c%c%c pasid=0x%llx index=0x%llx", ++ FIELD_GET(GENMASK_ULL(31, 16), dw0), ++ FIELD_GET(GENMASK_ULL(63, 12), dw1), ++ dw1 & BIT_ULL(0) ? 'r' : '-', ++ dw1 & BIT_ULL(1) ? 'w' : '-', ++ dw0 & BIT_ULL(52) ? 'x' : '-', ++ dw0 & BIT_ULL(53) ? 'p' : '-', ++ dw1 & BIT_ULL(2) ? 
'l' : '-', ++ FIELD_GET(GENMASK_ULL(51, 32), dw0), ++ FIELD_GET(GENMASK_ULL(11, 3), dw1)); ++ ++ /* Private Data */ ++ if (dw0 & BIT_ULL(9)) { ++ size -= bytes; ++ buf += bytes; ++ snprintf(buf, size, " private=0x%llx/0x%llx\n", dw2, dw3); ++ } ++ ++ return str; ++} ++ ++#endif +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index 637a60607c7d..453d4ee759fd 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -379,6 +379,12 @@ struct kvm_vcpu { + */ + struct kvm_memory_slot *last_used_slot; + u64 last_used_slot_gen; ++ ++ /* ++ * Save the handle returned from the pkvm when init a shadow vcpu. This ++ * will be used when teardown this shadow vcpu. ++ */ ++ s64 pkvm_shadow_vcpu_handle; + }; + + /* +@@ -686,6 +692,18 @@ struct kvm_memslots { + int node_idx; + }; + ++struct kvm_pinned_page { ++ struct list_head list; ++ struct page *page; ++}; ++ ++struct kvm_protected_vm { ++ int shadow_vm_handle; ++ ++ struct list_head pinned_pages; ++ spinlock_t pinned_page_lock; ++}; ++ + struct kvm { + #ifdef KVM_HAVE_MMU_RWLOCK + rwlock_t mmu_lock; +@@ -786,6 +804,8 @@ struct kvm { + struct notifier_block pm_notifier; + #endif + char stats_id[KVM_STATS_NAME_SIZE]; ++ ++ struct kvm_protected_vm pkvm; + }; + + #define kvm_err(fmt, ...) \ +@@ -1358,6 +1378,8 @@ int kvm_vcpu_yield_to(struct kvm_vcpu *target); + void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool usermode_vcpu_not_eligible); + + void kvm_flush_remote_tlbs(struct kvm *kvm); ++int kvm_flush_remote_tlbs_with_range(struct kvm *kvm, ++ struct kvm_tlb_range *range); + + #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE + int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min); +@@ -1490,6 +1512,14 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) + } + #endif + ++#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB_WITH_RANGE ++static inline int kvm_arch_flush_remote_tlb_with_range(struct kvm *kvm, ++ struct kvm_tlb_range *range) ++{ ++ return -ENOTSUPP; ++} ++#endif ++ + #ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA + void kvm_arch_register_noncoherent_dma(struct kvm *kvm); + void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm); +@@ -1848,7 +1878,8 @@ struct _kvm_stats_desc { + + #define KVM_GENERIC_VM_STATS() \ + STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush), \ +- STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush_requests) ++ STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush_requests), \ ++ STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush_with_range) + + #define KVM_GENERIC_VCPU_STATS() \ + STATS_DESC_COUNTER(VCPU_GENERIC, halt_successful_poll), \ +@@ -2282,4 +2313,5 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) + /* Max number of entries allowed for each kvm dirty ring */ + #define KVM_DIRTY_RING_MAX_ENTRIES 65536 + ++int kvm_arch_add_device_to_pkvm(struct kvm *kvm, struct iommu_group *grp); + #endif +diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h +index 3ca3db020e0e..e556fcf5036a 100644 +--- a/include/linux/kvm_types.h ++++ b/include/linux/kvm_types.h +@@ -104,6 +104,7 @@ struct kvm_mmu_memory_cache { + struct kvm_vm_stat_generic { + u64 remote_tlb_flush; + u64 remote_tlb_flush_requests; ++ u64 remote_tlb_flush_with_range; + }; + + struct kvm_vcpu_stat_generic { +diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h +index 0d5d4419139a..22dcc759fd07 100644 +--- a/include/uapi/linux/kvm.h ++++ b/include/uapi/linux/kvm.h +@@ -1178,6 +1178,7 @@ struct kvm_ppc_resize_hpt { + #define KVM_CAP_S390_ZPCI_OP 221 + #define KVM_CAP_S390_CPU_TOPOLOGY 222 + 
#define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 ++#define KVM_CAP_VM_TYPES 224 + + #ifdef KVM_CAP_IRQ_ROUTING + +diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h +index 960c7e93d1a9..a06f4a9e5033 100644 +--- a/include/uapi/linux/kvm_para.h ++++ b/include/uapi/linux/kvm_para.h +@@ -31,6 +31,17 @@ + #define KVM_HC_SCHED_YIELD 11 + #define KVM_HC_MAP_GPA_RANGE 12 + ++#define KVM_HC_PKVM_OP 20 ++ ++/* PKVM provided hypercalls for guest use. */ ++#define PKVM_GHC_NUM(x) (x + KVM_HC_PKVM_OP) ++ ++#define PKVM_GHC_SHARE_MEM PKVM_GHC_NUM(1) ++#define PKVM_GHC_UNSHARE_MEM PKVM_GHC_NUM(2) ++#define PKVM_GHC_IOREAD PKVM_GHC_NUM(3) ++#define PKVM_GHC_IOWRITE PKVM_GHC_NUM(4) ++#define PKVM_GHC_GET_VE_INFO PKVM_GHC_NUM(5) ++ + /* + * hypercalls use architecture specific + */ +diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h +index 46de10a809ec..f513dc0ae610 100644 +--- a/tools/arch/x86/include/uapi/asm/kvm.h ++++ b/tools/arch/x86/include/uapi/asm/kvm.h +@@ -532,4 +532,7 @@ struct kvm_pmu_event_filter { + #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ + #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ + ++#define KVM_X86_DEFAULT_VM 0 ++#define KVM_X86_PROTECTED_VM 1 ++ + #endif /* _ASM_X86_KVM_H */ +diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h +index 0d5d4419139a..22dcc759fd07 100644 +--- a/tools/include/uapi/linux/kvm.h ++++ b/tools/include/uapi/linux/kvm.h +@@ -1178,6 +1178,7 @@ struct kvm_ppc_resize_hpt { + #define KVM_CAP_S390_ZPCI_OP 221 + #define KVM_CAP_S390_CPU_TOPOLOGY 222 + #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 ++#define KVM_CAP_VM_TYPES 224 + + #ifdef KVM_CAP_IRQ_ROUTING + +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index 8123f4d15930..1a1cc36d20a4 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -378,6 +378,33 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) + EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); + #endif + ++int kvm_flush_remote_tlbs_with_range(struct kvm *kvm, struct kvm_tlb_range *range) ++{ ++ int ret; ++ ++ ret = kvm_arch_flush_remote_tlb_with_range(kvm, range); ++ if (!ret) ++ ++kvm->stat.generic.remote_tlb_flush_with_range; ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs_with_range); ++ ++static bool kvm_try_flush_remote_tlbs_with_range(struct kvm *kvm, ++ struct kvm_gfn_range *gfn_range) ++{ ++#ifdef CONFIG_PKVM_INTEL ++ struct kvm_tlb_range tlb_range = { ++ .start_gfn = gfn_range->start, ++ .pages = gfn_range->end - gfn_range->start, ++ }; ++ ++ return !!kvm_flush_remote_tlbs_with_range(kvm, &tlb_range); ++#else ++ return true; ++#endif ++} ++ + static void kvm_flush_shadow_all(struct kvm *kvm) + { + kvm_arch_flush_shadow_all(kvm); +@@ -578,7 +605,7 @@ static void kvm_null_fn(void) + static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, + const struct kvm_hva_range *range) + { +- bool ret = false, locked = false; ++ bool ret = false, locked = false, need_global_flush = false; + struct kvm_gfn_range gfn_range; + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; +@@ -633,10 +660,14 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, + break; + } + ret |= range->handler(kvm, &gfn_range); ++ if (range->flush_on_ret && ret) ++ need_global_flush |= ++ kvm_try_flush_remote_tlbs_with_range(kvm, &gfn_range); ++ + } + } + +- if (range->flush_on_ret && ret) ++ if (range->flush_on_ret && ret && need_global_flush) + kvm_flush_remote_tlbs(kvm); + + if (locked) { 
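
/*
 * Illustrative userspace sketch, not part of the patch: with the
 * KVM_CAP_VM_TYPES / KVM_X86_PROTECTED_VM additions above, a VMM would
 * be expected to pass the VM type as the KVM_CREATE_VM argument. The
 * capability-check convention shown here (non-zero return means typed
 * VMs are supported) is an assumption made for the example.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int create_vm_fd(int kvm_fd)
{
	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VM_TYPES) > 0)
		return ioctl(kvm_fd, KVM_CREATE_VM, KVM_X86_PROTECTED_VM);

	/* fall back to a normal VM when protected VMs are unavailable */
	return ioctl(kvm_fd, KVM_CREATE_VM, KVM_X86_DEFAULT_VM);
}
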
+diff --git a/virt/kvm/pkvm/buddy_memory.h b/virt/kvm/pkvm/buddy_memory.h +new file mode 100644 +index 000000000000..56ae67a1c294 +--- /dev/null ++++ b/virt/kvm/pkvm/buddy_memory.h +@@ -0,0 +1,36 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#ifndef __PKVM_BUDDY_MEMORY_H ++#define __PKVM_BUDDY_MEMORY_H ++ ++#include ++#include ++ ++#include ++ ++struct pkvm_page { ++ unsigned short refcount; ++ unsigned short order; ++}; ++ ++extern u64 __pkvm_vmemmap; ++#define pkvm_vmemmap ((struct pkvm_page *)__pkvm_vmemmap) ++ ++#define pkvm_phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) ++#define pkvm_pfn_to_phys(pfn) ((phys_addr_t)((pfn) << PAGE_SHIFT)) ++#define pkvm_phys_to_page(phys) (&pkvm_vmemmap[pkvm_phys_to_pfn(phys)]) ++#define pkvm_virt_to_page(virt) pkvm_phys_to_page(__pkvm_pa(virt)) ++#define pkvm_virt_to_pfn(virt) pkvm_phys_to_pfn(__pkvm_pa(virt)) ++ ++#define pkvm_page_to_pfn(page) ((struct pkvm_page *)(page) - pkvm_vmemmap) ++#define pkvm_page_to_phys(page) pkvm_pfn_to_phys((pkvm_page_to_pfn(page))) ++#define pkvm_page_to_virt(page) __pkvm_va(pkvm_page_to_phys(page)) ++#define pkvm_page_to_pool(page) (((struct pkvm_page *)page)->pool) ++ ++static inline int pkvm_page_count(void *addr) ++{ ++ struct pkvm_page *p = pkvm_virt_to_page(addr); ++ ++ return p->refcount; ++} ++ ++#endif /* __PKVM_BUDDY_MEMORY_H */ +diff --git a/virt/kvm/pkvm/gfp.h b/virt/kvm/pkvm/gfp.h +new file mode 100644 +index 000000000000..47351de0522e +--- /dev/null ++++ b/virt/kvm/pkvm/gfp.h +@@ -0,0 +1,35 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#ifndef __PKVM_GFP_H ++#define __PKVM_GFP_H ++ ++#include ++#include ++ ++#include ++#include ++ ++#define PKVM_NO_ORDER USHRT_MAX ++ ++struct pkvm_pool { ++ /* ++ * Spinlock protecting concurrent changes to the memory pool as well as ++ * the struct pkvm_page of the pool's pages until we have a proper atomic ++ * API at hypervisor. ++ */ ++ pkvm_spinlock_t lock; ++ struct list_head free_area[MAX_ORDER]; ++ phys_addr_t range_start; ++ phys_addr_t range_end; ++ unsigned short max_order; ++}; ++ ++/* Allocation */ ++void *pkvm_alloc_pages(struct pkvm_pool *pool, unsigned short order); ++void pkvm_split_page(struct pkvm_page *page); ++void pkvm_get_page(struct pkvm_pool *pool, void *addr); ++void pkvm_put_page(struct pkvm_pool *pool, void *addr); ++ ++/* Used pages cannot be freed */ ++int pkvm_pool_init(struct pkvm_pool *pool, u64 pfn, unsigned int nr_pages, ++ unsigned int reserved_pages); ++#endif /* __PKVM_GFP_H */ +diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/virt/kvm/pkvm/page_alloc.c +similarity index 56% +rename from arch/arm64/kvm/hyp/nvhe/page_alloc.c +rename to virt/kvm/pkvm/page_alloc.c +index d40f0b30b534..83a61c55cf0f 100644 +--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c ++++ b/virt/kvm/pkvm/page_alloc.c +@@ -4,13 +4,12 @@ + * Author: Quentin Perret + */ + +-#include +-#include ++#include + +-u64 __hyp_vmemmap; ++u64 __pkvm_vmemmap; + + /* +- * Index the hyp_vmemmap to find a potential buddy page, but make no assumption ++ * Index the pkvm_vmemmap to find a potential buddy page, but make no assumption + * about its current state. 
+ * + * Example buddy-tree for a 4-pages physically contiguous pool: +@@ -30,30 +29,30 @@ u64 __hyp_vmemmap; + * __find_buddy_nocheck(pool, page 1, order 0) => page 0 + * __find_buddy_nocheck(pool, page 2, order 0) => page 3 + */ +-static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, +- struct hyp_page *p, ++static struct pkvm_page *__find_buddy_nocheck(struct pkvm_pool *pool, ++ struct pkvm_page *p, + unsigned short order) + { +- phys_addr_t addr = hyp_page_to_phys(p); ++ phys_addr_t addr = pkvm_page_to_phys(p); + + addr ^= (PAGE_SIZE << order); + + /* + * Don't return a page outside the pool range -- it belongs to +- * something else and may not be mapped in hyp_vmemmap. ++ * something else and may not be mapped in pkvm_vmemmap. + */ + if (addr < pool->range_start || addr >= pool->range_end) + return NULL; + +- return hyp_phys_to_page(addr); ++ return pkvm_phys_to_page(addr); + } + + /* Find a buddy page currently available for allocation */ +-static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool, +- struct hyp_page *p, ++static struct pkvm_page *__find_buddy_avail(struct pkvm_pool *pool, ++ struct pkvm_page *p, + unsigned short order) + { +- struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order); ++ struct pkvm_page *buddy = __find_buddy_nocheck(pool, p, order); + + if (!buddy || buddy->order != order || buddy->refcount) + return NULL; +@@ -65,46 +64,46 @@ static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool, + /* + * Pages that are available for allocation are tracked in free-lists, so we use + * the pages themselves to store the list nodes to avoid wasting space. As the +- * allocator always returns zeroed pages (which are zeroed on the hyp_put_page() ++ * allocator always returns zeroed pages (which are zeroed on the pkvm_put_page() + * path to optimize allocation speed), we also need to clean-up the list node in + * each page when we take it out of the list. + */ +-static inline void page_remove_from_list(struct hyp_page *p) ++static inline void page_remove_from_list(struct pkvm_page *p) + { +- struct list_head *node = hyp_page_to_virt(p); ++ struct list_head *node = pkvm_page_to_virt(p); + + __list_del_entry(node); + memset(node, 0, sizeof(*node)); + } + +-static inline void page_add_to_list(struct hyp_page *p, struct list_head *head) ++static inline void page_add_to_list(struct pkvm_page *p, struct list_head *head) + { +- struct list_head *node = hyp_page_to_virt(p); ++ struct list_head *node = pkvm_page_to_virt(p); + + INIT_LIST_HEAD(node); + list_add_tail(node, head); + } + +-static inline struct hyp_page *node_to_page(struct list_head *node) ++static inline struct pkvm_page *node_to_page(struct list_head *node) + { +- return hyp_virt_to_page(node); ++ return pkvm_virt_to_page(node); + } + +-static void __hyp_attach_page(struct hyp_pool *pool, +- struct hyp_page *p) ++static void __pkvm_attach_page(struct pkvm_pool *pool, ++ struct pkvm_page *p) + { + unsigned short order = p->order; +- struct hyp_page *buddy; ++ struct pkvm_page *buddy; + +- memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order); ++ memset(pkvm_page_to_virt(p), 0, PAGE_SIZE << p->order); + + /* +- * Only the first struct hyp_page of a high-order page (otherwise known ++ * Only the first struct pkvm_page of a high-order page (otherwise known + * as the 'head') should have p->order set. The non-head pages should +- * have p->order = HYP_NO_ORDER. Here @p may no longer be the head +- * after coalescing, so make sure to mark it HYP_NO_ORDER proactively. 
++ * have p->order = PKVM_NO_ORDER. Here @p may no longer be the head ++ * after coalescing, so make sure to mark it PKVM_NO_ORDER proactively. + */ +- p->order = HYP_NO_ORDER; ++ p->order = PKVM_NO_ORDER; + for (; (order + 1) < pool->max_order; order++) { + buddy = __find_buddy_avail(pool, p, order); + if (!buddy) +@@ -112,7 +111,7 @@ static void __hyp_attach_page(struct hyp_pool *pool, + + /* Take the buddy out of its list, and coalesce with @p */ + page_remove_from_list(buddy); +- buddy->order = HYP_NO_ORDER; ++ buddy->order = PKVM_NO_ORDER; + p = min(p, buddy); + } + +@@ -121,16 +120,16 @@ static void __hyp_attach_page(struct hyp_pool *pool, + page_add_to_list(p, &pool->free_area[order]); + } + +-static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, +- struct hyp_page *p, ++static struct pkvm_page *__pkvm_extract_page(struct pkvm_pool *pool, ++ struct pkvm_page *p, + unsigned short order) + { +- struct hyp_page *buddy; ++ struct pkvm_page *buddy; + + page_remove_from_list(p); + while (p->order > order) { + /* +- * The buddy of order n - 1 currently has HYP_NO_ORDER as it ++ * The buddy of order n - 1 currently has PKVM_NO_ORDER as it + * is covered by a higher-level page (whose head is @p). Use + * __find_buddy_nocheck() to find it and inject it in the + * free_list[n - 1], effectively splitting @p in half. +@@ -144,103 +143,103 @@ static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, + return p; + } + +-static inline void hyp_page_ref_inc(struct hyp_page *p) ++static inline void pkvm_page_ref_inc(struct pkvm_page *p) + { + BUG_ON(p->refcount == USHRT_MAX); + p->refcount++; + } + +-static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) ++static inline int pkvm_page_ref_dec_and_test(struct pkvm_page *p) + { + BUG_ON(!p->refcount); + p->refcount--; + return (p->refcount == 0); + } + +-static inline void hyp_set_page_refcounted(struct hyp_page *p) ++static inline void pkvm_set_page_refcounted(struct pkvm_page *p) + { + BUG_ON(p->refcount); + p->refcount = 1; + } + +-static void __hyp_put_page(struct hyp_pool *pool, struct hyp_page *p) ++static void __pkvm_put_page(struct pkvm_pool *pool, struct pkvm_page *p) + { +- if (hyp_page_ref_dec_and_test(p)) +- __hyp_attach_page(pool, p); ++ if (pkvm_page_ref_dec_and_test(p)) ++ __pkvm_attach_page(pool, p); + } + + /* +- * Changes to the buddy tree and page refcounts must be done with the hyp_pool ++ * Changes to the buddy tree and page refcounts must be done with the pkvm_pool + * lock held. If a refcount change requires an update to the buddy tree (e.g. +- * hyp_put_page()), both operations must be done within the same critical ++ * pkvm_put_page()), both operations must be done within the same critical + * section to guarantee transient states (e.g. a page with null refcount but + * not yet attached to a free list) can't be observed by well-behaved readers. 
+ */ +-void hyp_put_page(struct hyp_pool *pool, void *addr) ++void pkvm_put_page(struct pkvm_pool *pool, void *addr) + { +- struct hyp_page *p = hyp_virt_to_page(addr); ++ struct pkvm_page *p = pkvm_virt_to_page(addr); + +- hyp_spin_lock(&pool->lock); +- __hyp_put_page(pool, p); +- hyp_spin_unlock(&pool->lock); ++ pkvm_spin_lock(&pool->lock); ++ __pkvm_put_page(pool, p); ++ pkvm_spin_unlock(&pool->lock); + } + +-void hyp_get_page(struct hyp_pool *pool, void *addr) ++void pkvm_get_page(struct pkvm_pool *pool, void *addr) + { +- struct hyp_page *p = hyp_virt_to_page(addr); ++ struct pkvm_page *p = pkvm_virt_to_page(addr); + +- hyp_spin_lock(&pool->lock); +- hyp_page_ref_inc(p); +- hyp_spin_unlock(&pool->lock); ++ pkvm_spin_lock(&pool->lock); ++ pkvm_page_ref_inc(p); ++ pkvm_spin_unlock(&pool->lock); + } + +-void hyp_split_page(struct hyp_page *p) ++void pkvm_split_page(struct pkvm_page *p) + { + unsigned short order = p->order; + unsigned int i; + + p->order = 0; + for (i = 1; i < (1 << order); i++) { +- struct hyp_page *tail = p + i; ++ struct pkvm_page *tail = p + i; + + tail->order = 0; +- hyp_set_page_refcounted(tail); ++ pkvm_set_page_refcounted(tail); + } + } + +-void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order) ++void *pkvm_alloc_pages(struct pkvm_pool *pool, unsigned short order) + { + unsigned short i = order; +- struct hyp_page *p; ++ struct pkvm_page *p; + +- hyp_spin_lock(&pool->lock); ++ pkvm_spin_lock(&pool->lock); + + /* Look for a high-enough-order page */ + while (i < pool->max_order && list_empty(&pool->free_area[i])) + i++; + if (i >= pool->max_order) { +- hyp_spin_unlock(&pool->lock); ++ pkvm_spin_unlock(&pool->lock); + return NULL; + } + + /* Extract it from the tree at the right order */ + p = node_to_page(pool->free_area[i].next); +- p = __hyp_extract_page(pool, p, order); ++ p = __pkvm_extract_page(pool, p, order); + +- hyp_set_page_refcounted(p); +- hyp_spin_unlock(&pool->lock); ++ pkvm_set_page_refcounted(p); ++ pkvm_spin_unlock(&pool->lock); + +- return hyp_page_to_virt(p); ++ return pkvm_page_to_virt(p); + } + +-int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, ++int pkvm_pool_init(struct pkvm_pool *pool, u64 pfn, unsigned int nr_pages, + unsigned int reserved_pages) + { +- phys_addr_t phys = hyp_pfn_to_phys(pfn); +- struct hyp_page *p; ++ phys_addr_t phys = pkvm_pfn_to_phys(pfn); ++ struct pkvm_page *p; + int i; + +- hyp_spin_lock_init(&pool->lock); ++ pkvm_spinlock_init(&pool->lock); + pool->max_order = min(MAX_ORDER, get_order((nr_pages + 1) << PAGE_SHIFT)); + for (i = 0; i < pool->max_order; i++) + INIT_LIST_HEAD(&pool->free_area[i]); +@@ -248,15 +247,15 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, + pool->range_end = phys + (nr_pages << PAGE_SHIFT); + + /* Init the vmemmap portion */ +- p = hyp_phys_to_page(phys); ++ p = pkvm_phys_to_page(phys); + for (i = 0; i < nr_pages; i++) { + p[i].order = 0; +- hyp_set_page_refcounted(&p[i]); ++ pkvm_set_page_refcounted(&p[i]); + } + + /* Attach the unused pages to the buddy tree */ + for (i = reserved_pages; i < nr_pages; i++) +- __hyp_put_page(pool, &p[i]); ++ __pkvm_put_page(pool, &p[i]); + + return 0; + } +diff --git a/virt/kvm/pkvm/pkvm.c b/virt/kvm/pkvm/pkvm.c +new file mode 100644 +index 000000000000..03894f4ca24c +--- /dev/null ++++ b/virt/kvm/pkvm/pkvm.c +@@ -0,0 +1,85 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2020 - Google LLC ++ * Author: Quentin Perret ++ */ ++ ++#include ++#include ++#include ++ ++#include ++ 
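
/*
 * Illustrative sketch only, not part of the patch: typical use of the
 * buddy allocator moved above (virt/kvm/pkvm/page_alloc.c) through the
 * pkvm_pool API declared in gfp.h. The pool/function names below and
 * the zero reserved_pages are assumptions for the example; real
 * callers size the pool from the reserved pkvm memory region set up by
 * pkvm_reserve() further down.
 */
static struct pkvm_pool example_pool;

static int example_pool_smoke_test(u64 base_pfn, unsigned int nr_pages)
{
	void *page;

	/* every page becomes allocatable; none are kept reserved here */
	if (pkvm_pool_init(&example_pool, base_pfn, nr_pages, 0))
		return -1;

	/* order-0 allocation: returned zeroed, with refcount == 1 */
	page = pkvm_alloc_pages(&example_pool, 0);
	if (!page)
		return -1;

	pkvm_get_page(&example_pool, page);	/* refcount 1 -> 2 */
	pkvm_put_page(&example_pool, page);	/* refcount 2 -> 1 */
	pkvm_put_page(&example_pool, page);	/* back onto the free lists */

	return 0;
}
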
++static struct memblock_region *_pkvm_memory = pkvm_sym(pkvm_memory); ++static unsigned int *pkvm_memblock_nr_ptr = &pkvm_sym(pkvm_memblock_nr); ++ ++phys_addr_t pkvm_mem_base; ++phys_addr_t pkvm_mem_size; ++ ++static int cmp_pkvm_memblock(const void *p1, const void *p2) ++{ ++ const struct memblock_region *r1 = p1; ++ const struct memblock_region *r2 = p2; ++ ++ return r1->base < r2->base ? -1 : (r1->base > r2->base); ++} ++ ++static void __init sort_memblock_regions(void) ++{ ++ sort(_pkvm_memory, ++ *pkvm_memblock_nr_ptr, ++ sizeof(struct memblock_region), ++ cmp_pkvm_memblock, ++ NULL); ++} ++ ++static int __init register_memblock_regions(void) ++{ ++ struct memblock_region *reg; ++ ++ for_each_mem_region(reg) { ++ if (*pkvm_memblock_nr_ptr >= PKVM_MEMBLOCK_REGIONS) ++ return -ENOMEM; ++ ++ _pkvm_memory[*pkvm_memblock_nr_ptr] = *reg; ++ (*pkvm_memblock_nr_ptr)++; ++ } ++ sort_memblock_regions(); ++ ++ return 0; ++} ++ ++void __init pkvm_reserve(void) ++{ ++ int ret; ++ ++ if (pkvm_pre_reserve_check() < 0) ++ return; ++ ++ ret = register_memblock_regions(); ++ if (ret) { ++ *pkvm_memblock_nr_ptr = 0; ++ kvm_err("Failed to register pkvm memblocks: %d\n", ret); ++ return; ++ } ++ ++ /* ++ * Try to allocate a PMD-aligned region to reduce TLB pressure once ++ * this is unmapped from the host stage-2, and fallback to PAGE_SIZE. ++ */ ++ pkvm_mem_size = pkvm_total_reserve_pages() << PAGE_SHIFT; ++ pkvm_mem_base = memblock_phys_alloc(ALIGN(pkvm_mem_size, PMD_SIZE), ++ PMD_SIZE); ++ if (!pkvm_mem_base) ++ pkvm_mem_base = memblock_phys_alloc(pkvm_mem_size, PAGE_SIZE); ++ else ++ pkvm_mem_size = ALIGN(pkvm_mem_size, PMD_SIZE); ++ ++ if (!pkvm_mem_base) { ++ kvm_err("Failed to reserve pkvm memory\n"); ++ return; ++ } ++ ++ kvm_info("Reserved %lld MiB at 0x%llx\n", pkvm_mem_size >> 20, ++ pkvm_mem_base); ++} +diff --git a/virt/kvm/pkvm/pkvm_spinlock.h b/virt/kvm/pkvm/pkvm_spinlock.h +new file mode 100644 +index 000000000000..d234ed1188e2 +--- /dev/null ++++ b/virt/kvm/pkvm/pkvm_spinlock.h +@@ -0,0 +1,47 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ * ++ * pkvm runs in a self-contained environment ++ * and requires a self-contained spinlock implementation ++ * which doesn't rely on any other external symbols. ++ * ++ * This is a common interface with wrapping the arch ++ * specific implementation. 
++ * */ ++#ifndef __PKVM_SPINLOCK_H ++#define __PKVM_SPINLOCK_H ++ ++#include ++ ++typedef struct pkvm_spinlock { ++ arch_pkvm_spinlock_t pkvm_lock; ++} pkvm_spinlock_t; ++ ++#define __PKVM_SPINLOCK_INITIALIZER \ ++ { .pkvm_lock = __ARCH_PKVM_SPINLOCK_UNLOCKED } ++ ++#define __PKVM_SPINLOCK_UNLOCKED \ ++ ((pkvm_spinlock_t) __PKVM_SPINLOCK_INITIALIZER) ++ ++#define pkvm_spinlock_init(l) \ ++do { \ ++ *(l) = __PKVM_SPINLOCK_UNLOCKED; \ ++} while (0); ++ ++static __always_inline void pkvm_spin_lock(pkvm_spinlock_t *lock) ++{ ++ arch_pkvm_spin_lock(&lock->pkvm_lock); ++} ++ ++static __always_inline void pkvm_spin_unlock(pkvm_spinlock_t *lock) ++{ ++ arch_pkvm_spin_unlock(&lock->pkvm_lock); ++} ++ ++static __always_inline void pkvm_assert_lock_held(pkvm_spinlock_t *lock) ++{ ++ arch_pkvm_assert_lock_held(&lock->pkvm_lock); ++} ++ ++#endif +diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c +index 365d30779768..c43b975c4729 100644 +--- a/virt/kvm/vfio.c ++++ b/virt/kvm/vfio.c +@@ -144,10 +144,16 @@ static void kvm_vfio_update_coherency(struct kvm_device *dev) + mutex_unlock(&kv->lock); + } + ++int __weak kvm_arch_add_device_to_pkvm(struct kvm *kvm, struct iommu_group *grp) ++{ ++ return 0; ++} ++ + static int kvm_vfio_file_add(struct kvm_device *dev, unsigned int fd) + { + struct kvm_vfio *kv = dev->private; + struct kvm_vfio_file *kvf; ++ struct iommu_group *iommu_grp; + struct file *filp; + int ret; + +@@ -177,6 +183,11 @@ static int kvm_vfio_file_add(struct kvm_device *dev, unsigned int fd) + } + + kvf->file = filp; ++ ++ ret = kvm_arch_add_device_to_pkvm(dev->kvm, iommu_grp); ++ if (ret) ++ goto free_kvf; ++ + list_add_tail(&kvf->node, &kv->file_list); + + kvm_arch_start_assignment(dev->kvm); +@@ -187,6 +198,8 @@ static int kvm_vfio_file_add(struct kvm_device *dev, unsigned int fd) + kvm_vfio_update_coherency(dev); + + return 0; ++free_kvf: ++ kfree(kvf); + err_unlock: + mutex_unlock(&kv->lock); + err_fput: +-- +2.34.1 + diff --git a/targets/lenovo-x1-carbon.nix b/targets/lenovo-x1-carbon.nix index 7282e98c9..d88e4e5b0 100644 --- a/targets/lenovo-x1-carbon.nix +++ b/targets/lenovo-x1-carbon.nix @@ -197,6 +197,8 @@ ghaf = { host.kernel_hardening.enable = false; + host.hypervisor_hardening.enable = false; + hardware.x86_64.common.enable = true; virtualization.microvm-host.enable = true;